Skip to content

Commit

Permalink
Add support for de/serializing list-encoded JSON structs [#6558]
Browse files Browse the repository at this point in the history
Currently, a StructArray can only be deserialized from or serialized to
a JSON object (e.g. `{a: 1, b: "c"}`), but some services (e.g. Presto
and Trino) encode ROW types as JSON lists (e.g. `[1, "c"]`) because this
is more compact, and the schema is known.

This PR adds the ability to encode and decode JSON lists from and to
StructArrays, if StructMode is set to ListOnly.  In ListOnly mode,
object-encoded structs raise an error.  Setting to ObjectOnly (the
default) has the original parsing behavior.

Some notes/questions/points for discussion:
1. I've made a StructMode enum instead of a bool flag for two
   reasons.  One is that it's self-descriptive (what would `true` be?),
   and the other is that it allows a future Mixed mode that could
   deserialize either form.  The latter isn't currently requested by anyone.
2. I kept the error messages as similar to the old messages as possible.
   I considered having more specific error messages (like "Encountered a
   '[' when parsing a Struct, but the StructMode is ObjectOnly" or
   similar), but wanted to hear opinions before I went that route.
3. I'm not attached to any name/code-style/etc, so happy to modify to
   fit local conventions.

Fixes #6558
  • Loading branch information
jagill committed Dec 30, 2024
1 parent 191a9ec commit 5449cba
Show file tree
Hide file tree
Showing 7 changed files with 533 additions and 43 deletions.
85 changes: 85 additions & 0 deletions arrow-json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,23 @@ pub use self::writer::{ArrayWriter, LineDelimitedWriter, Writer, WriterBuilder};
use half::f16;
use serde_json::{Number, Value};

/// Selects which JSON representation is used for StructArrays.
///
/// Given a struct with fields `("a", Int32)` and `("b", Utf8)`, a value can be represented
/// either as a JSON object (`{"a": 1, "b": "c"}`) or as a JSON list (`[1, "c"]`). This enum
/// controls which of those forms is used when encoding or decoding.
///
/// In object form, the order of the keys does not matter.
/// In list form, the entries must match the struct fields in both number and order.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum StructMode {
    /// Encode/decode structs as objects (e.g., `{"a": 1, "b": "c"}`)
    #[default]
    ObjectOnly,
    /// Encode/decode structs as lists (e.g., `[1, "c"]`)
    ListOnly,
}

/// Trait declaring any type that is serializable to JSON. This includes all primitive types (bool, i32, etc.).
pub trait JsonSerializable: 'static {
/// Converts self into json value if its possible
Expand Down Expand Up @@ -156,4 +173,72 @@ mod tests {
);
assert_eq!(None, f32::NAN.into_json_value());
}

#[test]
fn test_json_roundtrip_structs() {
    use crate::writer::LineDelimited;
    use arrow_schema::{DataType, Field, Fields, Schema};
    use std::sync::Arc;

    // Schema under test:
    //   c1: struct { c11: nullable Int32, c12: struct { c121: Utf8 } }
    //   c2: Utf8
    let c12 = Field::new(
        "c12",
        DataType::Struct(vec![Field::new("c121", DataType::Utf8, false)].into()),
        false,
    );
    let c1 = Field::new(
        "c1",
        DataType::Struct(Fields::from(vec![
            Field::new("c11", DataType::Int32, true),
            c12,
        ])),
        false,
    );
    let c2 = Field::new("c2", DataType::Utf8, false);
    let schema = Arc::new(Schema::new(vec![c1, c2]));

    // The same three rows, once in object encoding and once in list encoding.
    // The second row leaves `c11` unset (absent key / explicit null).
    let object_rows = r#"{"c1":{"c11":1,"c12":{"c121":"e"}},"c2":"a"}
{"c1":{"c12":{"c121":"f"}},"c2":"b"}
{"c1":{"c11":5,"c12":{"c121":"g"}},"c2":"c"}
"#
    .as_bytes();
    let list_rows = r#"[[1,["e"]],"a"]
[[null,["f"]],"b"]
[[5,["g"]],"c"]
"#
    .as_bytes();

    // Each case is read and written back with the same StructMode; the bytes
    // produced by the writer must equal the bytes fed to the reader.
    let cases = [
        (StructMode::ObjectOnly, object_rows),
        (StructMode::ListOnly, list_rows),
    ];
    for (mode, expected) in cases {
        let reader = ReaderBuilder::new(schema.clone())
            .with_struct_mode(mode)
            .build(expected)
            .unwrap();

        let mut written: Vec<u8> = Vec::new();
        let mut writer = WriterBuilder::new()
            .with_struct_mode(mode)
            .build::<_, LineDelimited>(&mut written);
        for batch in reader {
            writer.write(&batch.unwrap()).unwrap();
        }
        assert_eq!(expected, &written);
    }
}
}
3 changes: 3 additions & 0 deletions arrow-json/src/reader/list_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{make_decoder, ArrayDecoder};
use crate::StructMode;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_array::OffsetSizeTrait;
use arrow_buffer::buffer::NullBuffer;
Expand All @@ -37,6 +38,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
coerce_primitive: bool,
strict_mode: bool,
is_nullable: bool,
struct_mode: StructMode,
) -> Result<Self, ArrowError> {
let field = match &data_type {
DataType::List(f) if !O::IS_LARGE => f,
Expand All @@ -48,6 +50,7 @@ impl<O: OffsetSizeTrait> ListArrayDecoder<O> {
coerce_primitive,
strict_mode,
field.is_nullable(),
struct_mode,
)?;

Ok(Self {
Expand Down
4 changes: 4 additions & 0 deletions arrow-json/src/reader/map_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@

use crate::reader::tape::{Tape, TapeElement};
use crate::reader::{make_decoder, ArrayDecoder};
use crate::StructMode;
use arrow_array::builder::{BooleanBufferBuilder, BufferBuilder};
use arrow_buffer::buffer::NullBuffer;
use arrow_buffer::ArrowNativeType;
Expand All @@ -36,6 +37,7 @@ impl MapArrayDecoder {
coerce_primitive: bool,
strict_mode: bool,
is_nullable: bool,
struct_mode: StructMode,
) -> Result<Self, ArrowError> {
let fields = match &data_type {
DataType::Map(_, true) => {
Expand All @@ -59,12 +61,14 @@ impl MapArrayDecoder {
coerce_primitive,
strict_mode,
fields[0].is_nullable(),
struct_mode,
)?;
let values = make_decoder(
fields[1].data_type().clone(),
coerce_primitive,
strict_mode,
fields[1].is_nullable(),
struct_mode,
)?;

Ok(Self {
Expand Down
Loading

0 comments on commit 5449cba

Please sign in to comment.