From 39646c4743029d51d5636d9eb14ce30683839a2e Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 20:37:57 +0200 Subject: [PATCH 1/2] Quote struct field names --- arrow-cast/src/cast/mod.rs | 8 +- arrow-json/src/lib.rs | 2 +- arrow-row/src/lib.rs | 4 +- arrow-schema/src/datatype_display.rs | 2 +- arrow-schema/src/datatype_parse.rs | 175 ++++++++++++++------------ arrow-schema/src/schema.rs | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 4 +- 7 files changed, 103 insertions(+), 94 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index fd43fefe62e8..d29f038448cf 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -8653,7 +8653,7 @@ mod tests { }; assert_eq!( t, - r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Utf8) }, false) to Map(Field { "entries": Struct(key Utf8, value Utf8) }, true) not supported"# + r#"Casting from Map(Field { "entries": Struct("key": Utf8, "value": nullable Utf8) }, false) to Map(Field { "entries": Struct("key": Utf8, "value": Utf8) }, true) not supported"# ); } @@ -8704,7 +8704,7 @@ mod tests { }; assert_eq!( t, - r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct(key Utf8, value Duration(Second)) }, true) not supported"# + r#"Casting from Map(Field { "entries": Struct("key": Utf8, "value": nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct("key": Utf8, "value": Duration(Second)) }, true) not supported"# ); } @@ -10796,7 +10796,7 @@ mod tests { let to_type = DataType::Utf8; let result = cast(&struct_array, &to_type); assert_eq!( - r#"Cast error: Casting from Struct(a Boolean) to Utf8 not supported"#, + r#"Cast error: Casting from Struct("a": Boolean) to Utf8 not supported"#, result.unwrap_err().to_string() ); } @@ -10807,7 +10807,7 @@ mod tests { let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean, false)].into()); let result = cast(&array, &to_type); assert_eq!( - r#"Cast error: Casting from Utf8 to Struct(a Boolean) not supported"#, + r#"Cast error: Casting from Utf8 to Struct("a": Boolean) not supported"#, result.unwrap_err().to_string() ); } diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index 5a5430fef973..12ad5efa37b0 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -87,7 +87,7 @@ use serde_json::{Number, Value}; /// /// This enum controls which form(s) the Reader will accept and which form the /// Writer will produce. For example, if the RecordBatch Schema is -/// `[("a", Int32), ("r", Struct(b Boolean, c Utf8))]` +/// `[("a", Int32), ("r", Struct("b": Boolean, "c" Utf8))]` /// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form /// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with ['StructMode::ListOnly'] /// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index cdb52a8ee7fd..69b8a24cc6ed 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -2292,7 +2292,7 @@ mod tests { let [s2] = back.try_into().unwrap(); // RowConverter flattens Dictionary - // s.ty = Struct(foo Dictionary(Int32, Utf8)), s2.ty = Struct(foo Utf8) + // s.ty = Struct("foo": Dictionary(Int32, Utf8)), s2.ty = Struct("foo": Utf8) assert_ne!(&s.data_type(), &s2.data_type()); s2.to_data().validate_full().unwrap(); @@ -2340,7 +2340,7 @@ mod tests { let [s2] = back.try_into().unwrap(); // RowConverter flattens Dictionary - // s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo Int32) + // s.ty = Struct("foo": Dictionary(Int32, Int32)), s2.ty = Struct("foo": Int32) assert_ne!(&s.data_type(), &s2.data_type()); s2.to_data().validate_full().unwrap(); assert_eq!(s.len(), 0); diff --git a/arrow-schema/src/datatype_display.rs b/arrow-schema/src/datatype_display.rs index e1bd86cba08e..647746f0cd25 100644 --- a/arrow-schema/src/datatype_display.rs +++ b/arrow-schema/src/datatype_display.rs @@ -118,7 +118,7 @@ impl fmt::Display for DataType { let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; let data_type = field.data_type(); let metadata_str = format_metadata(field.metadata()); - format!("{name} {maybe_nullable}{data_type}{metadata_str}") + format!("{name:?}: {maybe_nullable}{data_type}{metadata_str}") }) .collect::>() .join(", "); diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 8b48ecd17f63..9dc55bbe4a61 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -81,9 +81,6 @@ impl<'a> Parser<'a> { Token::LargeList => self.parse_large_list(), Token::FixedSizeList => self.parse_fixed_size_list(), Token::Struct => self.parse_struct(), - Token::FieldName(word) => { - Err(make_error(self.val, &format!("unrecognized word: {word}"))) - } tok => Err(make_error( self.val, &format!("finding next type, got unexpected '{tok}'"), @@ -154,15 +151,14 @@ impl<'a> Parser<'a> { /// Parses the next double quoted string fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult { - match self.next_token()? { - Token::DoubleQuotedString(s) => Ok(s), - Token::FieldName(word) => { - Err(make_error(self.val, &format!("unrecognized word: {word}"))) - } - tok => Err(make_error( + let token = self.next_token()?; + if let Token::DoubleQuotedString(string) = token { + Ok(string) + } else { + Err(make_error( self.val, - &format!("finding double quoted string for {context}, got '{tok}'"), - )), + &format!("expected double quoted string for {context}, got '{token}'"), + )) } } @@ -324,27 +320,22 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let mut fields = Vec::new(); loop { + // expects: "field name": [nullable] #datatype + let field_name = match self.next_token()? { - // It's valid to have a name that is a type name - Token::SimpleType(data_type) => data_type.to_string(), - Token::FieldName(name) => name, Token::RParen => { - if fields.is_empty() { - break; - } else { - return Err(make_error( - self.val, - "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma", - )); - } + break; } + Token::DoubleQuotedString(field_name) => field_name, tok => { return Err(make_error( self.val, - &format!("Expected a word for the name of Struct, but got {tok}"), + &format!("Expected a quoted string for a field name; got {tok:?}"), )) } }; + self.expect_token(Token::Colon)?; + let nullable = self .tokenizer .next_if(|next| matches!(next, Ok(Token::Nullable))) @@ -386,7 +377,7 @@ impl<'a> Parser<'a> { /// returns true if this character is a separator fn is_separator(c: char) -> bool { - c == '(' || c == ')' || c == ',' || c == ' ' + c == '(' || c == ')' || c == ',' || c == ':' || c == ' ' } #[derive(Debug)] @@ -450,50 +441,6 @@ impl<'a> Tokenizer<'a> { })?; return Ok(Token::Integer(val)); } - // if it started with a double quote `"`, try parsing it as a double quoted string - else if c == '"' { - let len = self.word.chars().count(); - - // to verify it's double quoted - if let Some(last_c) = self.word.chars().last() { - if last_c != '"' || len < 2 { - return Err(make_error( - self.val, - &format!( - "parsing {} as double quoted string: last char must be \"", - self.word - ), - )); - } - } - - if len == 2 { - return Err(make_error( - self.val, - &format!( - "parsing {} as double quoted string: empty string isn't supported", - self.word - ), - )); - } - - let val: String = self.word.parse().map_err(|e| { - make_error( - self.val, - &format!("parsing {} as double quoted string: {e}", self.word), - ) - })?; - - let s = val[1..len - 1].to_string(); - if s.contains('"') { - return Err(make_error( - self.val, - &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word), - )); - } - - return Ok(Token::DoubleQuotedString(s)); - } } // figure out what the word was @@ -559,11 +506,63 @@ impl<'a> Tokenizer<'a> { "Struct" => Token::Struct, - // If we don't recognize the word, treat it as a field name - word => Token::FieldName(word.to_string()), + token => { + return Err(make_error(self.val, &format!("unknown token: {token}"))); + } }; Ok(token) } + + /// Parses e.g. `"foo bar"` + fn parse_quoted_string(&mut self) -> ArrowResult { + if self.next_char() != Some('\"') { + return Err(make_error(self.val, "Expected \"")); + } + + // reset temp space + self.word.clear(); + + let mut is_escaped = false; + + loop { + match self.next_char() { + None => { + return Err(ArrowError::ParseError(format!( + "Unterminated string at: \"{}", + self.word + ))); + } + Some(c) => match c { + '\\' => { + is_escaped = true; + self.word.push(c); + } + '"' => { + if is_escaped { + self.word.push(c); + is_escaped = false; + } else { + break; + } + } + c => { + self.word.push(c); + } + }, + } + } + + let val: String = self.word.parse().map_err(|err| { + ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word)) + })?; + + if val.is_empty() { + // Using empty strings as field names is just asking for trouble + return Err(make_error(self.val, "empty strings aren't allowed")); + } + + Ok(Token::DoubleQuotedString(val)) + } } impl Iterator for Tokenizer<'_> { @@ -577,6 +576,9 @@ impl Iterator for Tokenizer<'_> { self.next_char(); continue; } + '"' => { + return Some(self.parse_quoted_string()); + } '(' => { self.next_char(); return Some(Ok(Token::LParen)); @@ -589,6 +591,10 @@ impl Iterator for Tokenizer<'_> { self.next_char(); return Some(Ok(Token::Comma)); } + ':' => { + self.next_char(); + return Some(Ok(Token::Colon)); + } _ => return Some(self.parse_word()), } } @@ -617,6 +623,7 @@ enum Token { LParen, RParen, Comma, + Colon, Some, None, Integer(i64), @@ -626,7 +633,6 @@ enum Token { FixedSizeList, Struct, Nullable, - FieldName(String), } impl Display for Token { @@ -646,6 +652,7 @@ impl Display for Token { Token::LParen => write!(f, "("), Token::RParen => write!(f, ")"), Token::Comma => write!(f, ","), + Token::Colon => write!(f, ":"), Token::Some => write!(f, "Some"), Token::None => write!(f, "None"), Token::FixedSizeBinary => write!(f, "FixedSizeBinary"), @@ -658,7 +665,6 @@ impl Display for Token { Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"), Token::Struct => write!(f, "Struct"), Token::Nullable => write!(f, "nullable"), - Token::FieldName(s) => write!(f, "FieldName({s})"), } } } @@ -842,19 +848,19 @@ mod test { ("Nu", "Unsupported type 'Nu'"), ( r#"Timestamp(Nanosecond, Some(+00:00))"#, - "Error unrecognized word: +00:00", + "Error unknown token: +00", ), ( r#"Timestamp(Nanosecond, Some("+00:00))"#, - r#"parsing "+00:00 as double quoted string: last char must be ""#, + r#"Unterminated string at: "+00:00))"#, ), ( r#"Timestamp(Nanosecond, Some(""))"#, - r#"parsing "" as double quoted string: empty string isn't supported"#, + r#"empty strings aren't allowed"#, ), ( r#"Timestamp(Nanosecond, Some("+00:00""))"#, - r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#, + r#"Parser error: Unterminated string at: "))"#, ), ("Timestamp(Nanosecond, ", "Error finding next token"), ( @@ -876,9 +882,9 @@ mod test { ("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted"), ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"), ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"), - ("Struct(f1, Int64)", "Error finding next type, got unexpected ','"), - ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"), - ("Struct(f1)", "Error finding next type, got unexpected ')'"), + ("Struct(f1 Int64)", "Error unknown token: f1"), + ("Struct(\"f1\" Int64)", "Expected ':'"), + ("Struct(\"f1\": )", "Error finding next type, got unexpected ')'"), ]; for (data_type_string, expected_message) in cases { @@ -889,10 +895,13 @@ mod test { let message = e.to_string(); assert!( message.contains(expected_message), - "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n" + "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual: {message}\n" ); - // errors should also contain a help message - assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'")); + + if !message.contains("Unterminated string") { + // errors should also contain a help message + assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'"), "message: {message}"); + } } } } @@ -902,6 +911,6 @@ mod test { fn parse_error_type() { let err = parse_data_type("foobar").unwrap_err(); assert!(matches!(err, ArrowError::ParseError(_))); - assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unrecognized word: foobar"); + assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(Nanosecond, None)'. Error unknown token: foobar"); } } diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index dcb1b6183bf1..37545a8eed21 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -701,7 +701,7 @@ mod tests { schema.to_string(), "Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \ Field { \"last_name\": Utf8 }, \ - Field { \"address\": Struct(street Utf8, zip UInt16) }, \ + Field { \"address\": Struct(\"street\": Utf8, \"zip\": UInt16) }, \ Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 123, dict_is_ordered }" ) } diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 37ab5c1df922..19fa95926ebf 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -3677,8 +3677,8 @@ mod tests { ), ])), "Arrow: Incompatible supplied Arrow schema: data type mismatch for field nested: \ - requested Struct(nested1_valid Utf8, nested1_invalid Int32) \ - but found Struct(nested1_valid Utf8, nested1_invalid Int64)", + requested Struct(\"nested1_valid\": Utf8, \"nested1_invalid\": Int32) \ + but found Struct(\"nested1_valid\": Utf8, \"nested1_invalid\": Int64)", ); } From 6de27e64132636dc9717b9c1f73d1dd43389feed Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Tue, 23 Sep 2025 16:49:47 +0200 Subject: [PATCH 2/2] Update tests --- parquet/tests/variant_integration.rs | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index 5e5c3d944c34..dcab658bcdd1 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -142,18 +142,18 @@ variant_test_case!(38, "Unsupported typed_value type: Struct("); variant_test_case!(39); // Is an error case (should be failing as the expected error message indicates) variant_test_case!(40, "Unsupported typed_value type: List("); -variant_test_case!(41, "Unsupported typed_value type: List(Field"); +variant_test_case!(41, "Unsupported typed_value type: List("); // Is an error case (should be failing as the expected error message indicates) variant_test_case!( 42, "Expected an error 'Invalid variant, conflicting value and typed_value`, but got no error" ); // https://github.com/apache/arrow-rs/issues/8336 -variant_test_case!(43, "Unsupported typed_value type: Struct([Field"); -variant_test_case!(44, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(43, "Unsupported typed_value type: Struct("); +variant_test_case!(44, "Unsupported typed_value type: Struct("); // https://github.com/apache/arrow-rs/issues/8337 -variant_test_case!(45, "Unsupported typed_value type: List(Field"); -variant_test_case!(46, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(45, "Unsupported typed_value type: List("); +variant_test_case!(46, "Unsupported typed_value type: Struct("); variant_test_case!(47); variant_test_case!(48); variant_test_case!(49); @@ -191,14 +191,14 @@ variant_test_case!(80); variant_test_case!(81); variant_test_case!(82); // https://github.com/apache/arrow-rs/issues/8336 -variant_test_case!(83, "Unsupported typed_value type: Struct([Field"); -variant_test_case!(84, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(83, "Unsupported typed_value type: Struct("); +variant_test_case!(84, "Unsupported typed_value type: Struct("); // https://github.com/apache/arrow-rs/issues/8337 -variant_test_case!(85, "Unsupported typed_value type: List(Field"); -variant_test_case!(86, "Unsupported typed_value type: List(Field"); +variant_test_case!(85, "Unsupported typed_value type: List("); +variant_test_case!(86, "Unsupported typed_value type: List("); // Is an error case (should be failing as the expected error message indicates) -variant_test_case!(87, "Unsupported typed_value type: Struct([Field"); -variant_test_case!(88, "Unsupported typed_value type: List(Field"); +variant_test_case!(87, "Unsupported typed_value type: Struct("); +variant_test_case!(88, "Unsupported typed_value type: List("); variant_test_case!(89); variant_test_case!(90); variant_test_case!(91); @@ -243,17 +243,17 @@ variant_test_case!( "Invalid variant data: InvalidArgumentError(\"Received empty bytes\")" ); // Is an error case (should be failing as the expected error message indicates) -variant_test_case!(128, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(128, "Unsupported typed_value type: Struct("); variant_test_case!(129, "Invalid variant data: InvalidArgumentError("); -variant_test_case!(130, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(130, "Unsupported typed_value type: Struct("); variant_test_case!(131); -variant_test_case!(132, "Unsupported typed_value type: Struct([Field"); -variant_test_case!(133, "Unsupported typed_value type: Struct([Field"); -variant_test_case!(134, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(132, "Unsupported typed_value type: Struct("); +variant_test_case!(133, "Unsupported typed_value type: Struct("); +variant_test_case!(134, "Unsupported typed_value type: Struct("); variant_test_case!(135); -variant_test_case!(136, "Unsupported typed_value type: List(Field "); +variant_test_case!(136, "Unsupported typed_value type: List("); variant_test_case!(137, "Invalid variant data: InvalidArgumentError("); -variant_test_case!(138, "Unsupported typed_value type: Struct([Field"); +variant_test_case!(138, "Unsupported typed_value type: Struct("); /// Test case definition structure matching the format from /// `parquet-testing/parquet_shredded/cases.json`