diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 0330ce913806..71de8f9f1861 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -8665,7 +8665,7 @@ mod tests { }; assert_eq!( t, - r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Utf8) }, false) to Map(Field { "entries": Struct(key Utf8, value Utf8) }, true) not supported"# + r#"Casting from Map(Field { "entries": Struct("key": Utf8, "value": nullable Utf8) }, false) to Map(Field { "entries": Struct("key": Utf8, "value": Utf8) }, true) not supported"# ); } @@ -8716,7 +8716,7 @@ mod tests { }; assert_eq!( t, - r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct(key Utf8, value Duration(s)) }, true) not supported"# + r#"Casting from Map(Field { "entries": Struct("key": Utf8, "value": nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct("key": Utf8, "value": Duration(s)) }, true) not supported"# ); } @@ -10805,7 +10805,7 @@ mod tests { let to_type = DataType::Utf8; let result = cast(&struct_array, &to_type); assert_eq!( - r#"Cast error: Casting from Struct(a Boolean) to Utf8 not supported"#, + r#"Cast error: Casting from Struct("a": Boolean) to Utf8 not supported"#, result.unwrap_err().to_string() ); } @@ -10816,7 +10816,7 @@ mod tests { let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean, false)].into()); let result = cast(&array, &to_type); assert_eq!( - r#"Cast error: Casting from Utf8 to Struct(a Boolean) not supported"#, + r#"Cast error: Casting from Utf8 to Struct("a": Boolean) not supported"#, result.unwrap_err().to_string() ); } diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index 5a5430fef973..12ad5efa37b0 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -87,7 +87,7 @@ use serde_json::{Number, Value}; /// /// This enum controls which form(s) the Reader will accept and which form the /// Writer will produce. For example, if the RecordBatch Schema is -/// `[("a", Int32), ("r", Struct(b Boolean, c Utf8))]` +/// `[("a", Int32), ("r", Struct("b": Boolean, "c" Utf8))]` /// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form /// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with ['StructMode::ListOnly'] /// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce diff --git a/arrow-row/src/lib.rs b/arrow-row/src/lib.rs index cdb52a8ee7fd..69b8a24cc6ed 100644 --- a/arrow-row/src/lib.rs +++ b/arrow-row/src/lib.rs @@ -2292,7 +2292,7 @@ mod tests { let [s2] = back.try_into().unwrap(); // RowConverter flattens Dictionary - // s.ty = Struct(foo Dictionary(Int32, Utf8)), s2.ty = Struct(foo Utf8) + // s.ty = Struct("foo": Dictionary(Int32, Utf8)), s2.ty = Struct("foo": Utf8) assert_ne!(&s.data_type(), &s2.data_type()); s2.to_data().validate_full().unwrap(); @@ -2340,7 +2340,7 @@ mod tests { let [s2] = back.try_into().unwrap(); // RowConverter flattens Dictionary - // s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo Int32) + // s.ty = Struct("foo": Dictionary(Int32, Int32)), s2.ty = Struct("foo": Int32) assert_ne!(&s.data_type(), &s2.data_type()); s2.to_data().validate_full().unwrap(); assert_eq!(s.len(), 0); diff --git a/arrow-schema/src/datatype_display.rs b/arrow-schema/src/datatype_display.rs index f23beb489deb..73ceb3f680f8 100644 --- a/arrow-schema/src/datatype_display.rs +++ b/arrow-schema/src/datatype_display.rs @@ -122,7 +122,7 @@ impl fmt::Display for DataType { let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; let data_type = field.data_type(); let metadata_str = format_metadata(field.metadata()); - format!("{name} {maybe_nullable}{data_type}{metadata_str}") + format!("{name:?}: {maybe_nullable}{data_type}{metadata_str}") }) .collect::>() .join(", "); diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index f465871ad05d..1042784c304a 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -81,9 +81,6 @@ impl<'a> Parser<'a> { Token::LargeList => self.parse_large_list(), Token::FixedSizeList => self.parse_fixed_size_list(), Token::Struct => self.parse_struct(), - Token::FieldName(word) => { - Err(make_error(self.val, &format!("unrecognized word: {word}"))) - } tok => Err(make_error( self.val, &format!("finding next type, got unexpected '{tok}'"), @@ -137,15 +134,14 @@ impl<'a> Parser<'a> { /// Parses the next double quoted string fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult { - match self.next_token()? { - Token::DoubleQuotedString(s) => Ok(s), - Token::FieldName(word) => { - Err(make_error(self.val, &format!("unrecognized word: {word}"))) - } - tok => Err(make_error( + let token = self.next_token()?; + if let Token::DoubleQuotedString(string) = token { + Ok(string) + } else { + Err(make_error( self.val, - &format!("finding double quoted string for {context}, got '{tok}'"), - )), + &format!("expected double quoted string for {context}, got '{token}'"), + )) } } @@ -321,27 +317,22 @@ impl<'a> Parser<'a> { self.expect_token(Token::LParen)?; let mut fields = Vec::new(); loop { + // expects: "field name": [nullable] #datatype + let field_name = match self.next_token()? { - // It's valid to have a name that is a type name - Token::SimpleType(data_type) => data_type.to_string(), - Token::FieldName(name) => name, Token::RParen => { - if fields.is_empty() { - break; - } else { - return Err(make_error( - self.val, - "Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma", - )); - } + break; } + Token::DoubleQuotedString(field_name) => field_name, tok => { return Err(make_error( self.val, - &format!("Expected a word for the name of Struct, but got {tok}"), + &format!("Expected a quoted string for a field name; got {tok:?}"), )) } }; + self.expect_token(Token::Colon)?; + let nullable = self .tokenizer .next_if(|next| matches!(next, Ok(Token::Nullable))) @@ -383,7 +374,7 @@ impl<'a> Parser<'a> { /// returns true if this character is a separator fn is_separator(c: char) -> bool { - c == '(' || c == ')' || c == ',' || c == ' ' + c == '(' || c == ')' || c == ',' || c == ':' || c == ' ' } #[derive(Debug)] @@ -445,50 +436,6 @@ impl<'a> Tokenizer<'a> { })?; return Ok(Token::Integer(val)); } - // if it started with a double quote `"`, try parsing it as a double quoted string - else if c == '"' { - let len = self.word.chars().count(); - - // to verify it's double quoted - if let Some(last_c) = self.word.chars().last() { - if last_c != '"' || len < 2 { - return Err(make_error( - self.val, - &format!( - "parsing {} as double quoted string: last char must be \"", - self.word - ), - )); - } - } - - if len == 2 { - return Err(make_error( - self.val, - &format!( - "parsing {} as double quoted string: empty string isn't supported", - self.word - ), - )); - } - - let val: String = self.word.parse().map_err(|e| { - make_error( - self.val, - &format!("parsing {} as double quoted string: {e}", self.word), - ) - })?; - - let s = val[1..len - 1].to_string(); - if s.contains('"') { - return Err(make_error( - self.val, - &format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word), - )); - } - - return Ok(Token::DoubleQuotedString(s)); - } } // figure out what the word was @@ -554,11 +501,63 @@ impl<'a> Tokenizer<'a> { "Struct" => Token::Struct, - // If we don't recognize the word, treat it as a field name - word => Token::FieldName(word.to_string()), + token => { + return Err(make_error(self.val, &format!("unknown token: {token}"))); + } }; Ok(token) } + + /// Parses e.g. `"foo bar"` + fn parse_quoted_string(&mut self) -> ArrowResult { + if self.next_char() != Some('\"') { + return Err(make_error(self.val, "Expected \"")); + } + + // reset temp space + self.word.clear(); + + let mut is_escaped = false; + + loop { + match self.next_char() { + None => { + return Err(ArrowError::ParseError(format!( + "Unterminated string at: \"{}", + self.word + ))); + } + Some(c) => match c { + '\\' => { + is_escaped = true; + self.word.push(c); + } + '"' => { + if is_escaped { + self.word.push(c); + is_escaped = false; + } else { + break; + } + } + c => { + self.word.push(c); + } + }, + } + } + + let val: String = self.word.parse().map_err(|err| { + ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word)) + })?; + + if val.is_empty() { + // Using empty strings as field names is just asking for trouble + return Err(make_error(self.val, "empty strings aren't allowed")); + } + + Ok(Token::DoubleQuotedString(val)) + } } impl Iterator for Tokenizer<'_> { @@ -572,6 +571,9 @@ impl Iterator for Tokenizer<'_> { self.next_char(); continue; } + '"' => { + return Some(self.parse_quoted_string()); + } '(' => { self.next_char(); return Some(Ok(Token::LParen)); @@ -584,6 +586,10 @@ impl Iterator for Tokenizer<'_> { self.next_char(); return Some(Ok(Token::Comma)); } + ':' => { + self.next_char(); + return Some(Ok(Token::Colon)); + } _ => return Some(self.parse_word()), } } @@ -612,6 +618,7 @@ enum Token { LParen, RParen, Comma, + Colon, Some, None, Integer(i64), @@ -621,7 +628,6 @@ enum Token { FixedSizeList, Struct, Nullable, - FieldName(String), } impl Display for Token { @@ -641,6 +647,7 @@ impl Display for Token { Token::LParen => write!(f, "("), Token::RParen => write!(f, ")"), Token::Comma => write!(f, ","), + Token::Colon => write!(f, ":"), Token::Some => write!(f, "Some"), Token::None => write!(f, "None"), Token::FixedSizeBinary => write!(f, "FixedSizeBinary"), @@ -653,7 +660,6 @@ impl Display for Token { Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"), Token::Struct => write!(f, "Struct"), Token::Nullable => write!(f, "nullable"), - Token::FieldName(s) => write!(f, "FieldName({s})"), } } } @@ -837,19 +843,19 @@ mod test { ("Nu", "Unsupported type 'Nu'"), ( r#"Timestamp(ns, +00:00)"#, - "Error unrecognized word: +00:00", + "Error unknown token: +00", ), ( r#"Timestamp(ns, "+00:00)"#, - r#"parsing "+00:00 as double quoted string: last char must be ""#, + r#"Unterminated string at: "+00:00)"#, ), ( r#"Timestamp(ns, "")"#, - r#"parsing "" as double quoted string: empty string isn't supported"#, + r#"empty strings aren't allowed"#, ), ( r#"Timestamp(ns, "+00:00"")"#, - r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#, + r#"Parser error: Unterminated string at: ")"#, ), ("Timestamp(ns, ", "Error finding next token"), ( @@ -871,9 +877,9 @@ mod test { ("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted"), ("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"), ("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"), - ("Struct(f1, Int64)", "Error finding next type, got unexpected ','"), - ("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"), - ("Struct(f1)", "Error finding next type, got unexpected ')'"), + ("Struct(f1 Int64)", "Error unknown token: f1"), + ("Struct(\"f1\" Int64)", "Expected ':'"), + ("Struct(\"f1\": )", "Error finding next type, got unexpected ')'"), ]; for (data_type_string, expected_message) in cases { @@ -884,12 +890,13 @@ mod test { let message = e.to_string(); assert!( message.contains(expected_message), - "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n" + "\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual: {message}\n" ); - // errors should also contain a help message - assert!(message.contains( - "Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'" - )); + + if !message.contains("Unterminated string") { + // errors should also contain a help message + assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'"), "message: {message}"); + } } } } @@ -899,6 +906,6 @@ mod test { fn parse_error_type() { let err = parse_data_type("foobar").unwrap_err(); assert!(matches!(err, ArrowError::ParseError(_))); - assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unrecognized word: foobar"); + assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unknown token: foobar"); } } diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index dcb1b6183bf1..37545a8eed21 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -701,7 +701,7 @@ mod tests { schema.to_string(), "Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \ Field { \"last_name\": Utf8 }, \ - Field { \"address\": Struct(street Utf8, zip UInt16) }, \ + Field { \"address\": Struct(\"street\": Utf8, \"zip\": UInt16) }, \ Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 123, dict_is_ordered }" ) } diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 0a5a7d096979..8a7e2ef7094f 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -3677,8 +3677,8 @@ mod tests { ), ])), "Arrow: Incompatible supplied Arrow schema: data type mismatch for field nested: \ - requested Struct(nested1_valid Utf8, nested1_invalid Int32) \ - but found Struct(nested1_valid Utf8, nested1_invalid Int64)", + requested Struct(\"nested1_valid\": Utf8, \"nested1_invalid\": Int32) \ + but found Struct(\"nested1_valid\": Utf8, \"nested1_invalid\": Int64)", ); }