Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions arrow-cast/src/cast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8665,7 +8665,7 @@ mod tests {
};
assert_eq!(
t,
r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Utf8) }, false) to Map(Field { "entries": Struct(key Utf8, value Utf8) }, true) not supported"#
r#"Casting from Map(Field { "entries": Struct("key": Utf8, "value": nullable Utf8) }, false) to Map(Field { "entries": Struct("key": Utf8, "value": Utf8) }, true) not supported"#
);
}

Expand Down Expand Up @@ -8716,7 +8716,7 @@ mod tests {
};
assert_eq!(
t,
r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct(key Utf8, value Duration(s)) }, true) not supported"#
r#"Casting from Map(Field { "entries": Struct("key": Utf8, "value": nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct("key": Utf8, "value": Duration(s)) }, true) not supported"#
);
}

Expand Down Expand Up @@ -10805,7 +10805,7 @@ mod tests {
let to_type = DataType::Utf8;
let result = cast(&struct_array, &to_type);
assert_eq!(
r#"Cast error: Casting from Struct(a Boolean) to Utf8 not supported"#,
r#"Cast error: Casting from Struct("a": Boolean) to Utf8 not supported"#,
result.unwrap_err().to_string()
);
}
Expand All @@ -10816,7 +10816,7 @@ mod tests {
let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean, false)].into());
let result = cast(&array, &to_type);
assert_eq!(
r#"Cast error: Casting from Utf8 to Struct(a Boolean) not supported"#,
r#"Cast error: Casting from Utf8 to Struct("a": Boolean) not supported"#,
result.unwrap_err().to_string()
);
}
Expand Down
2 changes: 1 addition & 1 deletion arrow-json/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ use serde_json::{Number, Value};
///
/// This enum controls which form(s) the Reader will accept and which form the
/// Writer will produce. For example, if the RecordBatch Schema is
/// `[("a", Int32), ("r", Struct(b Boolean, c Utf8))]`
/// `[("a", Int32), ("r", Struct("b": Boolean, "c" Utf8))]`
/// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form
/// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with ['StructMode::ListOnly']
/// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce
Expand Down
4 changes: 2 additions & 2 deletions arrow-row/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2292,7 +2292,7 @@ mod tests {
let [s2] = back.try_into().unwrap();

// RowConverter flattens Dictionary
// s.ty = Struct(foo Dictionary(Int32, Utf8)), s2.ty = Struct(foo Utf8)
// s.ty = Struct("foo": Dictionary(Int32, Utf8)), s2.ty = Struct("foo": Utf8)
assert_ne!(&s.data_type(), &s2.data_type());
s2.to_data().validate_full().unwrap();

Expand Down Expand Up @@ -2340,7 +2340,7 @@ mod tests {
let [s2] = back.try_into().unwrap();

// RowConverter flattens Dictionary
// s.ty = Struct(foo Dictionary(Int32, Int32)), s2.ty = Struct(foo Int32)
// s.ty = Struct("foo": Dictionary(Int32, Int32)), s2.ty = Struct("foo": Int32)
assert_ne!(&s.data_type(), &s2.data_type());
s2.to_data().validate_full().unwrap();
assert_eq!(s.len(), 0);
Expand Down
2 changes: 1 addition & 1 deletion arrow-schema/src/datatype_display.rs
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ impl fmt::Display for DataType {
let maybe_nullable = if field.is_nullable() { "nullable " } else { "" };
let data_type = field.data_type();
let metadata_str = format_metadata(field.metadata());
format!("{name} {maybe_nullable}{data_type}{metadata_str}")
format!("{name:?}: {maybe_nullable}{data_type}{metadata_str}")
})
.collect::<Vec<_>>()
.join(", ");
Expand Down
177 changes: 92 additions & 85 deletions arrow-schema/src/datatype_parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,6 @@ impl<'a> Parser<'a> {
Token::LargeList => self.parse_large_list(),
Token::FixedSizeList => self.parse_fixed_size_list(),
Token::Struct => self.parse_struct(),
Token::FieldName(word) => {
Err(make_error(self.val, &format!("unrecognized word: {word}")))
}
tok => Err(make_error(
self.val,
&format!("finding next type, got unexpected '{tok}'"),
Expand Down Expand Up @@ -137,15 +134,14 @@ impl<'a> Parser<'a> {

/// Parses the next double quoted string
fn parse_double_quoted_string(&mut self, context: &str) -> ArrowResult<String> {
match self.next_token()? {
Token::DoubleQuotedString(s) => Ok(s),
Token::FieldName(word) => {
Err(make_error(self.val, &format!("unrecognized word: {word}")))
}
tok => Err(make_error(
let token = self.next_token()?;
if let Token::DoubleQuotedString(string) = token {
Ok(string)
} else {
Err(make_error(
self.val,
&format!("finding double quoted string for {context}, got '{tok}'"),
)),
&format!("expected double quoted string for {context}, got '{token}'"),
))
}
}

Expand Down Expand Up @@ -321,27 +317,22 @@ impl<'a> Parser<'a> {
self.expect_token(Token::LParen)?;
let mut fields = Vec::new();
loop {
// expects: "field name": [nullable] #datatype

let field_name = match self.next_token()? {
// It's valid to have a name that is a type name
Token::SimpleType(data_type) => data_type.to_string(),
Token::FieldName(name) => name,
Token::RParen => {
if fields.is_empty() {
break;
} else {
return Err(make_error(
self.val,
"Unexpected token while parsing Struct fields. Expected a word for the name of Struct, but got trailing comma",
));
}
break;
}
Token::DoubleQuotedString(field_name) => field_name,
tok => {
return Err(make_error(
self.val,
&format!("Expected a word for the name of Struct, but got {tok}"),
&format!("Expected a quoted string for a field name; got {tok:?}"),
))
}
};
self.expect_token(Token::Colon)?;

let nullable = self
.tokenizer
.next_if(|next| matches!(next, Ok(Token::Nullable)))
Expand Down Expand Up @@ -383,7 +374,7 @@ impl<'a> Parser<'a> {

/// returns true if this character is a separator
fn is_separator(c: char) -> bool {
c == '(' || c == ')' || c == ',' || c == ' '
c == '(' || c == ')' || c == ',' || c == ':' || c == ' '
}

#[derive(Debug)]
Expand Down Expand Up @@ -445,50 +436,6 @@ impl<'a> Tokenizer<'a> {
})?;
return Ok(Token::Integer(val));
}
// if it started with a double quote `"`, try parsing it as a double quoted string
else if c == '"' {
let len = self.word.chars().count();

// to verify it's double quoted
if let Some(last_c) = self.word.chars().last() {
if last_c != '"' || len < 2 {
return Err(make_error(
self.val,
&format!(
"parsing {} as double quoted string: last char must be \"",
self.word
),
));
}
}

if len == 2 {
return Err(make_error(
self.val,
&format!(
"parsing {} as double quoted string: empty string isn't supported",
self.word
),
));
}

let val: String = self.word.parse().map_err(|e| {
make_error(
self.val,
&format!("parsing {} as double quoted string: {e}", self.word),
)
})?;

let s = val[1..len - 1].to_string();
if s.contains('"') {
return Err(make_error(
self.val,
&format!("parsing {} as double quoted string: escaped double quote isn't supported", self.word),
));
}

return Ok(Token::DoubleQuotedString(s));
}
}

// figure out what the word was
Expand Down Expand Up @@ -554,11 +501,63 @@ impl<'a> Tokenizer<'a> {

"Struct" => Token::Struct,

// If we don't recognize the word, treat it as a field name
word => Token::FieldName(word.to_string()),
token => {
return Err(make_error(self.val, &format!("unknown token: {token}")));
}
};
Ok(token)
}

/// Parses e.g. `"foo bar"`
fn parse_quoted_string(&mut self) -> ArrowResult<Token> {
if self.next_char() != Some('\"') {
return Err(make_error(self.val, "Expected \""));
}

// reset temp space
self.word.clear();

let mut is_escaped = false;

loop {
match self.next_char() {
None => {
return Err(ArrowError::ParseError(format!(
"Unterminated string at: \"{}",
self.word
)));
}
Some(c) => match c {
'\\' => {
is_escaped = true;
self.word.push(c);
}
'"' => {
if is_escaped {
self.word.push(c);
is_escaped = false;
} else {
break;
}
}
c => {
self.word.push(c);
}
},
}
}

let val: String = self.word.parse().map_err(|err| {
ArrowError::ParseError(format!("Failed to parse string: \"{}\": {err}", self.word))
})?;

if val.is_empty() {
// Using empty strings as field names is just asking for trouble
return Err(make_error(self.val, "empty strings aren't allowed"));
}

Ok(Token::DoubleQuotedString(val))
}
}

impl Iterator for Tokenizer<'_> {
Expand All @@ -572,6 +571,9 @@ impl Iterator for Tokenizer<'_> {
self.next_char();
continue;
}
'"' => {
return Some(self.parse_quoted_string());
}
'(' => {
self.next_char();
return Some(Ok(Token::LParen));
Expand All @@ -584,6 +586,10 @@ impl Iterator for Tokenizer<'_> {
self.next_char();
return Some(Ok(Token::Comma));
}
':' => {
self.next_char();
return Some(Ok(Token::Colon));
}
_ => return Some(self.parse_word()),
}
}
Expand Down Expand Up @@ -612,6 +618,7 @@ enum Token {
LParen,
RParen,
Comma,
Colon,
Some,
None,
Integer(i64),
Expand All @@ -621,7 +628,6 @@ enum Token {
FixedSizeList,
Struct,
Nullable,
FieldName(String),
}

impl Display for Token {
Expand All @@ -641,6 +647,7 @@ impl Display for Token {
Token::LParen => write!(f, "("),
Token::RParen => write!(f, ")"),
Token::Comma => write!(f, ","),
Token::Colon => write!(f, ":"),
Token::Some => write!(f, "Some"),
Token::None => write!(f, "None"),
Token::FixedSizeBinary => write!(f, "FixedSizeBinary"),
Expand All @@ -653,7 +660,6 @@ impl Display for Token {
Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"),
Token::Struct => write!(f, "Struct"),
Token::Nullable => write!(f, "nullable"),
Token::FieldName(s) => write!(f, "FieldName({s})"),
}
}
}
Expand Down Expand Up @@ -837,19 +843,19 @@ mod test {
("Nu", "Unsupported type 'Nu'"),
(
r#"Timestamp(ns, +00:00)"#,
"Error unrecognized word: +00:00",
"Error unknown token: +00",
),
(
r#"Timestamp(ns, "+00:00)"#,
r#"parsing "+00:00 as double quoted string: last char must be ""#,
r#"Unterminated string at: "+00:00)"#,
),
(
r#"Timestamp(ns, "")"#,
r#"parsing "" as double quoted string: empty string isn't supported"#,
r#"empty strings aren't allowed"#,
),
(
r#"Timestamp(ns, "+00:00"")"#,
r#"parsing "+00:00"" as double quoted string: escaped double quote isn't supported"#,
r#"Parser error: Unterminated string at: ")"#,
),
("Timestamp(ns, ", "Error finding next token"),
(
Expand All @@ -871,9 +877,9 @@ mod test {
("Decimal64(3, 500)", "Error converting 500 into i8 for Decimal64: out of range integral type conversion attempted"),
("Decimal128(3, 500)", "Error converting 500 into i8 for Decimal128: out of range integral type conversion attempted"),
("Decimal256(3, 500)", "Error converting 500 into i8 for Decimal256: out of range integral type conversion attempted"),
("Struct(f1, Int64)", "Error finding next type, got unexpected ','"),
("Struct(f1 Int64,)", "Expected a word for the name of Struct, but got trailing comma"),
("Struct(f1)", "Error finding next type, got unexpected ')'"),
("Struct(f1 Int64)", "Error unknown token: f1"),
("Struct(\"f1\" Int64)", "Expected ':'"),
("Struct(\"f1\": )", "Error finding next type, got unexpected ')'"),
];

for (data_type_string, expected_message) in cases {
Expand All @@ -884,12 +890,13 @@ mod test {
let message = e.to_string();
assert!(
message.contains(expected_message),
"\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual:{message}\n"
"\n\ndid not find expected in actual.\n\nexpected: {expected_message}\nactual: {message}\n"
);
// errors should also contain a help message
assert!(message.contains(
"Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'"
));

if !message.contains("Unterminated string") {
// errors should also contain a help message
assert!(message.contains("Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'"), "message: {message}");
}
}
}
}
Expand All @@ -899,6 +906,6 @@ mod test {
fn parse_error_type() {
let err = parse_data_type("foobar").unwrap_err();
assert!(matches!(err, ArrowError::ParseError(_)));
assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unrecognized word: foobar");
assert_eq!(err.to_string(), "Parser error: Unsupported type 'foobar'. Must be a supported arrow type name such as 'Int32' or 'Timestamp(ns)'. Error unknown token: foobar");
}
}
2 changes: 1 addition & 1 deletion arrow-schema/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@ mod tests {
schema.to_string(),
"Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \
Field { \"last_name\": Utf8 }, \
Field { \"address\": Struct(street Utf8, zip UInt16) }, \
Field { \"address\": Struct(\"street\": Utf8, \"zip\": UInt16) }, \
Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 123, dict_is_ordered }"
)
}
Expand Down
Loading
Loading