From 94d53659ec6d04fc83ca09b9ea3b8c135f24c802 Mon Sep 17 00:00:00 2001 From: irenjj Date: Fri, 31 Jan 2025 20:22:38 +0800 Subject: [PATCH 01/27] feat: impl display for `DataType::List` --- arrow-schema/src/datatype.rs | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 5c9073c4eeb6..6e7c03002ae2 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -458,7 +458,13 @@ pub enum UnionMode { impl fmt::Display for DataType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - write!(f, "{self:?}") + match self { + DataType::List(field) => { + let nullstr = if field.is_nullable() { "Y" } else { "N" }; + write!(f, "List({};{})", field.data_type(), nullstr) + } + _ => write!(f, "{self:?}"), + } } } @@ -1129,4 +1135,18 @@ mod tests { let data_type: DataType = "UInt64".parse().unwrap(); assert_eq!(data_type, DataType::UInt64); } + + #[test] + fn test_display_list() { + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); + let list_data_type_string = list_data_type.to_string(); + let expected_string = "List(Int32;Y)"; + assert_eq!(list_data_type_string, expected_string); + + let list_data_type = + DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, false))); + let list_data_type_string = list_data_type.to_string(); + let expected_string = "List(UInt64;N)"; + assert_eq!(list_data_type_string, expected_string); + } } From 7ed104b60181fd5086f796621ba29a6c79edbd88 Mon Sep 17 00:00:00 2001 From: irenjj Date: Fri, 31 Jan 2025 20:29:37 +0800 Subject: [PATCH 02/27] fix --- arrow-schema/src/datatype.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 6e7c03002ae2..6f15061ef976 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -460,8 +460,11 @@ impl fmt::Display for DataType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DataType::List(field) => { - let nullstr = if field.is_nullable() { "Y" } else { "N" }; - write!(f, "List({};{})", field.data_type(), nullstr) + if field.is_nullable() { + write!(f, "List({};N)", field.data_type()) + } else { + write!(f, "List({})", field.data_type()) + } } _ => write!(f, "{self:?}"), } @@ -1140,13 +1143,13 @@ mod tests { fn test_display_list() { let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(Int32;Y)"; + let expected_string = "List(Int32;N)"; assert_eq!(list_data_type_string, expected_string); let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, false))); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(UInt64;N)"; + let expected_string = "List(UInt64)"; assert_eq!(list_data_type_string, expected_string); } } From d0ed2dbc3a97c1c3dc6802674a75fb478812fcd3 Mon Sep 17 00:00:00 2001 From: irenjj Date: Sat, 1 Feb 2025 10:21:52 +0800 Subject: [PATCH 03/27] add nest test --- arrow-schema/src/datatype.rs | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 6f15061ef976..4bed7cdd7539 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -1151,5 +1151,13 @@ mod tests { let list_data_type_string = list_data_type.to_string(); let expected_string = "List(UInt64)"; assert_eq!(list_data_type_string, expected_string); + + let nested_data_type = DataType::List(Arc::new(Field::new_list_field( + DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, false))), + false, + ))); + let nested_data_type_string = nested_data_type.to_string(); + let nested_expected_string = "List(List(UInt64))"; + assert_eq!(nested_data_type_string, nested_expected_string); } } From aef44fa4bd52f94b1c43515f5fbf4a29fbcbcf78 Mon Sep 17 00:00:00 2001 From: irenjj Date: Sat, 1 Feb 2025 22:11:51 +0800 Subject: [PATCH 04/27] fix --- arrow-schema/src/datatype.rs | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 4bed7cdd7539..ca6735a9dea5 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -460,11 +460,15 @@ impl fmt::Display for DataType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DataType::List(field) => { - if field.is_nullable() { - write!(f, "List({};N)", field.data_type()) - } else { - write!(f, "List({})", field.data_type()) - } + let nullstr = if field.is_nullable() { ";N" } else { "" }; + write!( + f, + "List({}{}, field = '{}', metadata = {:?})", + field.data_type(), + nullstr, + field.name(), + field.metadata() + ) } _ => write!(f, "{self:?}"), } @@ -1143,21 +1147,27 @@ mod tests { fn test_display_list() { let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(Int32;N)"; + let expected_string = "List(Int32;N, field = 'item', metadata = {})"; assert_eq!(list_data_type_string, expected_string); + } - let list_data_type = - DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, false))); + #[test] + fn test_display_list_with_named_field() { + let list_data_type = DataType::List(Arc::new(Field::new("foo", DataType::UInt64, false))); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(UInt64)"; + let expected_string = "List(UInt64, field = 'foo', metadata = {})"; assert_eq!(list_data_type_string, expected_string); + } + #[test] + fn test_display_nested_list() { let nested_data_type = DataType::List(Arc::new(Field::new_list_field( DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, false))), false, ))); let nested_data_type_string = nested_data_type.to_string(); - let nested_expected_string = "List(List(UInt64))"; + let nested_expected_string = + "List(List(UInt64, field = 'item', metadata = {}), field = 'item', metadata = {})"; assert_eq!(nested_data_type_string, nested_expected_string); } } From 73e8ce92428ed685732e271324eb7f2dd1518377 Mon Sep 17 00:00:00 2001 From: irenjj Date: Sat, 1 Feb 2025 22:31:40 +0800 Subject: [PATCH 05/27] fix --- arrow-schema/src/datatype.rs | 37 ++++++++++++++++++++++++++++-------- 1 file changed, 29 insertions(+), 8 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index ca6735a9dea5..7faa844cb5f1 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -460,14 +460,19 @@ impl fmt::Display for DataType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { DataType::List(field) => { - let nullstr = if field.is_nullable() { ";N" } else { "" }; + let null_str = if field.is_nullable() { ";N" } else { "" }; + let metadata_str = if field.metadata().is_empty() { + String::new() + } else { + format!(", metadata = {:?}", field.metadata()) + }; write!( f, - "List({}{}, field = '{}', metadata = {:?})", + "List({}{}, field = '{}'{})", field.data_type(), - nullstr, + null_str, field.name(), - field.metadata() + metadata_str, ) } _ => write!(f, "{self:?}"), @@ -830,6 +835,8 @@ pub const DECIMAL_DEFAULT_SCALE: i8 = 10; #[cfg(test)] mod tests { + use std::collections::HashMap; + use super::*; #[test] @@ -1147,7 +1154,7 @@ mod tests { fn test_display_list() { let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(Int32;N, field = 'item', metadata = {})"; + let expected_string = "List(Int32;N, field = 'item')"; assert_eq!(list_data_type_string, expected_string); } @@ -1155,7 +1162,7 @@ mod tests { fn test_display_list_with_named_field() { let list_data_type = DataType::List(Arc::new(Field::new("foo", DataType::UInt64, false))); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(UInt64, field = 'foo', metadata = {})"; + let expected_string = "List(UInt64, field = 'foo')"; assert_eq!(list_data_type_string, expected_string); } @@ -1166,8 +1173,22 @@ mod tests { false, ))); let nested_data_type_string = nested_data_type.to_string(); - let nested_expected_string = - "List(List(UInt64, field = 'item', metadata = {}), field = 'item', metadata = {})"; + let nested_expected_string = "List(List(UInt64, field = 'item'), field = 'item')"; assert_eq!(nested_data_type_string, nested_expected_string); } + + #[test] + fn test_display_list_with_metadata() { + let mut field = Field::new_list_field(DataType::Int32, true); + let metadata = HashMap::from([ + ("foo1".to_string(), "value1".to_string()), + ("foo2".to_string(), "value2".to_string()), + ]); + field.set_metadata(metadata); + let list_data_type = DataType::List(Arc::new(field)); + let list_data_type_string = list_data_type.to_string(); + let expected_string = "List(Int32;N, field = 'item', metadata = {\"foo2\": \"value2\", \"foo1\": \"value1\"})"; + + assert_eq!(list_data_type_string, expected_string); + } } From e381095b8e00e1932534d013968989ee154e79c4 Mon Sep 17 00:00:00 2001 From: irenjj Date: Tue, 11 Feb 2025 19:00:13 +0800 Subject: [PATCH 06/27] fix test --- arrow-schema/src/datatype.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 7faa844cb5f1..462ec612a67e 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -1180,14 +1180,11 @@ mod tests { #[test] fn test_display_list_with_metadata() { let mut field = Field::new_list_field(DataType::Int32, true); - let metadata = HashMap::from([ - ("foo1".to_string(), "value1".to_string()), - ("foo2".to_string(), "value2".to_string()), - ]); + let metadata = HashMap::from([("foo1".to_string(), "value1".to_string())]); field.set_metadata(metadata); let list_data_type = DataType::List(Arc::new(field)); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(Int32;N, field = 'item', metadata = {\"foo2\": \"value2\", \"foo1\": \"value1\"})"; + let expected_string = "List(Int32;N, field = 'item', metadata = {\"foo1\": \"value1\"})"; assert_eq!(list_data_type_string, expected_string); } From bb7ada1333c79e1933063ab23d7a43bcd960d393 Mon Sep 17 00:00:00 2001 From: irenjj Date: Sat, 12 Apr 2025 21:43:55 +0800 Subject: [PATCH 07/27] fix --- arrow-schema/src/datatype.rs | 72 +++++++++++++++++++++++++++++++++++- 1 file changed, 71 insertions(+), 1 deletion(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 462ec612a67e..fe3046058aaf 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -459,7 +459,7 @@ pub enum UnionMode { impl fmt::Display for DataType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self { - DataType::List(field) => { + DataType::List(field) | DataType::LargeList(field) => { let null_str = if field.is_nullable() { ";N" } else { "" }; let metadata_str = if field.metadata().is_empty() { String::new() @@ -475,6 +475,24 @@ impl fmt::Display for DataType { metadata_str, ) } + DataType::FixedSizeList(field, size) => { + let null_str = if field.is_nullable() { ";N" } else { "" }; + let metadata_str = if field.metadata().is_empty() { + String::new() + } else { + format!(", metadata = {:?}", field.metadata()) + }; + write!( + f, + "FixedSizeList({}{}, field = '{}', size = {}{})", + field.data_type(), + null_str, + field.name(), + size, + metadata_str, + ) + } + _ => write!(f, "{self:?}"), } } @@ -1188,4 +1206,56 @@ mod tests { assert_eq!(list_data_type_string, expected_string); } + + #[test] + fn test_display_large_list() { + let large_list_data_type = + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); + let large_list_data_type_string = large_list_data_type.to_string(); + let expected_string = "List(Int32;N, field = 'item')"; + assert_eq!(large_list_data_type_string, expected_string); + + // Test with named field + let large_list_named = + DataType::LargeList(Arc::new(Field::new("bar", DataType::UInt64, false))); + let large_list_named_string = large_list_named.to_string(); + let expected_named_string = "List(UInt64, field = 'bar')"; + assert_eq!(large_list_named_string, expected_named_string); + + // Test with metadata + let mut field = Field::new_list_field(DataType::Int32, true); + let metadata = HashMap::from([("key1".to_string(), "value1".to_string())]); + field.set_metadata(metadata); + let large_list_metadata = DataType::LargeList(Arc::new(field)); + let large_list_metadata_string = large_list_metadata.to_string(); + let expected_metadata_string = + "List(Int32;N, field = 'item', metadata = {\"key1\": \"value1\"})"; + assert_eq!(large_list_metadata_string, expected_metadata_string); + } + + #[test] + fn test_display_fixed_size_list() { + let fixed_size_list = + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 5); + let fixed_size_list_string = fixed_size_list.to_string(); + let expected_string = "FixedSizeList(Int32;N, field = 'item', size = 5)"; + assert_eq!(fixed_size_list_string, expected_string); + + // Test with named field + let fixed_size_named = + DataType::FixedSizeList(Arc::new(Field::new("baz", DataType::UInt64, false)), 3); + let fixed_size_named_string = fixed_size_named.to_string(); + let expected_named_string = "FixedSizeList(UInt64, field = 'baz', size = 3)"; + assert_eq!(fixed_size_named_string, expected_named_string); + + // Test with metadata + let mut field = Field::new_list_field(DataType::Int32, true); + let metadata = HashMap::from([("key2".to_string(), "value2".to_string())]); + field.set_metadata(metadata); + let fixed_size_metadata = DataType::FixedSizeList(Arc::new(field), 4); + let fixed_size_metadata_string = fixed_size_metadata.to_string(); + let expected_metadata_string = + "FixedSizeList(Int32;N, field = 'item', size = 4, metadata = {\"key2\": \"value2\"})"; + assert_eq!(fixed_size_metadata_string, expected_metadata_string); + } } From 02384b97e5e08ba4dce030ea398bce42be878621 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 17:35:35 +0200 Subject: [PATCH 08/27] More compact formatting, and distinguish List and LargeList --- arrow-schema/src/datatype.rs | 75 +++++++++++++++++++++++++----------- 1 file changed, 52 insertions(+), 23 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index fe3046058aaf..c6776d8bb292 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -458,40 +458,69 @@ pub enum UnionMode { impl fmt::Display for DataType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match self { + match &self { DataType::List(field) | DataType::LargeList(field) => { - let null_str = if field.is_nullable() { ";N" } else { "" }; + let type_name = if matches!(self, DataType::List(_)) { + "List" + } else { + "LargeList" + }; + + let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; let metadata_str = if field.metadata().is_empty() { String::new() } else { format!(", metadata = {:?}", field.metadata()) }; + + let name = field.name(); + let data_type = field.data_type(); + + let field_name_str = if name == "item" { + String::default() + } else { + format!(", field = '{name}'") + }; + + // e.g. `LargeList(nullable Uint32) write!( f, - "List({}{}, field = '{}'{})", - field.data_type(), - null_str, - field.name(), - metadata_str, + "{type_name}({maybe_nullable}{data_type}{field_name_str}{metadata_str})" ) } DataType::FixedSizeList(field, size) => { - let null_str = if field.is_nullable() { ";N" } else { "" }; + let name = field.name(); + let data_type = field.data_type(); + let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; + let field_name_str = if name == "item" { + String::default() + } else { + format!(", field = '{name}'") + }; let metadata_str = if field.metadata().is_empty() { String::new() } else { format!(", metadata = {:?}", field.metadata()) }; + write!( f, - "FixedSizeList({}{}, field = '{}', size = {}{})", - field.data_type(), - null_str, - field.name(), - size, - metadata_str, + "FixedSizeList({size} x {maybe_nullable}{data_type}{field_name_str}{metadata_str})", ) } + DataType::Struct(fields) => { + write!(f, "Struct(")?; + if !fields.is_empty() { + let fields_str = fields + .iter() + .map(|f| format!("{} {}", f.name(), f.data_type())) + .collect::>() + .join(", "); + write!(f, "{fields_str}")?; + } + write!(f, ")")?; + Ok(()) + } _ => write!(f, "{self:?}"), } @@ -1172,7 +1201,7 @@ mod tests { fn test_display_list() { let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(Int32;N, field = 'item')"; + let expected_string = "List(nullable Int32)"; assert_eq!(list_data_type_string, expected_string); } @@ -1191,7 +1220,7 @@ mod tests { false, ))); let nested_data_type_string = nested_data_type.to_string(); - let nested_expected_string = "List(List(UInt64, field = 'item'), field = 'item')"; + let nested_expected_string = "List(List(UInt64))"; assert_eq!(nested_data_type_string, nested_expected_string); } @@ -1202,7 +1231,7 @@ mod tests { field.set_metadata(metadata); let list_data_type = DataType::List(Arc::new(field)); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(Int32;N, field = 'item', metadata = {\"foo1\": \"value1\"})"; + let expected_string = "List(nullable Int32, metadata = {\"foo1\": \"value1\"})"; assert_eq!(list_data_type_string, expected_string); } @@ -1212,14 +1241,14 @@ mod tests { let large_list_data_type = DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); let large_list_data_type_string = large_list_data_type.to_string(); - let expected_string = "List(Int32;N, field = 'item')"; + let expected_string = "LargeList(nullable Int32)"; assert_eq!(large_list_data_type_string, expected_string); // Test with named field let large_list_named = DataType::LargeList(Arc::new(Field::new("bar", DataType::UInt64, false))); let large_list_named_string = large_list_named.to_string(); - let expected_named_string = "List(UInt64, field = 'bar')"; + let expected_named_string = "LargeList(UInt64, field = 'bar')"; assert_eq!(large_list_named_string, expected_named_string); // Test with metadata @@ -1229,7 +1258,7 @@ mod tests { let large_list_metadata = DataType::LargeList(Arc::new(field)); let large_list_metadata_string = large_list_metadata.to_string(); let expected_metadata_string = - "List(Int32;N, field = 'item', metadata = {\"key1\": \"value1\"})"; + "LargeList(nullable Int32, metadata = {\"key1\": \"value1\"})"; assert_eq!(large_list_metadata_string, expected_metadata_string); } @@ -1238,14 +1267,14 @@ mod tests { let fixed_size_list = DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 5); let fixed_size_list_string = fixed_size_list.to_string(); - let expected_string = "FixedSizeList(Int32;N, field = 'item', size = 5)"; + let expected_string = "FixedSizeList(5 x nullable Int32)"; assert_eq!(fixed_size_list_string, expected_string); // Test with named field let fixed_size_named = DataType::FixedSizeList(Arc::new(Field::new("baz", DataType::UInt64, false)), 3); let fixed_size_named_string = fixed_size_named.to_string(); - let expected_named_string = "FixedSizeList(UInt64, field = 'baz', size = 3)"; + let expected_named_string = "FixedSizeList(3 x UInt64, field = 'baz')"; assert_eq!(fixed_size_named_string, expected_named_string); // Test with metadata @@ -1255,7 +1284,7 @@ mod tests { let fixed_size_metadata = DataType::FixedSizeList(Arc::new(field), 4); let fixed_size_metadata_string = fixed_size_metadata.to_string(); let expected_metadata_string = - "FixedSizeList(Int32;N, field = 'item', size = 4, metadata = {\"key2\": \"value2\"})"; + "FixedSizeList(4 x nullable Int32, metadata = {\"key2\": \"value2\"})"; assert_eq!(fixed_size_metadata_string, expected_metadata_string); } } From 3f0e66b40bcec46228f9231204e4738db4fd0c7f Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 18:09:20 +0200 Subject: [PATCH 09/27] Break out formatting to own file --- arrow-schema/src/datatype.rs | 166 +------------------ arrow-schema/src/datatype_format.rs | 241 ++++++++++++++++++++++++++++ arrow-schema/src/lib.rs | 1 + arrow-schema/src/schema.rs | 5 +- 4 files changed, 244 insertions(+), 169 deletions(-) create mode 100644 arrow-schema/src/datatype_format.rs diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 2c6afbf91a5f..61f1d4c8fb70 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. -use std::fmt; use std::str::FromStr; use std::sync::Arc; @@ -92,7 +91,7 @@ use crate::{ArrowError, Field, FieldRef, Fields, UnionFields}; /// /// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs /// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum DataType { /// Null type @@ -484,76 +483,6 @@ pub enum UnionMode { Dense, } -impl fmt::Display for DataType { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - match &self { - DataType::List(field) | DataType::LargeList(field) => { - let type_name = if matches!(self, DataType::List(_)) { - "List" - } else { - "LargeList" - }; - - let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; - let metadata_str = if field.metadata().is_empty() { - String::new() - } else { - format!(", metadata = {:?}", field.metadata()) - }; - - let name = field.name(); - let data_type = field.data_type(); - - let field_name_str = if name == "item" { - String::default() - } else { - format!(", field = '{name}'") - }; - - // e.g. `LargeList(nullable Uint32) - write!( - f, - "{type_name}({maybe_nullable}{data_type}{field_name_str}{metadata_str})" - ) - } - DataType::FixedSizeList(field, size) => { - let name = field.name(); - let data_type = field.data_type(); - let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; - let field_name_str = if name == "item" { - String::default() - } else { - format!(", field = '{name}'") - }; - let metadata_str = if field.metadata().is_empty() { - String::new() - } else { - format!(", metadata = {:?}", field.metadata()) - }; - - write!( - f, - "FixedSizeList({size} x {maybe_nullable}{data_type}{field_name_str}{metadata_str})", - ) - } - DataType::Struct(fields) => { - write!(f, "Struct(")?; - if !fields.is_empty() { - let fields_str = fields - .iter() - .map(|f| format!("{} {}", f.name(), f.data_type())) - .collect::>() - .join(", "); - write!(f, "{fields_str}")?; - } - write!(f, ")")?; - Ok(()) - } - _ => write!(f, "{self:?}"), - } - } -} - /// Parses `str` into a `DataType`. /// /// This is the reverse of [`DataType`]'s `Display` @@ -934,8 +863,6 @@ pub const DECIMAL_DEFAULT_SCALE: i8 = 10; #[cfg(test)] mod tests { - use std::collections::HashMap; - use super::*; #[test] @@ -1248,95 +1175,4 @@ mod tests { let data_type: DataType = "UInt64".parse().unwrap(); assert_eq!(data_type, DataType::UInt64); } - - #[test] - fn test_display_list() { - let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); - let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(nullable Int32)"; - assert_eq!(list_data_type_string, expected_string); - } - - #[test] - fn test_display_list_with_named_field() { - let list_data_type = DataType::List(Arc::new(Field::new("foo", DataType::UInt64, false))); - let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(UInt64, field = 'foo')"; - assert_eq!(list_data_type_string, expected_string); - } - - #[test] - fn test_display_nested_list() { - let nested_data_type = DataType::List(Arc::new(Field::new_list_field( - DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, false))), - false, - ))); - let nested_data_type_string = nested_data_type.to_string(); - let nested_expected_string = "List(List(UInt64))"; - assert_eq!(nested_data_type_string, nested_expected_string); - } - - #[test] - fn test_display_list_with_metadata() { - let mut field = Field::new_list_field(DataType::Int32, true); - let metadata = HashMap::from([("foo1".to_string(), "value1".to_string())]); - field.set_metadata(metadata); - let list_data_type = DataType::List(Arc::new(field)); - let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(nullable Int32, metadata = {\"foo1\": \"value1\"})"; - - assert_eq!(list_data_type_string, expected_string); - } - - #[test] - fn test_display_large_list() { - let large_list_data_type = - DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); - let large_list_data_type_string = large_list_data_type.to_string(); - let expected_string = "LargeList(nullable Int32)"; - assert_eq!(large_list_data_type_string, expected_string); - - // Test with named field - let large_list_named = - DataType::LargeList(Arc::new(Field::new("bar", DataType::UInt64, false))); - let large_list_named_string = large_list_named.to_string(); - let expected_named_string = "LargeList(UInt64, field = 'bar')"; - assert_eq!(large_list_named_string, expected_named_string); - - // Test with metadata - let mut field = Field::new_list_field(DataType::Int32, true); - let metadata = HashMap::from([("key1".to_string(), "value1".to_string())]); - field.set_metadata(metadata); - let large_list_metadata = DataType::LargeList(Arc::new(field)); - let large_list_metadata_string = large_list_metadata.to_string(); - let expected_metadata_string = - "LargeList(nullable Int32, metadata = {\"key1\": \"value1\"})"; - assert_eq!(large_list_metadata_string, expected_metadata_string); - } - - #[test] - fn test_display_fixed_size_list() { - let fixed_size_list = - DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 5); - let fixed_size_list_string = fixed_size_list.to_string(); - let expected_string = "FixedSizeList(5 x nullable Int32)"; - assert_eq!(fixed_size_list_string, expected_string); - - // Test with named field - let fixed_size_named = - DataType::FixedSizeList(Arc::new(Field::new("baz", DataType::UInt64, false)), 3); - let fixed_size_named_string = fixed_size_named.to_string(); - let expected_named_string = "FixedSizeList(3 x UInt64, field = 'baz')"; - assert_eq!(fixed_size_named_string, expected_named_string); - - // Test with metadata - let mut field = Field::new_list_field(DataType::Int32, true); - let metadata = HashMap::from([("key2".to_string(), "value2".to_string())]); - field.set_metadata(metadata); - let fixed_size_metadata = DataType::FixedSizeList(Arc::new(field), 4); - let fixed_size_metadata_string = fixed_size_metadata.to_string(); - let expected_metadata_string = - "FixedSizeList(4 x nullable Int32, metadata = {\"key2\": \"value2\"})"; - assert_eq!(fixed_size_metadata_string, expected_metadata_string); - } } diff --git a/arrow-schema/src/datatype_format.rs b/arrow-schema/src/datatype_format.rs new file mode 100644 index 000000000000..8b60da51a142 --- /dev/null +++ b/arrow-schema/src/datatype_format.rs @@ -0,0 +1,241 @@ +use std::{collections::HashMap, fmt}; + +use crate::DataType; + +impl fmt::Debug for DataType { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + fn format_metadata(metadata: &HashMap) -> String { + if metadata.is_empty() { + String::new() + } else { + format!(", metadata = {metadata:?}") + } + } + + match &self { + Self::Null => write!(f, "Null"), + Self::Boolean => write!(f, "Boolean"), + Self::Int8 => write!(f, "Int8"), + Self::Int16 => write!(f, "Int16"), + Self::Int32 => write!(f, "Int32"), + Self::Int64 => write!(f, "Int64"), + Self::UInt8 => write!(f, "UInt8"), + Self::UInt16 => write!(f, "UInt16"), + Self::UInt32 => write!(f, "UInt32"), + Self::UInt64 => write!(f, "UInt64"), + Self::Float16 => write!(f, "Float16"), + Self::Float32 => write!(f, "Float32"), + Self::Float64 => write!(f, "Float64"), + Self::Timestamp(time_unit, timezone) => { + write!(f, "Timestamp({time_unit:?}, {timezone:?})") + } + Self::Date32 => write!(f, "Date32"), + Self::Date64 => write!(f, "Date64"), + Self::Time32(time_unit) => write!(f, "Time32({time_unit:?})"), + Self::Time64(time_unit) => write!(f, "Time64({time_unit:?})"), + Self::Duration(time_unit) => write!(f, "Duration({time_unit:?})"), + Self::Interval(interval_unit) => write!(f, "Interval({interval_unit:?})"), + Self::Binary => write!(f, "Binary"), + Self::FixedSizeBinary(bytes_per_value) => { + write!(f, "FixedSizeBinary({bytes_per_value:?})") + } + Self::LargeBinary => write!(f, "LargeBinary"), + Self::BinaryView => write!(f, "BinaryView"), + Self::Utf8 => write!(f, "Utf8"), + Self::LargeUtf8 => write!(f, "LargeUtf8"), + Self::Utf8View => write!(f, "Utf8View"), + Self::ListView(field) => write!(f, "ListView({field:?})"), // TODO: make more readable + Self::LargeListView(field) => write!(f, "LargeListView({field:?})"), // TODO: make more readable + Self::List(field) | Self::LargeList(field) => { + let type_name = if matches!(self, Self::List(_)) { + "List" + } else { + "LargeList" + }; + + let name = field.name(); + let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; + let data_type = field.data_type(); + let field_name_str = if name == "item" { + String::default() + } else { + format!(", field = '{name}'") + }; + let metadata_str = format_metadata(field.metadata()); + + // e.g. `LargeList(nullable Uint32) + write!( + f, + "{type_name}({maybe_nullable}{data_type}{field_name_str}{metadata_str})" + ) + } + Self::FixedSizeList(field, size) => { + let name = field.name(); + let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; + let data_type = field.data_type(); + let field_name_str = if name == "item" { + String::default() + } else { + format!(", field = '{name}'") + }; + let metadata_str = format_metadata(field.metadata()); + + write!( + f, + "FixedSizeList({size} x {maybe_nullable}{data_type}{field_name_str}{metadata_str})", + ) + } + Self::Struct(fields) => { + write!(f, "Struct(")?; + if !fields.is_empty() { + let fields_str = fields + .iter() + .map(|field| { + let name = field.name(); + // let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; + let maybe_nullable = ""; // TODO: add support for this in `fn parse_struct` + let data_type = field.data_type(); + let metadata_str = format_metadata(field.metadata()); + format!("{name} {maybe_nullable}{data_type}{metadata_str}") + }) + .collect::>() + .join(", "); + write!(f, "{fields_str}")?; + } + write!(f, ")")?; + Ok(()) + } + Self::Union(union_fields, union_mode) => { + write!(f, "Union({union_fields:?}, {union_mode:?})") + } + Self::Dictionary(data_type, data_type1) => { + write!(f, "Dictionary({data_type:?}, {data_type1:?})") + } + Self::Decimal32(precision, scale) => write!(f, "Decimal32({precision:?}, {scale:?})"), + Self::Decimal64(precision, scale) => write!(f, "Decimal64({precision:?}, {scale:?})"), + Self::Decimal128(precision, scale) => write!(f, "Decimal128({precision:?}, {scale:?})"), + Self::Decimal256(precision, scale) => write!(f, "Decimal256({precision:?}, {scale:?})"), + Self::Map(field, keys_are_sorted) => write!(f, "Map({field:?}, {keys_are_sorted:?})"), + Self::RunEndEncoded(run_ends_field, values_field) => { + write!(f, "RunEndEncoded({run_ends_field:?}, {values_field:?})") + } + } + } +} + +impl fmt::Display for DataType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // NOTE: `Display` and `Debug` formatting are ALWAYS the same, + // because we want BOTH to be easy to read AND reversible! + write!(f, "{self:?}") + } +} + +#[cfg(test)] +mod tests { + + use std::sync::Arc; + + use crate::Field; + + use super::*; + + #[test] + fn test_display_list() { + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); + let list_data_type_string = list_data_type.to_string(); + let expected_string = "List(nullable Int32)"; + assert_eq!(list_data_type_string, expected_string); + } + + #[test] + fn test_display_list_with_named_field() { + let list_data_type = DataType::List(Arc::new(Field::new("foo", DataType::UInt64, false))); + let list_data_type_string = list_data_type.to_string(); + let expected_string = "List(UInt64, field = 'foo')"; + assert_eq!(list_data_type_string, expected_string); + } + + #[test] + fn test_display_nested_list() { + let nested_data_type = DataType::List(Arc::new(Field::new_list_field( + DataType::List(Arc::new(Field::new_list_field(DataType::UInt64, false))), + false, + ))); + let nested_data_type_string = nested_data_type.to_string(); + let nested_expected_string = "List(List(UInt64))"; + assert_eq!(nested_data_type_string, nested_expected_string); + } + + #[test] + fn test_display_list_with_metadata() { + let mut field = Field::new_list_field(DataType::Int32, true); + let metadata = HashMap::from([("foo1".to_string(), "value1".to_string())]); + field.set_metadata(metadata); + let list_data_type = DataType::List(Arc::new(field)); + let list_data_type_string = list_data_type.to_string(); + let expected_string = "List(nullable Int32, metadata = {\"foo1\": \"value1\"})"; + + assert_eq!(list_data_type_string, expected_string); + } + + #[test] + fn test_display_large_list() { + let large_list_data_type = + DataType::LargeList(Arc::new(Field::new_list_field(DataType::Int32, true))); + let large_list_data_type_string = large_list_data_type.to_string(); + let expected_string = "LargeList(nullable Int32)"; + assert_eq!(large_list_data_type_string, expected_string); + + // Test with named field + let large_list_named = + DataType::LargeList(Arc::new(Field::new("bar", DataType::UInt64, false))); + let large_list_named_string = large_list_named.to_string(); + let expected_named_string = "LargeList(UInt64, field = 'bar')"; + assert_eq!(large_list_named_string, expected_named_string); + + // Test with metadata + let mut field = Field::new_list_field(DataType::Int32, true); + let metadata = HashMap::from([("key1".to_string(), "value1".to_string())]); + field.set_metadata(metadata); + let large_list_metadata = DataType::LargeList(Arc::new(field)); + let large_list_metadata_string = large_list_metadata.to_string(); + let expected_metadata_string = + "LargeList(nullable Int32, metadata = {\"key1\": \"value1\"})"; + assert_eq!(large_list_metadata_string, expected_metadata_string); + } + + #[test] + fn test_display_fixed_size_list() { + let fixed_size_list = + DataType::FixedSizeList(Arc::new(Field::new_list_field(DataType::Int32, true)), 5); + let fixed_size_list_string = fixed_size_list.to_string(); + let expected_string = "FixedSizeList(5 x nullable Int32)"; + assert_eq!(fixed_size_list_string, expected_string); + + // Test with named field + let fixed_size_named = + DataType::FixedSizeList(Arc::new(Field::new("baz", DataType::UInt64, false)), 3); + let fixed_size_named_string = fixed_size_named.to_string(); + let expected_named_string = "FixedSizeList(3 x UInt64, field = 'baz')"; + assert_eq!(fixed_size_named_string, expected_named_string); + + // Test with metadata + let mut field = Field::new_list_field(DataType::Int32, true); + let metadata = HashMap::from([("key2".to_string(), "value2".to_string())]); + field.set_metadata(metadata); + let fixed_size_metadata = DataType::FixedSizeList(Arc::new(field), 4); + let fixed_size_metadata_string = fixed_size_metadata.to_string(); + let expected_metadata_string = + "FixedSizeList(4 x nullable Int32, metadata = {\"key2\": \"value2\"})"; + assert_eq!(fixed_size_metadata_string, expected_metadata_string); + } + + #[test] + fn test_debug_fmt() { + let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); + let list_data_type_string = format!("{list_data_type:?}"); + let expected_string = "List(nullable Int32)"; + assert_eq!(list_data_type_string, expected_string); + } +} diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index d1befbd04ff8..6571209cf70a 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -28,6 +28,7 @@ mod datatype; pub use datatype::*; use std::fmt::Display; +mod datatype_format; mod datatype_parse; mod error; pub use error::*; diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 04c01f18e1d8..94aa1afa7026 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -700,10 +700,7 @@ mod tests { assert_eq!(schema.to_string(), "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {\"k\": \"v\"} }, \ Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"address\", data_type: Struct([\ - Field { name: \"street\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"zip\", data_type: UInt16, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }\ - ]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"address\", data_type: Struct(street Utf8, zip UInt16), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: {} }") } From 0661f1fefca4e3d73c7014e35dccd096c0753ced Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 18:26:14 +0200 Subject: [PATCH 10/27] Add a comment with design goals --- arrow-schema/src/datatype_format.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arrow-schema/src/datatype_format.rs b/arrow-schema/src/datatype_format.rs index 8b60da51a142..85c7d1844991 100644 --- a/arrow-schema/src/datatype_format.rs +++ b/arrow-schema/src/datatype_format.rs @@ -12,6 +12,12 @@ impl fmt::Debug for DataType { } } + // A lot of these can still be improved a lot. + // _Some_ of these can be parsed with `FromStr`, but not all (YET!). + // The goal is that the formatting should always be + // * Terse and teadable + // * Reversible (contain all necessary information to reverse it perfectly) + match &self { Self::Null => write!(f, "Null"), Self::Boolean => write!(f, "Boolean"), From b4e9be554957e7c490a04b03294855b34c015c4d Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 18:27:27 +0200 Subject: [PATCH 11/27] Use `: ` syntax instead of ` = ` --- arrow-schema/src/datatype_format.rs | 34 ++++++++++++++--------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arrow-schema/src/datatype_format.rs b/arrow-schema/src/datatype_format.rs index 85c7d1844991..4a17fdc1d0b8 100644 --- a/arrow-schema/src/datatype_format.rs +++ b/arrow-schema/src/datatype_format.rs @@ -2,13 +2,21 @@ use std::{collections::HashMap, fmt}; use crate::DataType; +impl fmt::Display for DataType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + // NOTE: `Display` and `Debug` formatting are ALWAYS the same, + // because we want BOTH to be easy to read AND reversible! + write!(f, "{self:?}") + } +} + impl fmt::Debug for DataType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn format_metadata(metadata: &HashMap) -> String { if metadata.is_empty() { String::new() } else { - format!(", metadata = {metadata:?}") + format!(", metadata: {metadata:?}") } } @@ -65,7 +73,7 @@ impl fmt::Debug for DataType { let field_name_str = if name == "item" { String::default() } else { - format!(", field = '{name}'") + format!(", field: '{name}'") }; let metadata_str = format_metadata(field.metadata()); @@ -82,7 +90,7 @@ impl fmt::Debug for DataType { let field_name_str = if name == "item" { String::default() } else { - format!(", field = '{name}'") + format!(", field: '{name}'") }; let metadata_str = format_metadata(field.metadata()); @@ -129,14 +137,6 @@ impl fmt::Debug for DataType { } } -impl fmt::Display for DataType { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // NOTE: `Display` and `Debug` formatting are ALWAYS the same, - // because we want BOTH to be easy to read AND reversible! - write!(f, "{self:?}") - } -} - #[cfg(test)] mod tests { @@ -158,7 +158,7 @@ mod tests { fn test_display_list_with_named_field() { let list_data_type = DataType::List(Arc::new(Field::new("foo", DataType::UInt64, false))); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(UInt64, field = 'foo')"; + let expected_string = "List(UInt64, field: 'foo')"; assert_eq!(list_data_type_string, expected_string); } @@ -180,7 +180,7 @@ mod tests { field.set_metadata(metadata); let list_data_type = DataType::List(Arc::new(field)); let list_data_type_string = list_data_type.to_string(); - let expected_string = "List(nullable Int32, metadata = {\"foo1\": \"value1\"})"; + let expected_string = "List(nullable Int32, metadata: {\"foo1\": \"value1\"})"; assert_eq!(list_data_type_string, expected_string); } @@ -197,7 +197,7 @@ mod tests { let large_list_named = DataType::LargeList(Arc::new(Field::new("bar", DataType::UInt64, false))); let large_list_named_string = large_list_named.to_string(); - let expected_named_string = "LargeList(UInt64, field = 'bar')"; + let expected_named_string = "LargeList(UInt64, field: 'bar')"; assert_eq!(large_list_named_string, expected_named_string); // Test with metadata @@ -207,7 +207,7 @@ mod tests { let large_list_metadata = DataType::LargeList(Arc::new(field)); let large_list_metadata_string = large_list_metadata.to_string(); let expected_metadata_string = - "LargeList(nullable Int32, metadata = {\"key1\": \"value1\"})"; + "LargeList(nullable Int32, metadata: {\"key1\": \"value1\"})"; assert_eq!(large_list_metadata_string, expected_metadata_string); } @@ -223,7 +223,7 @@ mod tests { let fixed_size_named = DataType::FixedSizeList(Arc::new(Field::new("baz", DataType::UInt64, false)), 3); let fixed_size_named_string = fixed_size_named.to_string(); - let expected_named_string = "FixedSizeList(3 x UInt64, field = 'baz')"; + let expected_named_string = "FixedSizeList(3 x UInt64, field: 'baz')"; assert_eq!(fixed_size_named_string, expected_named_string); // Test with metadata @@ -233,7 +233,7 @@ mod tests { let fixed_size_metadata = DataType::FixedSizeList(Arc::new(field), 4); let fixed_size_metadata_string = fixed_size_metadata.to_string(); let expected_metadata_string = - "FixedSizeList(4 x nullable Int32, metadata = {\"key2\": \"value2\"})"; + "FixedSizeList(4 x nullable Int32, metadata: {\"key2\": \"value2\"})"; assert_eq!(fixed_size_metadata_string, expected_metadata_string); } From 0179d581827272417fee12ae423254177da7a1fe Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 18:29:09 +0200 Subject: [PATCH 12/27] Improve `Debug` format of `Field` --- arrow-schema/src/field.rs | 39 ++++++++++++++++++++++++++++++++++++-- arrow-schema/src/schema.rs | 12 +++++++----- 2 files changed, 44 insertions(+), 7 deletions(-) diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 3beae35795e4..856cad2429a3 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -44,7 +44,7 @@ pub type FieldRef = Arc; /// /// Arrow Extension types, are encoded in `Field`s metadata. See /// [`Self::try_extension_type`] to retrieve the [`ExtensionType`], if any. -#[derive(Debug, Clone)] +#[derive(Clone)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Field { name: String, @@ -860,13 +860,48 @@ impl Field { } } -// TODO: improve display with crate https://crates.io/crates/derive_more ? impl std::fmt::Display for Field { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { + // NOTE: `Display` and `Debug` formatting are ALWAYS the same, + // because we want BOTH to be easy to read AND reversible! write!(f, "{self:?}") } } +impl std::fmt::Debug for Field { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + #![expect(deprecated)] // Must still print dict_id, if set + let Self { + name, + data_type, + nullable, + dict_id, + dict_is_ordered, + metadata, + } = self; + let maybe_nullable = if *nullable { "nullable " } else { "" }; + let metadata_str = if metadata.is_empty() { + String::new() + } else { + format!(", metadata: {metadata:?}") + }; + let dict_id_str = if dict_id == &0 { + String::new() + } else { + format!(", dict_id: {dict_id}") + }; + let dict_is_ordered_str = if *dict_is_ordered { + ", dict_is_ordered" + } else { + "" + }; + write!( + f, + "Field {{ {name:?}: {maybe_nullable}{data_type}{dict_id_str}{dict_is_ordered_str}{metadata_str} }}" + ) + } +} + #[cfg(test)] mod test { use super::*; diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 94aa1afa7026..0acec4662415 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -697,11 +697,13 @@ mod tests { #[test] fn create_schema_string() { let schema = person_schema(); - assert_eq!(schema.to_string(), - "Field { name: \"first_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {\"k\": \"v\"} }, \ - Field { name: \"last_name\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"address\", data_type: Struct(street Utf8, zip UInt16), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"interests\", data_type: Dictionary(Int32, Utf8), nullable: true, dict_id: 123, dict_is_ordered: true, metadata: {} }") + assert_eq!( + schema.to_string(), + "Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \ + Field { \"last_name\": Utf8 }, \ + Field { \"address\": Struct(street Utf8, zip UInt16) }, \ + Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 123, dict_is_ordered }" + ) } #[test] From 2e9d645022bc29d2aa469e55fec902419e7de757 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 18:30:34 +0200 Subject: [PATCH 13/27] tidy-up --- arrow-schema/src/schema.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arrow-schema/src/schema.rs b/arrow-schema/src/schema.rs index 0acec4662415..e1f449e096a8 100644 --- a/arrow-schema/src/schema.rs +++ b/arrow-schema/src/schema.rs @@ -700,9 +700,9 @@ mod tests { assert_eq!( schema.to_string(), "Field { \"first_name\": Utf8, metadata: {\"k\": \"v\"} }, \ - Field { \"last_name\": Utf8 }, \ - Field { \"address\": Struct(street Utf8, zip UInt16) }, \ - Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 123, dict_is_ordered }" + Field { \"last_name\": Utf8 }, \ + Field { \"address\": Struct(street Utf8, zip UInt16) }, \ + Field { \"interests\": nullable Dictionary(Int32, Utf8), dict_id: 123, dict_is_ordered }" ) } From be144f4a9e1f4868391e3b6d36f5c0436ec86913 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 18:38:04 +0200 Subject: [PATCH 14/27] Make sure structs write the nullability of its fields --- arrow-schema/src/datatype_format.rs | 3 +-- arrow-schema/src/datatype_parse.rs | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/arrow-schema/src/datatype_format.rs b/arrow-schema/src/datatype_format.rs index 4a17fdc1d0b8..f93f3fa49f2a 100644 --- a/arrow-schema/src/datatype_format.rs +++ b/arrow-schema/src/datatype_format.rs @@ -106,8 +106,7 @@ impl fmt::Debug for DataType { .iter() .map(|field| { let name = field.name(); - // let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; - let maybe_nullable = ""; // TODO: add support for this in `fn parse_struct` + let maybe_nullable = if field.is_nullable() { "nullable " } else { "" }; let data_type = field.data_type(); let metadata_str = format_metadata(field.metadata()); format!("{name} {maybe_nullable}{data_type}{metadata_str}") diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index 7e71d53ccbdb..bba5e914aa3e 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -38,14 +38,14 @@ fn make_error_expected(val: &str, expected: &Token, actual: &Token) -> ArrowErro /// Implementation of `parse_data_type`, modeled after struct Parser<'a> { val: &'a str, - tokenizer: Tokenizer<'a>, + tokenizer: Peekable>, } impl<'a> Parser<'a> { fn new(val: &'a str) -> Self { Self { val, - tokenizer: Tokenizer::new(val), + tokenizer: Tokenizer::new(val).peekable(), } } @@ -345,8 +345,12 @@ impl<'a> Parser<'a> { )) } }; + let nullable = self + .tokenizer + .next_if(|next| matches!(next, Ok(Token::Nullable))) + .is_some(); let field_type = self.parse_next_type()?; - fields.push(Arc::new(Field::new(field_name, field_type, true))); + fields.push(Arc::new(Field::new(field_name, field_type, nullable))); match self.next_token()? { Token::Comma => continue, Token::RParen => break, @@ -551,7 +555,10 @@ impl<'a> Tokenizer<'a> { "Some" => Token::Some, "None" => Token::None, + "nullable" => Token::Nullable, + "Struct" => Token::Struct, + // If we don't recognize the word, treat it as a field name word => Token::FieldName(word.to_string()), }; @@ -618,6 +625,7 @@ enum Token { LargeList, FixedSizeList, Struct, + Nullable, FieldName(String), } @@ -649,6 +657,7 @@ impl Display for Token { Token::Integer(v) => write!(f, "Integer({v})"), Token::DoubleQuotedString(s) => write!(f, "DoubleQuotedString({s})"), Token::Struct => write!(f, "Struct"), + Token::Nullable => write!(f, "nullable"), Token::FieldName(s) => write!(f, "FieldName({s})"), } } From f27e349e0aa1f5fac6f3a5602d1bf82447ae1b5d Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 18:42:45 +0200 Subject: [PATCH 15/27] Update test --- arrow-array/src/builder/struct_builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 3afee5863f52..88f89f124f5c 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -62,7 +62,7 @@ use std::sync::Arc; /// /// // We can't obtain the ListBuilder with the expected generic types, because under the hood /// // the StructBuilder was returned as a Box and passed as such to the ListBuilder constructor -/// +/// /// // This panics in runtime, even though we know that the builder is a ListBuilder. /// // let sb = col_struct_builder /// // .field_builder::>(0) @@ -648,7 +648,7 @@ mod tests { #[test] #[should_panic( - expected = "StructBuilder (Schema { fields: [Field { name: \"f1\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"f2\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }) and field_builder with index 1 (Boolean) are of unequal lengths: (2 != 1)." + expected = "StructBuilder (Schema { fields: [Field { \"f1\": Int32 }, Field { \"f2\": Boolean }], metadata: {} }) and field_builder with index 1 (Boolean) are of unequal lengths: (2 != 1)." )] fn test_struct_array_builder_unequal_field_builders_lengths() { let mut int_builder = Int32Builder::with_capacity(10); From 71d9cb637c29cee1a670b115964707ec110de51c Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 19:15:00 +0200 Subject: [PATCH 16/27] Update some error reporting --- arrow-cast/src/cast/mod.rs | 20 ++++++++++++++------ arrow-json/src/lib.rs | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 4 ++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index e2bb3db85984..e20e8ceada05 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -8600,8 +8600,12 @@ mod tests { let new_array_result = cast(&array, &new_type.clone()); assert!(!can_cast_types(array.data_type(), &new_type)); - assert!( - matches!(new_array_result, Err(ArrowError::CastError(t)) if t == r#"Casting from Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) to Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, true) not supported"#) + let Err(ArrowError::CastError(t)) = new_array_result else { + panic!(); + }; + assert_eq!( + t, + r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Utf8) }, false) to Map(Field { "entries": Struct(key Utf8, value Utf8) }, true) not supported"# ); } @@ -8647,8 +8651,12 @@ mod tests { let new_array_result = cast(&array, &new_type.clone()); assert!(!can_cast_types(array.data_type(), &new_type)); - assert!( - matches!(new_array_result, Err(ArrowError::CastError(t)) if t == r#"Casting from Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Interval(DayTime), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, false) to Map(Field { name: "entries", data_type: Struct([Field { name: "key", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: "value", data_type: Duration(Second), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, true) not supported"#) + let Err(ArrowError::CastError(t)) = new_array_result else { + panic!(); + }; + assert_eq!( + t, + r#"Casting from Map(Field { "entries": Struct(key Utf8, value nullable Interval(DayTime)) }, false) to Map(Field { "entries": Struct(key Utf8, value Duration(Second)) }, true) not supported"# ); } @@ -10740,7 +10748,7 @@ mod tests { let to_type = DataType::Utf8; let result = cast(&struct_array, &to_type); assert_eq!( - r#"Cast error: Casting from Struct([Field { name: "a", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]) to Utf8 not supported"#, + r#"Cast error: Casting from Struct(a Boolean) to Utf8 not supported"#, result.unwrap_err().to_string() ); } @@ -10751,7 +10759,7 @@ mod tests { let to_type = DataType::Struct(vec![Field::new("a", DataType::Boolean, false)].into()); let result = cast(&array, &to_type); assert_eq!( - r#"Cast error: Casting from Utf8 to Struct([Field { name: "a", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]) not supported"#, + r#"Cast error: Casting from Utf8 to Struct(a Boolean) not supported"#, result.unwrap_err().to_string() ); } diff --git a/arrow-json/src/lib.rs b/arrow-json/src/lib.rs index 6d7ab4400b6e..5a5430fef973 100644 --- a/arrow-json/src/lib.rs +++ b/arrow-json/src/lib.rs @@ -87,7 +87,7 @@ use serde_json::{Number, Value}; /// /// This enum controls which form(s) the Reader will accept and which form the /// Writer will produce. For example, if the RecordBatch Schema is -/// `[("a", Int32), ("r", Struct([("b", Boolean), ("c", Utf8)]))]` +/// `[("a", Int32), ("r", Struct(b Boolean, c Utf8))]` /// then a Reader with [`StructMode::ObjectOnly`] would read rows of the form /// `{"a": 1, "r": {"b": true, "c": "cat"}}` while with ['StructMode::ListOnly'] /// would read rows of the form `[1, [true, "cat"]]`. A Writer would produce diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index 81765a800edd..d49e56715b1b 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -3677,8 +3677,8 @@ mod tests { ), ])), "Arrow: Incompatible supplied Arrow schema: data type mismatch for field nested: \ - requested Struct([Field { name: \"nested1_valid\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"nested1_invalid\", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]) \ - but found Struct([Field { name: \"nested1_valid\", data_type: Utf8, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, Field { name: \"nested1_invalid\", data_type: Int64, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }])", + requested Struct(nested1_valid Utf8, nested1_invalid Int32) \ + but found Struct(nested1_valid Utf8, nested1_invalid Int64)", ); } From 931c5f9e343999aa02a0e58bef0bb63591ee7934 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Sun, 7 Sep 2025 19:56:55 +0200 Subject: [PATCH 17/27] Add license header --- arrow-schema/src/datatype_format.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/arrow-schema/src/datatype_format.rs b/arrow-schema/src/datatype_format.rs index f93f3fa49f2a..67fcaff8280d 100644 --- a/arrow-schema/src/datatype_format.rs +++ b/arrow-schema/src/datatype_format.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use std::{collections::HashMap, fmt}; use crate::DataType; From 34d65c8d36079327e3fe5ac69b82af46a04e1733 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Thu, 11 Sep 2025 10:23:58 +0200 Subject: [PATCH 18/27] Rename datatype_format.rs to datatype_display.rs --- arrow-schema/src/{datatype_format.rs => datatype_display.rs} | 0 arrow-schema/src/lib.rs | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename arrow-schema/src/{datatype_format.rs => datatype_display.rs} (100%) diff --git a/arrow-schema/src/datatype_format.rs b/arrow-schema/src/datatype_display.rs similarity index 100% rename from arrow-schema/src/datatype_format.rs rename to arrow-schema/src/datatype_display.rs diff --git a/arrow-schema/src/lib.rs b/arrow-schema/src/lib.rs index 6571209cf70a..785f2f5516a7 100644 --- a/arrow-schema/src/lib.rs +++ b/arrow-schema/src/lib.rs @@ -28,7 +28,7 @@ mod datatype; pub use datatype::*; use std::fmt::Display; -mod datatype_format; +mod datatype_display; mod datatype_parse; mod error; pub use error::*; From 68b4eb499c9591446ce3b661ab8e4589935150c9 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Thu, 11 Sep 2025 10:29:38 +0200 Subject: [PATCH 19/27] Add comment motivating why we do what we do --- arrow-schema/src/datatype_display.rs | 16 ++++++++++++++++ arrow-schema/src/field.rs | 16 ++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/arrow-schema/src/datatype_display.rs b/arrow-schema/src/datatype_display.rs index 67fcaff8280d..8a0be147f36a 100644 --- a/arrow-schema/src/datatype_display.rs +++ b/arrow-schema/src/datatype_display.rs @@ -23,6 +23,22 @@ impl fmt::Display for DataType { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { // NOTE: `Display` and `Debug` formatting are ALWAYS the same, // because we want BOTH to be easy to read AND reversible! + // + // Many error messages in `datafusion`, `arrow`, and 3rd party crates mistakingly use the + // `Debug` formatting of `DataType` instead of `Display`, + // which means we end up with huge difficult-to-read error messages. + // + // There are three solutions to this, afaict: + // + // A) Use `Display`=`Debug`, like we do here + // + // B) Replace all uses of `{:?}` with `{}` when printing datatypes in datafusion, arrow, and other third party crates. + // This is VERY hard to do, as I know of no automated tool to find all these places. + // + // C) Improve `Debug` formatting by omitting empty/default fields. + // This will _help_, but the Debug format for `DataType::List` will still be very ugly, since it wraps a `Field`. + // If you want to go down that route, please make sure that the `Debug` formating of a List is still nice and readable 🙏 + write!(f, "{self:?}") } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 856cad2429a3..38d68fd9fec8 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -864,6 +864,22 @@ impl std::fmt::Display for Field { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { // NOTE: `Display` and `Debug` formatting are ALWAYS the same, // because we want BOTH to be easy to read AND reversible! + // + // Many error messages in `datafusion`, `arrow`, and 3rd party crates mistakingly use the + // `Debug` formatting of `DataType` and/or `Field` instead of `Display`, + // which means we end up with huge difficult-to-read error messages. + // + // There are three solutions to this, afaict: + // + // A) Use `Display`=`Debug`, like we do here + // + // B) Replace all uses of `{:?}` with `{}` when printing datatypes and fields in datafusion, arrow, and other third party crates. + // This is VERY hard to do, as I know of no automated tool to find all these places. + // + // C) Improve `Debug` formatting by omitting empty/default fields. + // This will _help_ a lot. + // If you want to go down that route, consider omitting the field name if it is `item`, + // or special-case `Debug` formatting of `Datatype::List` some other way 🙏 write!(f, "{self:?}") } } From 079e7bcb1c8b8ad037e7d319bc9a42f47e26b876 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 15 Sep 2025 10:58:01 +0200 Subject: [PATCH 20/27] Revert to auto-derived Debug impl --- arrow-schema/src/datatype.rs | 2 +- arrow-schema/src/datatype_display.rs | 32 ---------------------------- arrow-schema/src/field.rs | 26 +--------------------- 3 files changed, 2 insertions(+), 58 deletions(-) diff --git a/arrow-schema/src/datatype.rs b/arrow-schema/src/datatype.rs index 61f1d4c8fb70..32bce3347404 100644 --- a/arrow-schema/src/datatype.rs +++ b/arrow-schema/src/datatype.rs @@ -91,7 +91,7 @@ use crate::{ArrowError, Field, FieldRef, Fields, UnionFields}; /// /// [`Schema.fbs`]: https://github.com/apache/arrow/blob/main/format/Schema.fbs /// [the physical memory layout of Apache Arrow]: https://arrow.apache.org/docs/format/Columnar.html#physical-memory-layout -#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Clone, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub enum DataType { /// Null type diff --git a/arrow-schema/src/datatype_display.rs b/arrow-schema/src/datatype_display.rs index 8a0be147f36a..5294721688af 100644 --- a/arrow-schema/src/datatype_display.rs +++ b/arrow-schema/src/datatype_display.rs @@ -20,30 +20,6 @@ use std::{collections::HashMap, fmt}; use crate::DataType; impl fmt::Display for DataType { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - // NOTE: `Display` and `Debug` formatting are ALWAYS the same, - // because we want BOTH to be easy to read AND reversible! - // - // Many error messages in `datafusion`, `arrow`, and 3rd party crates mistakingly use the - // `Debug` formatting of `DataType` instead of `Display`, - // which means we end up with huge difficult-to-read error messages. - // - // There are three solutions to this, afaict: - // - // A) Use `Display`=`Debug`, like we do here - // - // B) Replace all uses of `{:?}` with `{}` when printing datatypes in datafusion, arrow, and other third party crates. - // This is VERY hard to do, as I know of no automated tool to find all these places. - // - // C) Improve `Debug` formatting by omitting empty/default fields. - // This will _help_, but the Debug format for `DataType::List` will still be very ugly, since it wraps a `Field`. - // If you want to go down that route, please make sure that the `Debug` formating of a List is still nice and readable 🙏 - - write!(f, "{self:?}") - } -} - -impl fmt::Debug for DataType { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn format_metadata(metadata: &HashMap) -> String { if metadata.is_empty() { @@ -268,12 +244,4 @@ mod tests { "FixedSizeList(4 x nullable Int32, metadata: {\"key2\": \"value2\"})"; assert_eq!(fixed_size_metadata_string, expected_metadata_string); } - - #[test] - fn test_debug_fmt() { - let list_data_type = DataType::List(Arc::new(Field::new_list_field(DataType::Int32, true))); - let list_data_type_string = format!("{list_data_type:?}"); - let expected_string = "List(nullable Int32)"; - assert_eq!(list_data_type_string, expected_string); - } } diff --git a/arrow-schema/src/field.rs b/arrow-schema/src/field.rs index 38d68fd9fec8..8017fa81b5ea 100644 --- a/arrow-schema/src/field.rs +++ b/arrow-schema/src/field.rs @@ -44,7 +44,7 @@ pub type FieldRef = Arc; /// /// Arrow Extension types, are encoded in `Field`s metadata. See /// [`Self::try_extension_type`] to retrieve the [`ExtensionType`], if any. -#[derive(Clone)] +#[derive(Clone, Debug)] #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] pub struct Field { name: String, @@ -861,30 +861,6 @@ impl Field { } impl std::fmt::Display for Field { - fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - // NOTE: `Display` and `Debug` formatting are ALWAYS the same, - // because we want BOTH to be easy to read AND reversible! - // - // Many error messages in `datafusion`, `arrow`, and 3rd party crates mistakingly use the - // `Debug` formatting of `DataType` and/or `Field` instead of `Display`, - // which means we end up with huge difficult-to-read error messages. - // - // There are three solutions to this, afaict: - // - // A) Use `Display`=`Debug`, like we do here - // - // B) Replace all uses of `{:?}` with `{}` when printing datatypes and fields in datafusion, arrow, and other third party crates. - // This is VERY hard to do, as I know of no automated tool to find all these places. - // - // C) Improve `Debug` formatting by omitting empty/default fields. - // This will _help_ a lot. - // If you want to go down that route, consider omitting the field name if it is `item`, - // or special-case `Debug` formatting of `Datatype::List` some other way 🙏 - write!(f, "{self:?}") - } -} - -impl std::fmt::Debug for Field { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { #![expect(deprecated)] // Must still print dict_id, if set let Self { From 2992b56f2b6d21abe9a315f8e35878b438e04732 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 15 Sep 2025 10:59:42 +0200 Subject: [PATCH 21/27] Use Display formatting of Field --- arrow-schema/src/datatype_display.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arrow-schema/src/datatype_display.rs b/arrow-schema/src/datatype_display.rs index 5294721688af..e1bd86cba08e 100644 --- a/arrow-schema/src/datatype_display.rs +++ b/arrow-schema/src/datatype_display.rs @@ -67,8 +67,8 @@ impl fmt::Display for DataType { Self::Utf8 => write!(f, "Utf8"), Self::LargeUtf8 => write!(f, "LargeUtf8"), Self::Utf8View => write!(f, "Utf8View"), - Self::ListView(field) => write!(f, "ListView({field:?})"), // TODO: make more readable - Self::LargeListView(field) => write!(f, "LargeListView({field:?})"), // TODO: make more readable + Self::ListView(field) => write!(f, "ListView({field})"), // TODO: make more readable + Self::LargeListView(field) => write!(f, "LargeListView({field})"), // TODO: make more readable Self::List(field) | Self::LargeList(field) => { let type_name = if matches!(self, Self::List(_)) { "List" @@ -131,15 +131,15 @@ impl fmt::Display for DataType { write!(f, "Union({union_fields:?}, {union_mode:?})") } Self::Dictionary(data_type, data_type1) => { - write!(f, "Dictionary({data_type:?}, {data_type1:?})") + write!(f, "Dictionary({data_type}, {data_type1:?})") } Self::Decimal32(precision, scale) => write!(f, "Decimal32({precision:?}, {scale:?})"), Self::Decimal64(precision, scale) => write!(f, "Decimal64({precision:?}, {scale:?})"), Self::Decimal128(precision, scale) => write!(f, "Decimal128({precision:?}, {scale:?})"), Self::Decimal256(precision, scale) => write!(f, "Decimal256({precision:?}, {scale:?})"), - Self::Map(field, keys_are_sorted) => write!(f, "Map({field:?}, {keys_are_sorted:?})"), + Self::Map(field, keys_are_sorted) => write!(f, "Map({field}, {keys_are_sorted:?})"), Self::RunEndEncoded(run_ends_field, values_field) => { - write!(f, "RunEndEncoded({run_ends_field:?}, {values_field:?})") + write!(f, "RunEndEncoded({run_ends_field}, {values_field})") } } } From 44b5e0297321d17edb08dc765fdd560a48d3d3dd Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 15 Sep 2025 11:01:01 +0200 Subject: [PATCH 22/27] Less Debug-formatting of DataType in error messages --- .../src/array/fixed_size_list_array.rs | 2 +- arrow-array/src/array/primitive_array.rs | 6 ++-- arrow-array/src/builder/mod.rs | 2 +- arrow-array/src/ffi.rs | 22 ++++++------- arrow-array/src/record_batch.rs | 4 +-- arrow-cast/src/cast/dictionary.rs | 4 +-- arrow-cast/src/cast/mod.rs | 32 +++++++++---------- arrow-csv/src/reader/mod.rs | 2 +- arrow-data/src/transform/run.rs | 4 +-- arrow-row/src/list.rs | 2 +- arrow-schema/src/datatype_parse.rs | 4 +-- parquet/benches/arrow_reader_row_filter.rs | 2 +- parquet/src/arrow/arrow_reader/mod.rs | 4 +-- parquet/src/arrow/arrow_writer/mod.rs | 2 +- parquet/src/arrow/buffer/view_buffer.rs | 2 +- 15 files changed, 47 insertions(+), 47 deletions(-) diff --git a/arrow-array/src/array/fixed_size_list_array.rs b/arrow-array/src/array/fixed_size_list_array.rs index 4a338591e5aa..12147087107c 100644 --- a/arrow-array/src/array/fixed_size_list_array.rs +++ b/arrow-array/src/array/fixed_size_list_array.rs @@ -350,7 +350,7 @@ impl From for FixedSizeListArray { let value_length = match data.data_type() { DataType::FixedSizeList(_, len) => *len, data_type => { - panic!("FixedSizeListArray data should contain a FixedSizeList data type, got {data_type:?}") + panic!("FixedSizeListArray data should contain a FixedSizeList data type, got {data_type}") } }; diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 42594e7a129d..d23f4131521a 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1290,7 +1290,7 @@ impl std::fmt::Debug for PrimitiveArray { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { let data_type = self.data_type(); - write!(f, "PrimitiveArray<{data_type:?}>\n[\n")?; + write!(f, "PrimitiveArray<{data_type}>\n[\n")?; print_long_array(self, f, |array, index, f| match data_type { DataType::Date32 | DataType::Date64 => { let v = self.value(index).to_i64().unwrap(); @@ -1299,7 +1299,7 @@ impl std::fmt::Debug for PrimitiveArray { None => { write!( f, - "Cast error: Failed to convert {v} to temporal for {data_type:?}" + "Cast error: Failed to convert {v} to temporal for {data_type}" ) } } @@ -1311,7 +1311,7 @@ impl std::fmt::Debug for PrimitiveArray { None => { write!( f, - "Cast error: Failed to convert {v} to temporal for {data_type:?}" + "Cast error: Failed to convert {v} to temporal for {data_type}" ) } } diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index ea9c98f9b60e..75170cea172e 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -604,7 +604,7 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box dict_builder!(Int32Type), DataType::Int64 => dict_builder!(Int64Type), _ => { - panic!("Data type {t:?} with key type {key_type:?} is not currently supported") + panic!("Data type {t:?} with key type {key_type} is not currently supported") } } } diff --git a/arrow-array/src/ffi.rs b/arrow-array/src/ffi.rs index 83eaa3d6544a..218f729434dd 100644 --- a/arrow-array/src/ffi.rs +++ b/arrow-array/src/ffi.rs @@ -146,11 +146,11 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { if let Some(primitive) = data_type.primitive_width() { return match i { 0 => Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented." ))), 1 => Ok(primitive * 8), i => Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))), }; } @@ -159,7 +159,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Boolean, 1) => 1, (DataType::Boolean, _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } (DataType::FixedSizeBinary(num_bytes), 1) => *num_bytes as usize * u8::BITS as usize, @@ -169,7 +169,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { }, (DataType::FixedSizeBinary(_), _) | (DataType::FixedSizeList(_, _), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) }, // Variable-size list and map have one i32 buffer. @@ -179,12 +179,12 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Utf8, 2) | (DataType::Binary, 2) => u8::BITS as _, (DataType::List(_), _) | (DataType::Map(_, _), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" expects 2 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } (DataType::Utf8, _) | (DataType::Binary, _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } // Variable-sized binaries: have two buffers. @@ -193,7 +193,7 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::LargeUtf8, 2) | (DataType::LargeBinary, 2) | (DataType::LargeList(_), 2)=> u8::BITS as _, (DataType::LargeUtf8, _) | (DataType::LargeBinary, _) | (DataType::LargeList(_), _)=> { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" expects 3 buffers, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } // Variable-sized views: have 3 or more buffers. @@ -209,24 +209,24 @@ fn bit_width(data_type: &DataType, i: usize) -> Result { (DataType::Union(_, UnionMode::Dense), 1) => i32::BITS as _, (DataType::Union(_, UnionMode::Sparse), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" expects 1 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" expects 1 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } (DataType::Union(_, UnionMode::Dense), _) => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" expects 2 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" expects 2 buffer, but requested {i}. Please verify that the C data interface is correctly implemented." ))) } (_, 0) => { // We don't call this `bit_width` to compute buffer length for null buffer. If any types that don't have null buffer like // UnionArray, they should be handled above. return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented." + "The datatype \"{data_type}\" doesn't expect buffer at index 0. Please verify that the C data interface is correctly implemented." ))) } _ => { return Err(ArrowError::CDataInterface(format!( - "The datatype \"{data_type:?}\" is still not supported in Rust implementation" + "The datatype \"{data_type}\" is still not supported in Rust implementation" ))) } }) diff --git a/arrow-array/src/record_batch.rs b/arrow-array/src/record_batch.rs index c1023b739081..aeeafe5dd9fb 100644 --- a/arrow-array/src/record_batch.rs +++ b/arrow-array/src/record_batch.rs @@ -360,7 +360,7 @@ impl RecordBatch { if let Some((i, (col_type, field_type))) = not_match { return Err(ArrowError::InvalidArgumentError(format!( - "column types must match schema types, expected {field_type:?} but found {col_type:?} at column index {i}"))); + "column types must match schema types, expected {field_type} but found {col_type} at column index {i}"))); } Ok(RecordBatch { @@ -422,7 +422,7 @@ impl RecordBatch { /// // Insert a key-value pair into the metadata /// batch.schema_metadata_mut().insert("key".into(), "value".into()); /// assert_eq!(batch.schema().metadata().get("key"), Some(&String::from("value"))); - /// ``` + /// ``` pub fn schema_metadata_mut(&mut self) -> &mut std::collections::HashMap { let schema = Arc::make_mut(&mut self.schema); &mut schema.metadata diff --git a/arrow-cast/src/cast/dictionary.rs b/arrow-cast/src/cast/dictionary.rs index 43a67a7d9a2d..c213ac266228 100644 --- a/arrow-cast/src/cast/dictionary.rs +++ b/arrow-cast/src/cast/dictionary.rs @@ -78,7 +78,7 @@ pub(crate) fn dictionary_cast( UInt64 => Arc::new(DictionaryArray::::from(data)), _ => { return Err(ArrowError::CastError(format!( - "Unsupported type {to_index_type:?} for dictionary index" + "Unsupported type {to_index_type} for dictionary index" ))); } }; @@ -313,7 +313,7 @@ pub(crate) fn cast_to_dictionary( pack_byte_to_fixed_size_dictionary::(array, cast_options, byte_size) } _ => Err(ArrowError::CastError(format!( - "Unsupported output type for dictionary packing: {dict_value_type:?}" + "Unsupported output type for dictionary packing: {dict_value_type}" ))), } } diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 36e0ee8fb678..89a9f0057f8c 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -798,7 +798,7 @@ pub fn cast_with_options( UInt32 => dictionary_cast::(array, to_type, cast_options), UInt64 => dictionary_cast::(array, to_type, cast_options), _ => Err(ArrowError::CastError(format!( - "Casting from dictionary type {from_type:?} to {to_type:?} not supported", + "Casting from dictionary type {from_type} to {to_type} not supported", ))), }, (_, Dictionary(index_type, value_type)) => match **index_type { @@ -811,7 +811,7 @@ pub fn cast_with_options( UInt32 => cast_to_dictionary::(array, value_type, cast_options), UInt64 => cast_to_dictionary::(array, value_type, cast_options), _ => Err(ArrowError::CastError(format!( - "Casting from type {from_type:?} to dictionary type {to_type:?} not supported", + "Casting from type {from_type} to dictionary type {to_type} not supported", ))), }, (List(_), List(to)) => cast_list_values::(array, to, cast_options), @@ -1143,10 +1143,10 @@ pub fn cast_with_options( Ok(Arc::new(array) as ArrayRef) } (Struct(_), _) => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported" + "Casting from {from_type} to {to_type} not supported" ))), (_, Struct(_)) => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported" + "Casting from {from_type} to {to_type} not supported" ))), (_, Boolean) => match from_type { UInt8 => cast_numeric_to_bool::(array), @@ -1164,7 +1164,7 @@ pub fn cast_with_options( Utf8 => cast_utf8_to_boolean::(array, cast_options), LargeUtf8 => cast_utf8_to_boolean::(array, cast_options), _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), }, (Boolean, _) => match to_type { @@ -1183,7 +1183,7 @@ pub fn cast_with_options( Utf8 => value_to_string::(array, cast_options), LargeUtf8 => value_to_string::(array, cast_options), _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), }, (Utf8, _) => match to_type { @@ -1245,7 +1245,7 @@ pub fn cast_with_options( cast_string_to_month_day_nano_interval::(array, cast_options) } _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), }, (Utf8View, _) => match to_type { @@ -1296,7 +1296,7 @@ pub fn cast_with_options( cast_view_to_month_day_nano_interval(array, cast_options) } _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), }, (LargeUtf8, _) => match to_type { @@ -1362,7 +1362,7 @@ pub fn cast_with_options( cast_string_to_month_day_nano_interval::(array, cast_options) } _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), }, (Binary, _) => match to_type { @@ -1380,7 +1380,7 @@ pub fn cast_with_options( cast_binary_to_string::(array, cast_options)?.as_string::(), ))), _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), }, (LargeBinary, _) => match to_type { @@ -1399,7 +1399,7 @@ pub fn cast_with_options( Ok(Arc::new(StringViewArray::from(array.as_string::()))) } _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), }, (FixedSizeBinary(size), _) => match to_type { @@ -1407,7 +1407,7 @@ pub fn cast_with_options( LargeBinary => cast_fixed_size_binary_to_binary::(array, *size), BinaryView => cast_fixed_size_binary_to_binary_view(array, *size), _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), }, (BinaryView, Binary) => cast_view_to_byte::>(array), @@ -1426,7 +1426,7 @@ pub fn cast_with_options( Ok(Arc::new(array.as_binary_view().clone().to_string_view()?) as ArrayRef) } (BinaryView, _) => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), (from_type, Utf8View) if from_type.is_primitive() => { value_to_string_view(array, cast_options) @@ -2160,7 +2160,7 @@ pub fn cast_with_options( cast_reinterpret_arrays::(array) } (_, _) => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported", + "Casting from {from_type} to {to_type} not supported", ))), } } @@ -2201,7 +2201,7 @@ where LargeUtf8 => value_to_string::(array, cast_options), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported" + "Casting from {from_type} to {to_type} not supported" ))), } } @@ -2304,7 +2304,7 @@ where LargeUtf8 => cast_string_to_decimal::(array, *precision, *scale, cast_options), Null => Ok(new_null_array(to_type, array.len())), _ => Err(ArrowError::CastError(format!( - "Casting from {from_type:?} to {to_type:?} not supported" + "Casting from {from_type} to {to_type} not supported" ))), } } diff --git a/arrow-csv/src/reader/mod.rs b/arrow-csv/src/reader/mod.rs index 7b69df51b541..d1fc4eb350fd 100644 --- a/arrow-csv/src/reader/mod.rs +++ b/arrow-csv/src/reader/mod.rs @@ -860,7 +860,7 @@ fn parse( .collect::>(), ) as ArrayRef), _ => Err(ArrowError::ParseError(format!( - "Unsupported dictionary key type {key_type:?}" + "Unsupported dictionary key type {key_type}" ))), } } diff --git a/arrow-data/src/transform/run.rs b/arrow-data/src/transform/run.rs index f962a5009845..af0b9e640c22 100644 --- a/arrow-data/src/transform/run.rs +++ b/arrow-data/src/transform/run.rs @@ -75,7 +75,7 @@ pub fn extend_nulls(mutable: &mut _MutableArrayData, len: usize) { DataType::Int16 => extend_nulls_impl!(i16), DataType::Int32 => extend_nulls_impl!(i32), DataType::Int64 => extend_nulls_impl!(i64), - _ => panic!("Invalid run end type for RunEndEncoded array: {run_end_type:?}"), + _ => panic!("Invalid run end type for RunEndEncoded array: {run_end_type}"), }; mutable.child_data[0].data.len += 1; @@ -225,7 +225,7 @@ pub fn build_extend(array: &ArrayData) -> Extend<'_> { DataType::Int16 => build_and_process_impl!(i16), DataType::Int32 => build_and_process_impl!(i32), DataType::Int64 => build_and_process_impl!(i64), - _ => panic!("Invalid run end type for RunEndEncoded array: {dest_run_end_type:?}",), + _ => panic!("Invalid run end type for RunEndEncoded array: {dest_run_end_type}",), } }, ) diff --git a/arrow-row/src/list.rs b/arrow-row/src/list.rs index 72d93d2f4bbe..43b4e3b4f266 100644 --- a/arrow-row/src/list.rs +++ b/arrow-row/src/list.rs @@ -278,7 +278,7 @@ pub unsafe fn decode_fixed_size_list( DataType::FixedSizeList(element_field, _) => element_field.data_type(), _ => { return Err(ArrowError::InvalidArgumentError(format!( - "Expected FixedSizeListArray, found: {list_type:?}", + "Expected FixedSizeListArray, found: {list_type}", ))) } }; diff --git a/arrow-schema/src/datatype_parse.rs b/arrow-schema/src/datatype_parse.rs index bba5e914aa3e..8b48ecd17f63 100644 --- a/arrow-schema/src/datatype_parse.rs +++ b/arrow-schema/src/datatype_parse.rs @@ -679,7 +679,7 @@ mod test { /// verifying it is the same fn round_trip(data_type: DataType) { let data_type_string = data_type.to_string(); - println!("Input '{data_type_string}' ({data_type:?})"); + println!("Input '{data_type_string}' ({data_type})"); let parsed_type = parse_data_type(&data_type_string).unwrap(); assert_eq!( data_type, parsed_type, @@ -826,7 +826,7 @@ mod test { ]; for (data_type_string, expected_data_type) in cases { - println!("Parsing '{data_type_string}', expecting '{expected_data_type:?}'"); + println!("Parsing '{data_type_string}', expecting '{expected_data_type}'"); let parsed_data_type = parse_data_type(data_type_string).unwrap(); assert_eq!(parsed_data_type, expected_data_type); } diff --git a/parquet/benches/arrow_reader_row_filter.rs b/parquet/benches/arrow_reader_row_filter.rs index 0ef40ac8237c..ec403f8fd39c 100644 --- a/parquet/benches/arrow_reader_row_filter.rs +++ b/parquet/benches/arrow_reader_row_filter.rs @@ -461,7 +461,7 @@ fn benchmark_filters_and_projections(c: &mut Criterion) { let projection_mask = ProjectionMask::roots(schema_descr, output_projection.clone()); let pred_mask = ProjectionMask::roots(schema_descr, filter_col.clone()); - let benchmark_name = format!("{filter_type:?}/{proj_case}",); + let benchmark_name = format!("{filter_type}/{proj_case}",); // run the benchmark for the async reader let bench_id = BenchmarkId::new(benchmark_name.clone(), "async"); diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index d49e56715b1b..abdc32b10906 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -3185,7 +3185,7 @@ mod tests { "Parquet argument error: Parquet error: encountered non UTF-8 data"; assert!( err.to_string().contains(expected_err), - "data type: {data_type:?}, expected: {expected_err}, got: {err}" + "data type: {data_type}, expected: {expected_err}, got: {err}" ); } } @@ -3224,7 +3224,7 @@ mod tests { "Parquet argument error: Parquet error: encountered non UTF-8 data"; assert!( err.to_string().contains(expected_err), - "data type: {data_type:?}, expected: {expected_err}, got: {err}" + "data type: {data_type}, expected: {expected_err}, got: {err}" ); } } diff --git a/parquet/src/arrow/arrow_writer/mod.rs b/parquet/src/arrow/arrow_writer/mod.rs index 864c1bf2da45..53600235d33b 100644 --- a/parquet/src/arrow/arrow_writer/mod.rs +++ b/parquet/src/arrow/arrow_writer/mod.rs @@ -1080,7 +1080,7 @@ impl ArrowColumnWriterFactory { } _ => return Err(ParquetError::NYI( format!( - "Attempting to write an Arrow type {data_type:?} to parquet that is not yet implemented" + "Attempting to write an Arrow type {data_type} to parquet that is not yet implemented" ) )) } diff --git a/parquet/src/arrow/buffer/view_buffer.rs b/parquet/src/arrow/buffer/view_buffer.rs index 97db778e47aa..9e9b8616c3eb 100644 --- a/parquet/src/arrow/buffer/view_buffer.rs +++ b/parquet/src/arrow/buffer/view_buffer.rs @@ -91,7 +91,7 @@ impl ViewBuffer { let array = unsafe { builder.build_unchecked() }; make_array(array) } - _ => panic!("Unsupported data type: {data_type:?}"), + _ => panic!("Unsupported data type: {data_type}"), } } } From 58c9c2c2f85a9c0a9008974054cfc1ff38420a5b Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 15 Sep 2025 11:02:15 +0200 Subject: [PATCH 23/27] Use Display formatting of Field in most error messages --- arrow-integration-test/src/lib.rs | 6 +++--- arrow/src/util/data_gen.rs | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index 177a1c47f31f..ceed51afc848 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -794,13 +794,13 @@ pub fn array_from_json( DataType::Dictionary(key_type, value_type) => { #[allow(deprecated)] let dict_id = field.dict_id().ok_or_else(|| { - ArrowError::JsonError(format!("Unable to find dict_id for field {field:?}")) + ArrowError::JsonError(format!("Unable to find dict_id for field {field}")) })?; // find dictionary let dictionary = dictionaries .ok_or_else(|| { ArrowError::JsonError(format!( - "Unable to find any dictionaries for field {field:?}" + "Unable to find any dictionaries for field {field}" )) })? .get(&dict_id); @@ -814,7 +814,7 @@ pub fn array_from_json( dictionaries, ), None => Err(ArrowError::JsonError(format!( - "Unable to find dictionary for field {field:?}" + "Unable to find dictionary for field {field}" ))), } } diff --git a/arrow/src/util/data_gen.rs b/arrow/src/util/data_gen.rs index 7ea05811d55b..70af62e6b40d 100644 --- a/arrow/src/util/data_gen.rs +++ b/arrow/src/util/data_gen.rs @@ -267,7 +267,7 @@ fn create_random_decimal_array(field: &Field, size: usize, null_density: f32) -> )) } _ => Err(ArrowError::InvalidArgumentError(format!( - "Cannot create decimal array for field {field:?}" + "Cannot create decimal array for field {field}" ))), } } @@ -298,7 +298,7 @@ fn create_random_list_array( } _ => { return Err(ArrowError::InvalidArgumentError(format!( - "Cannot create list array for field {field:?}" + "Cannot create list array for field {field}" ))) } }; @@ -336,7 +336,7 @@ fn create_random_struct_array( DataType::Struct(fields) => fields, _ => { return Err(ArrowError::InvalidArgumentError(format!( - "Cannot create struct array for field {field:?}" + "Cannot create struct array for field {field}" ))) } }; From 5cb977ad879c6b2a9d264827dcbcf6abcb938f76 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 15 Sep 2025 11:12:13 +0200 Subject: [PATCH 24/27] Less Debug --- arrow-array/src/array/mod.rs | 6 +++--- parquet-variant-compute/src/arrow_to_variant.rs | 4 ++-- parquet-variant-compute/src/variant_array.rs | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/array/mod.rs b/arrow-array/src/array/mod.rs index 5fdfb9fb2244..b5ba32745a71 100644 --- a/arrow-array/src/array/mod.rs +++ b/arrow-array/src/array/mod.rs @@ -824,20 +824,20 @@ pub fn make_array(data: ArrayData) -> ArrayRef { DataType::UInt16 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, DataType::UInt32 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, DataType::UInt64 => Arc::new(DictionaryArray::::from(data)) as ArrayRef, - dt => panic!("Unexpected dictionary key type {dt:?}"), + dt => panic!("Unexpected dictionary key type {dt}"), }, DataType::RunEndEncoded(ref run_ends_type, _) => match run_ends_type.data_type() { DataType::Int16 => Arc::new(RunArray::::from(data)) as ArrayRef, DataType::Int32 => Arc::new(RunArray::::from(data)) as ArrayRef, DataType::Int64 => Arc::new(RunArray::::from(data)) as ArrayRef, - dt => panic!("Unexpected data type for run_ends array {dt:?}"), + dt => panic!("Unexpected data type for run_ends array {dt}"), }, DataType::Null => Arc::new(NullArray::from(data)) as ArrayRef, DataType::Decimal32(_, _) => Arc::new(Decimal32Array::from(data)) as ArrayRef, DataType::Decimal64(_, _) => Arc::new(Decimal64Array::from(data)) as ArrayRef, DataType::Decimal128(_, _) => Arc::new(Decimal128Array::from(data)) as ArrayRef, DataType::Decimal256(_, _) => Arc::new(Decimal256Array::from(data)) as ArrayRef, - dt => panic!("Unexpected data type {dt:?}"), + dt => panic!("Unexpected data type {dt}"), } } diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index 26713ce8ee19..ad8958b7db70 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -261,14 +261,14 @@ pub(crate) fn make_arrow_to_variant_row_builder<'a>( } _ => { return Err(ArrowError::CastError(format!( - "Unsupported run ends type: {:?}", + "Unsupported run ends type: {}", run_ends.data_type() ))); } }, dt => { return Err(ArrowError::CastError(format!( - "Unsupported data type for casting to Variant: {dt:?}", + "Unsupported data type for casting to Variant: {dt}", ))); } }; diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index 17b0adbdd086..b6bd022c9d5d 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -631,7 +631,7 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' // https://github.com/apache/arrow-rs/issues/8091 debug_assert!( false, - "Unsupported typed_value type: {:?}", + "Unsupported typed_value type: {}", typed_value.data_type() ); Variant::Null From 941286fa16f8ffc3c81dc6c3cc36b1351f9e98b6 Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 15 Sep 2025 11:19:17 +0200 Subject: [PATCH 25/27] Update test/panic output --- arrow-array/src/builder/struct_builder.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arrow-array/src/builder/struct_builder.rs b/arrow-array/src/builder/struct_builder.rs index 88f89f124f5c..d5109ec192a2 100644 --- a/arrow-array/src/builder/struct_builder.rs +++ b/arrow-array/src/builder/struct_builder.rs @@ -267,7 +267,7 @@ impl StructBuilder { let schema = builder.finish(); panic!("{}", format!( - "StructBuilder ({:?}) and field_builder with index {} ({:?}) are of unequal lengths: ({} != {}).", + "StructBuilder ({}) and field_builder with index {} ({}) are of unequal lengths: ({} != {}).", schema, idx, self.fields[idx].data_type(), @@ -648,7 +648,7 @@ mod tests { #[test] #[should_panic( - expected = "StructBuilder (Schema { fields: [Field { \"f1\": Int32 }, Field { \"f2\": Boolean }], metadata: {} }) and field_builder with index 1 (Boolean) are of unequal lengths: (2 != 1)." + expected = "StructBuilder (Field { \"f1\": Int32 }, Field { \"f2\": Boolean }) and field_builder with index 1 (Boolean) are of unequal lengths: (2 != 1)." )] fn test_struct_array_builder_unequal_field_builders_lengths() { let mut int_builder = Int32Builder::with_capacity(10); From 2cbd723eeb9d6bfa180242ff6d80f59a4d9e799f Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 15 Sep 2025 11:35:34 +0200 Subject: [PATCH 26/27] A few more places --- arrow-array/src/builder/mod.rs | 8 ++++---- arrow-integration-test/src/lib.rs | 2 +- arrow-ord/src/sort.rs | 2 +- parquet/src/basic.rs | 4 +++- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/builder/mod.rs b/arrow-array/src/builder/mod.rs index 75170cea172e..91e29957fc67 100644 --- a/arrow-array/src/builder/mod.rs +++ b/arrow-array/src/builder/mod.rs @@ -567,7 +567,7 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box panic!("The field of Map data type {t:?} should have a child Struct field"), + t => panic!("The field of Map data type {t} should have a child Struct field"), }, DataType::Struct(fields) => Box::new(StructBuilder::from_fields(fields.clone(), capacity)), t @ DataType::Dictionary(key_type, value_type) => { @@ -594,7 +594,7 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box panic!("Dictionary value type {t:?} is not currently supported"), + t => panic!("Dictionary value type {t} is not currently supported"), } }; } @@ -604,10 +604,10 @@ pub fn make_builder(datatype: &DataType, capacity: usize) -> Box dict_builder!(Int32Type), DataType::Int64 => dict_builder!(Int64Type), _ => { - panic!("Data type {t:?} with key type {key_type} is not currently supported") + panic!("Data type {t} with key type {key_type} is not currently supported") } } } - t => panic!("Data type {t:?} is not currently supported"), + t => panic!("Data type {t} is not currently supported"), } } diff --git a/arrow-integration-test/src/lib.rs b/arrow-integration-test/src/lib.rs index ceed51afc848..1f4c4bd4bdda 100644 --- a/arrow-integration-test/src/lib.rs +++ b/arrow-integration-test/src/lib.rs @@ -946,7 +946,7 @@ pub fn array_from_json( Ok(Arc::new(array)) } t => Err(ArrowError::JsonError(format!( - "data type {t:?} not supported" + "data type {t} not supported" ))), } } diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs index 797c2246738c..ad2dada0616f 100644 --- a/arrow-ord/src/sort.rs +++ b/arrow-ord/src/sort.rs @@ -304,7 +304,7 @@ pub fn sort_to_indices( }, t => { return Err(ArrowError::ComputeError(format!( - "Sort not supported for data type {t:?}" + "Sort not supported for data type {t}" ))); } }) diff --git a/parquet/src/basic.rs b/parquet/src/basic.rs index c1e301136d0e..2cf5e46fea5a 100644 --- a/parquet/src/basic.rs +++ b/parquet/src/basic.rs @@ -941,7 +941,9 @@ impl From> for ConvertedType { (16, false) => ConvertedType::UINT_16, (32, false) => ConvertedType::UINT_32, (64, false) => ConvertedType::UINT_64, - t => panic!("Integer type {t:?} is not supported"), + (bit_width, is_signed) => panic!( + "Integer type bit_width={bit_width}, signed={is_signed} is not supported" + ), }, LogicalType::Json => ConvertedType::JSON, LogicalType::Bson => ConvertedType::BSON, From 513676a55d8793c00bfa42e23f9ac60b9debea6c Mon Sep 17 00:00:00 2001 From: Emil Ernerfeldt Date: Mon, 15 Sep 2025 13:04:01 +0200 Subject: [PATCH 27/27] Another place --- parquet/src/arrow/arrow_reader/mod.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parquet/src/arrow/arrow_reader/mod.rs b/parquet/src/arrow/arrow_reader/mod.rs index abdc32b10906..e6195032b64d 100644 --- a/parquet/src/arrow/arrow_reader/mod.rs +++ b/parquet/src/arrow/arrow_reader/mod.rs @@ -638,7 +638,7 @@ impl ArrowReaderMetadata { for (field1, field2) in field_iter { if field1.data_type() != field2.data_type() { errors.push(format!( - "data type mismatch for field {}: requested {:?} but found {:?}", + "data type mismatch for field {}: requested {} but found {}", field1.name(), field1.data_type(), field2.data_type()