diff --git a/parquet-variant-compute/benches/variant_kernels.rs b/parquet-variant-compute/benches/variant_kernels.rs index 5e97f948b231..3cdb28229b8a 100644 --- a/parquet-variant-compute/benches/variant_kernels.rs +++ b/parquet-variant-compute/benches/variant_kernels.rs @@ -84,7 +84,7 @@ fn benchmark_batch_json_string_to_variant(c: &mut Criterion) { pub fn variant_get_bench(c: &mut Criterion) { let variant_array = create_primitive_variant_array(8192); - let input: ArrayRef = Arc::new(variant_array); + let input = ArrayRef::from(variant_array); let options = GetOptions { path: vec![].into(), diff --git a/parquet-variant-compute/src/lib.rs b/parquet-variant-compute/src/lib.rs index b0d4c5ac3d3f..496d550d95b1 100644 --- a/parquet-variant-compute/src/lib.rs +++ b/parquet-variant-compute/src/lib.rs @@ -46,7 +46,7 @@ mod variant_array_builder; pub mod variant_get; mod variant_to_arrow; -pub use variant_array::{ShreddingState, VariantArray}; +pub use variant_array::{ShreddingState, VariantArray, VariantType}; pub use variant_array_builder::{VariantArrayBuilder, VariantValueArrayBuilder}; pub use cast_to_variant::{cast_to_variant, cast_to_variant_with_options}; diff --git a/parquet-variant-compute/src/shred_variant.rs b/parquet-variant-compute/src/shred_variant.rs index aea36266e8c0..138209802ab4 100644 --- a/parquet-variant-compute/src/shred_variant.rs +++ b/parquet-variant-compute/src/shred_variant.rs @@ -22,7 +22,7 @@ use crate::variant_to_arrow::{ make_primitive_variant_to_arrow_row_builder, PrimitiveVariantToArrowRowBuilder, }; use crate::{VariantArray, VariantValueArrayBuilder}; -use arrow::array::{Array as _, ArrayRef, BinaryViewArray, NullBufferBuilder}; +use arrow::array::{ArrayRef, BinaryViewArray, NullBufferBuilder}; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; use arrow::datatypes::{DataType, Fields}; @@ -310,7 +310,7 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { let (value, typed_value, nulls) = typed_value_builder.finish()?; 
let array = ShreddedVariantFieldArray::from_parts(Some(value), Some(typed_value), nulls); - builder = builder.with_field(field_name, Arc::new(array), false); + builder = builder.with_field(field_name, ArrayRef::from(array), false); } if let Some(nulls) = self.typed_value_nulls.finish() { builder = builder.with_nulls(nulls); @@ -327,7 +327,7 @@ impl<'a> VariantToShreddedObjectVariantRowBuilder<'a> { mod tests { use super::*; use crate::VariantArrayBuilder; - use arrow::array::{Float64Array, Int64Array}; + use arrow::array::{Array, Float64Array, Int64Array}; use arrow::datatypes::{DataType, Field, Fields}; use parquet_variant::{Variant, VariantBuilder, VariantBuilderExt as _}; use std::sync::Arc; @@ -556,18 +556,11 @@ mod tests { .unwrap(); // Extract score and age fields from typed_value struct - let score_field = typed_value - .column_by_name("score") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); - let age_field = typed_value - .column_by_name("age") - .unwrap() - .as_any() - .downcast_ref::() - .unwrap(); + let score_field = + ShreddedVariantFieldArray::try_new(typed_value.column_by_name("score").unwrap()) + .unwrap(); + let age_field = + ShreddedVariantFieldArray::try_new(typed_value.column_by_name("age").unwrap()).unwrap(); let score_value = score_field .value_field() diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index a0983063cf0c..ed4b6fe37e47 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -18,36 +18,191 @@ //! 
[`VariantArray`] implementation use crate::type_conversion::primitive_conversion_single_value; -use arrow::array::{Array, ArrayData, ArrayRef, AsArray, BinaryViewArray, StructArray}; +use arrow::array::{Array, ArrayRef, AsArray, BinaryViewArray, StructArray}; use arrow::buffer::NullBuffer; +use arrow::compute::cast; use arrow::datatypes::{ Date32Type, Float16Type, Float32Type, Float64Type, Int16Type, Int32Type, Int64Type, Int8Type, UInt16Type, UInt32Type, UInt64Type, UInt8Type, }; +use arrow_schema::extension::ExtensionType; use arrow_schema::{ArrowError, DataType, Field, FieldRef, Fields}; use parquet_variant::Uuid; use parquet_variant::Variant; -use std::any::Any; use std::sync::Arc; +/// Arrow Variant [`ExtensionType`]. +/// +/// Represents the canonical Arrow Extension Type for storing variants. +/// See [`VariantArray`] for more examples of using this extension type. +pub struct VariantType; + +impl ExtensionType for VariantType { + const NAME: &'static str = "arrow.parquet.variant"; + + // Variants extension metadata is an empty string + // + type Metadata = &'static str; + + fn metadata(&self) -> &Self::Metadata { + &"" + } + + fn serialize_metadata(&self) -> Option { + Some(String::new()) + } + + fn deserialize_metadata(_metadata: Option<&str>) -> Result { + Ok("") + } + + fn supports_data_type(&self, data_type: &DataType) -> Result<(), ArrowError> { + if matches!(data_type, DataType::Struct(_)) { + Ok(()) + } else { + Err(ArrowError::InvalidArgumentError(format!( + "VariantType only supports StructArray, got {data_type}" + ))) + } + } + + fn try_new(data_type: &DataType, _metadata: Self::Metadata) -> Result { + Self.supports_data_type(data_type)?; + Ok(Self) + } +} + /// An array of Parquet [`Variant`] values /// /// A [`VariantArray`] wraps an Arrow [`StructArray`] that stores the underlying /// `metadata` and `value` fields, and adds convenience methods to access -/// the `Variant`s +/// the [`Variant`]s. 
/// -/// See [`VariantArrayBuilder`] for constructing a `VariantArray`. +/// See [`VariantArrayBuilder`] for constructing `VariantArray` row by row. +/// +/// See the examples below for converting between `VariantArray` and +/// `StructArray`. /// /// [`VariantArrayBuilder`]: crate::VariantArrayBuilder /// -/// # Specification +/// # Documentation /// -/// 1. This code follows the conventions for storing variants in Arrow `StructArray` -/// defined by [Extension Type for Parquet Variant arrow] and this [document]. -/// At the time of this writing, this is not yet a standardized Arrow extension type. +/// At the time of this writing, Variant has been accepted as an official +/// extension type but has not been published to the [official list of extension +/// types] on the Apache Arrow website. See the [Extension Type for Parquet +/// Variant arrow] ticket for more details. /// /// [Extension Type for Parquet Variant arrow]: https://github.com/apache/arrow/issues/46908 -/// [document]: https://docs.google.com/document/d/1pw0AWoMQY3SjD7R4LgbPvMjG_xSCtXp3rZHkVp9jpZ4/edit?usp=sharing +/// [official list of extension types]: https://arrow.apache.org/docs/format/CanonicalExtensions.html +/// +/// # Example: Check if a [`StructArray`] has the [`VariantType`] extension +/// +/// Arrow Arrays only provide [`DataType`], but the extension type information +/// is stored on a [`Field`]. Thus, you must have access to the [`Schema`] or +/// [`Field`] to check for the extension type. 
+/// +/// [`Schema`]: arrow_schema::Schema +/// ``` +/// # use arrow::array::StructArray; +/// # use arrow_schema::{Schema, Field, DataType}; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType}; +/// # fn get_variant_array() -> VariantArray { +/// # let mut builder = VariantArrayBuilder::new(10); +/// # builder.append_variant(Variant::from("such wow")); +/// # builder.build() +/// # } +/// # fn get_schema() -> Schema { +/// # Schema::new(vec![ +/// # Field::new("id", DataType::Int32, false), +/// # get_variant_array().field("var"), +/// # ]) +/// # } +/// let schema = get_schema(); +/// assert_eq!(schema.fields().len(), 2); +/// // first field is not a Variant +/// assert!(schema.field(0).try_extension_type::().is_err()); +/// // second field is a Variant +/// assert!(schema.field(1).try_extension_type::().is_ok()); +/// ``` +/// +/// # Example: Constructing the correct [`Field`] for a [`VariantArray`] +/// +/// You can construct the correct [`Field`] for a [`VariantArray`] using the +/// [`VariantArray::field`] method. 
+/// +/// ``` +/// # use arrow_schema::{Schema, Field, DataType}; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType}; +/// # fn get_variant_array() -> VariantArray { +/// # let mut builder = VariantArrayBuilder::new(10); +/// # builder.append_variant(Variant::from("such wow")); +/// # builder.build() +/// # } +/// let variant_array = get_variant_array(); +/// // First field is an integer id, second field is a variant +/// let schema = Schema::new(vec![ +/// Field::new("id", DataType::Int32, false), +/// // call VariantArray::field to get the correct Field +/// variant_array.field("var"), +/// ]); +/// ``` +/// +/// You can also construct the [`Field`] using [`VariantType`] directly +/// +/// ``` +/// # use arrow_schema::{Schema, Field, DataType}; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray, VariantType}; +/// # fn get_variant_array() -> VariantArray { +/// # let mut builder = VariantArrayBuilder::new(10); +/// # builder.append_variant(Variant::from("such wow")); +/// # builder.build() +/// # } +/// # let variant_array = get_variant_array(); +/// // The DataType of a VariantArray varies depending on how it is shredded +/// let data_type = variant_array.data_type().clone(); +/// // First field is an integer id, second field is a variant +/// let schema = Schema::new(vec![ +/// Field::new("id", DataType::Int32, false), +/// Field::new("var", data_type, false) +/// // Add extension metadata to the field using `VariantType` +/// .with_extension_type(VariantType), +/// ]); +/// ``` +/// +/// # Example: Converting a [`VariantArray`] to a [`StructArray`] +/// +/// ``` +/// # use arrow::array::StructArray; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::VariantArrayBuilder; +/// // Create Variant Array +/// let mut builder = VariantArrayBuilder::new(10); +/// builder.append_variant(Variant::from("such wow")); +/// 
let variant_array = builder.build(); +/// // convert to StructArray +/// let struct_array: StructArray = variant_array.into(); +/// ``` +/// +/// # Example: Converting a [`StructArray`] to a [`VariantArray`] +/// +/// ``` +/// # use arrow::array::StructArray; +/// # use parquet_variant::Variant; +/// # use parquet_variant_compute::{VariantArrayBuilder, VariantArray}; +/// # fn get_struct_array() -> StructArray { +/// # let mut builder = VariantArrayBuilder::new(10); +/// # builder.append_variant(Variant::from("such wow")); +/// # builder.build().into() +/// # } +/// let struct_array: StructArray = get_struct_array(); +/// // try and create a VariantArray from it +/// let variant_array = VariantArray::try_new(&struct_array).unwrap(); +/// assert_eq!(variant_array.value(0), Variant::from("such wow")); +/// ``` +/// #[derive(Clone, Debug)] pub struct VariantArray { /// Reference to the underlying StructArray @@ -88,7 +243,11 @@ impl VariantArray { /// int8. /// /// Currently, only [`BinaryViewArray`] are supported. 
- pub fn try_new(inner: ArrayRef) -> Result { + pub fn try_new(inner: &dyn Array) -> Result { + // Workaround lack of support for Binary + // https://github.com/apache/arrow-rs/issues/8387 + let inner = cast_to_binary_view_arrays(inner)?; + let Some(inner) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( "Invalid VariantArray: requires StructArray as input".to_string(), @@ -242,6 +401,67 @@ impl VariantArray { pub fn typed_value_field(&self) -> Option<&ArrayRef> { self.shredding_state.typed_value_field() } + + /// Return a field to represent this VariantArray in a `Schema` with + /// a particular name + pub fn field(&self, name: impl Into) -> Field { + Field::new( + name.into(), + self.data_type().clone(), + self.inner.is_nullable(), + ) + .with_extension_type(VariantType) + } + + /// Returns a new DataType representing this VariantArray's inner type + pub fn data_type(&self) -> &DataType { + self.inner.data_type() + } + + pub fn slice(&self, offset: usize, length: usize) -> Self { + let inner = self.inner.slice(offset, length); + let metadata = self.metadata.slice(offset, length); + let shredding_state = self.shredding_state.slice(offset, length); + Self { + inner, + metadata, + shredding_state, + } + } + + pub fn len(&self) -> usize { + self.inner.len() + } + + pub fn is_empty(&self) -> bool { + self.inner.is_empty() + } + + pub fn nulls(&self) -> Option<&NullBuffer> { + self.inner.nulls() + } + + /// Is the element at index null? + pub fn is_null(&self, index: usize) -> bool { + self.nulls().is_some_and(|n| n.is_null(index)) + } + + /// Is the element at index valid (not null)? 
+ pub fn is_valid(&self, index: usize) -> bool { + !self.is_null(index) + } +} + +impl From for StructArray { + fn from(variant_array: VariantArray) -> Self { + variant_array.into_inner() + } +} + +impl From for ArrayRef { + fn from(variant_array: VariantArray) -> Self { + Arc::new(variant_array.into_inner()) + } } /// One shredded field of a partially or prefectly shredded variant. For example, suppose the @@ -307,23 +527,17 @@ impl ShreddedVariantFieldArray { /// or be a list, large_list, list_view or struct /// /// Currently, only `value` columns of type [`BinaryViewArray`] are supported. - pub fn try_new(inner: ArrayRef) -> Result { + pub fn try_new(inner: &dyn Array) -> Result { let Some(inner_struct) = inner.as_struct_opt() else { return Err(ArrowError::InvalidArgumentError( "Invalid ShreddedVariantFieldArray: requires StructArray as input".to_string(), )); }; - // Extract value and typed_value fields (metadata is not expected in ShreddedVariantFieldArray) - let value = inner_struct - .column_by_name("value") - .and_then(|col| col.as_binary_view_opt().cloned()); - let typed_value = inner_struct.column_by_name("typed_value").cloned(); - // Note this clone is cheap, it just bumps the ref count Ok(Self { inner: inner_struct.clone(), - shredding_state: ShreddingState::new(value, typed_value), + shredding_state: ShreddingState::from(inner_struct), }) } @@ -368,59 +582,54 @@ impl ShreddedVariantFieldArray { shredding_state: ShreddingState::new(value, typed_value), } } -} -impl Array for ShreddedVariantFieldArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn to_data(&self) -> ArrayData { - self.inner.to_data() - } - - fn into_data(self) -> ArrayData { - self.inner.into_data() + /// Returns the inner [`StructArray`], consuming self + pub fn into_inner(self) -> StructArray { + self.inner } - fn data_type(&self) -> &DataType { + pub fn data_type(&self) -> &DataType { self.inner.data_type() } - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - let 
inner = self.inner.slice(offset, length); - let shredding_state = self.shredding_state.slice(offset, length); - Arc::new(Self { - inner, - shredding_state, - }) - } - - fn len(&self) -> usize { + pub fn len(&self) -> usize { self.inner.len() } - fn is_empty(&self) -> bool { + pub fn is_empty(&self) -> bool { self.inner.is_empty() } - fn offset(&self) -> usize { + pub fn offset(&self) -> usize { self.inner.offset() } - fn nulls(&self) -> Option<&NullBuffer> { + pub fn nulls(&self) -> Option<&NullBuffer> { // According to the shredding spec, ShreddedVariantFieldArray should be // physically non-nullable - SQL NULL is inferred by both value and // typed_value being physically NULL None } + /// Is the element at index null? + pub fn is_null(&self, index: usize) -> bool { + self.nulls().is_some_and(|n| n.is_null(index)) + } - fn get_buffer_memory_size(&self) -> usize { - self.inner.get_buffer_memory_size() + /// Is the element at index valid (not null)? + pub fn is_valid(&self, index: usize) -> bool { + !self.is_null(index) } +} - fn get_array_memory_size(&self) -> usize { - self.inner.get_array_memory_size() +impl From for ArrayRef { + fn from(array: ShreddedVariantFieldArray) -> Self { + Arc::new(array.into_inner()) + } +} + +impl From for StructArray { + fn from(array: ShreddedVariantFieldArray) -> Self { + array.into_inner() } } @@ -434,7 +643,7 @@ impl Array for ShreddedVariantFieldArray { /// single value. 
Values in the two fields must be interpreted according to the /// following table (see [Parquet Variant Shredding Spec] for more details): /// -/// | value | typed_value | Meaning | +/// | value | typed_value | Meaning | /// |----------|--------------|---------| /// | null | null | The value is missing; only valid for shredded object fields | /// | non-null | null | The value is present and may be any type, including `null` | @@ -473,7 +682,20 @@ pub enum ShreddingState { } impl ShreddingState { - /// Create a new `ShreddingState` from the given fields + /// try to create a new `ShreddingState` from the given `value` and `typed_value` fields + /// + /// Note you can create a `ShreddingState` from a &[`StructArray`] using + /// `ShreddingState::try_from(&struct_array)`, for example: + /// + /// ```no_run + /// # use arrow::array::StructArray; + /// # use parquet_variant_compute::ShreddingState; + /// # fn get_struct_array() -> StructArray { + /// # unimplemented!() + /// # } + /// let struct_array: StructArray = get_struct_array(); + /// let shredding_state = ShreddingState::try_from(&struct_array).unwrap(); + /// ``` pub fn new(value: Option, typed_value: Option) -> Self { match (value, typed_value) { (Some(value), Some(typed_value)) => Self::PartiallyShredded { value, typed_value }, @@ -523,6 +745,17 @@ impl ShreddingState { } } +impl From<&StructArray> for ShreddingState { + fn from(inner_struct: &StructArray) -> Self { + let value = inner_struct + .column_by_name("value") + .and_then(|col| col.as_binary_view_opt().cloned()); + let typed_value = inner_struct.column_by_name("typed_value").cloned(); + + ShreddingState::new(value, typed_value) + } +} + /// Builds struct arrays from component fields /// /// TODO: move to arrow crate @@ -647,70 +880,52 @@ fn typed_value_to_variant(typed_value: &ArrayRef, index: usize) -> Variant<'_, ' } } -impl Array for VariantArray { - fn as_any(&self) -> &dyn Any { - self - } - - fn to_data(&self) -> ArrayData { - 
self.inner.to_data() - } - - fn into_data(self) -> ArrayData { - self.inner.into_data() - } - - fn data_type(&self) -> &DataType { - self.inner.data_type() - } - - fn slice(&self, offset: usize, length: usize) -> ArrayRef { - let inner = self.inner.slice(offset, length); - let metadata = self.metadata.slice(offset, length); - let shredding_state = self.shredding_state.slice(offset, length); - Arc::new(Self { - inner, - metadata, - shredding_state, - }) - } - - fn len(&self) -> usize { - self.inner.len() - } - - fn is_empty(&self) -> bool { - self.inner.is_empty() - } - - fn offset(&self) -> usize { - self.inner.offset() - } - - fn nulls(&self) -> Option<&NullBuffer> { - self.inner.nulls() - } +/// Workaround for lack of direct support for BinaryArray +/// +/// +/// The values are read as +/// * `StructArray` +/// +/// but VariantArray needs them as +/// * `StructArray` +/// +/// So cast them to get the right type. +fn cast_to_binary_view_arrays(array: &dyn Array) -> Result { + let new_type = rewrite_to_view_types(array.data_type()); + cast(array, &new_type) +} - fn get_buffer_memory_size(&self) -> usize { - self.inner.get_buffer_memory_size() +/// replaces all instances of Binary with BinaryView in a DataType +fn rewrite_to_view_types(data_type: &DataType) -> DataType { + match data_type { + DataType::Binary => DataType::BinaryView, + DataType::List(field) => DataType::List(rewrite_field_type(field)), + DataType::Struct(fields) => { + DataType::Struct(fields.iter().map(rewrite_field_type).collect()) + } + _ => data_type.clone(), } +} - fn get_array_memory_size(&self) -> usize { - self.inner.get_array_memory_size() - } +fn rewrite_field_type(field: impl AsRef) -> Arc { + let field = field.as_ref(); + let new_field = field + .clone() + .with_data_type(rewrite_to_view_types(field.data_type())); + Arc::new(new_field) } #[cfg(test)] mod test { use super::*; - use arrow::array::{BinaryArray, BinaryViewArray}; + use arrow::array::{BinaryViewArray, Int32Array}; use 
arrow_schema::{Field, Fields}; #[test] fn invalid_not_a_struct_array() { let array = make_binary_view_array(); // Should fail because the input is not a StructArray - let err = VariantArray::try_new(array); + let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), "Invalid argument error: Invalid VariantArray: requires StructArray as input" @@ -722,7 +937,7 @@ mod test { let fields = Fields::from(vec![Field::new("value", DataType::BinaryView, true)]); let array = StructArray::new(fields, vec![make_binary_view_array()], None); // Should fail because the StructArray does not contain a 'metadata' field - let err = VariantArray::try_new(Arc::new(array)); + let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), "Invalid argument error: Invalid VariantArray: StructArray must contain a 'metadata' field" @@ -737,7 +952,7 @@ mod test { // NOTE: By strict spec interpretation, this case (top-level variant with null/null) // should be invalid, but we currently allow it and treat it as Variant::Null. // This is a pragmatic decision to handle missing data gracefully. 
- let variant_array = VariantArray::try_new(Arc::new(array)).unwrap(); + let variant_array = VariantArray::try_new(&array).unwrap(); // Verify the shredding state is AllNull assert!(matches!( @@ -756,18 +971,18 @@ mod test { #[test] fn invalid_metadata_field_type() { let fields = Fields::from(vec![ - Field::new("metadata", DataType::Binary, true), // Not yet supported + Field::new("metadata", DataType::Int32, true), // not supported Field::new("value", DataType::BinaryView, true), ]); let array = StructArray::new( fields, - vec![make_binary_array(), make_binary_view_array()], + vec![make_int32_array(), make_binary_view_array()], None, ); - let err = VariantArray::try_new(Arc::new(array)); + let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Binary" + "Not yet implemented: VariantArray 'metadata' field must be BinaryView, got Int32" ); } @@ -775,17 +990,17 @@ mod test { fn invalid_value_field_type() { let fields = Fields::from(vec![ Field::new("metadata", DataType::BinaryView, true), - Field::new("value", DataType::Binary, true), // Not yet supported + Field::new("value", DataType::Int32, true), // Not yet supported ]); let array = StructArray::new( fields, - vec![make_binary_view_array(), make_binary_array()], + vec![make_binary_view_array(), make_int32_array()], None, ); - let err = VariantArray::try_new(Arc::new(array)); + let err = VariantArray::try_new(&array); assert_eq!( err.unwrap_err().to_string(), - "Not yet implemented: VariantArray 'value' field must be BinaryView, got Binary" + "Not yet implemented: VariantArray 'value' field must be BinaryView, got Int32" ); } @@ -793,8 +1008,8 @@ mod test { Arc::new(BinaryViewArray::from(vec![b"test" as &[u8]])) } - fn make_binary_array() -> ArrayRef { - Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) + fn make_int32_array() -> ArrayRef { + Arc::new(Int32Array::from(vec![1])) } #[test] @@ -814,7 +1029,7 @@ 
mod test { let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls)); - let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + let variant_array = VariantArray::try_new(&struct_array).unwrap(); // Verify the shredding state is AllNull assert!(matches!( @@ -864,7 +1079,7 @@ mod test { None, // struct itself is not null, just the value field is all null ); - let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + let variant_array = VariantArray::try_new(&struct_array).unwrap(); // This should be Unshredded, not AllNull, because value field exists in schema assert!(matches!( diff --git a/parquet-variant-compute/src/variant_array_builder.rs b/parquet-variant-compute/src/variant_array_builder.rs index 6451e3565802..68c1fd6b5492 100644 --- a/parquet-variant-compute/src/variant_array_builder.rs +++ b/parquet-variant-compute/src/variant_array_builder.rs @@ -133,7 +133,7 @@ impl VariantArrayBuilder { ); // TODO add arrow extension type metadata - VariantArray::try_new(Arc::new(inner)).expect("valid VariantArray by construction") + VariantArray::try_new(&inner).expect("valid VariantArray by construction") } /// Appends a null row to the builder. 
diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs index ffcd968bc661..ef602e84f1bf 100644 --- a/parquet-variant-compute/src/variant_get.rs +++ b/parquet-variant-compute/src/variant_get.rs @@ -23,15 +23,16 @@ use arrow::{ use arrow_schema::{ArrowError, DataType, FieldRef}; use parquet_variant::{VariantPath, VariantPathElement}; -use crate::variant_array::{ShreddedVariantFieldArray, ShreddingState}; +use crate::variant_array::ShreddingState; use crate::variant_to_arrow::make_variant_to_arrow_row_builder; use crate::VariantArray; +use arrow::array::AsArray; use std::sync::Arc; -pub(crate) enum ShreddedPathStep<'a> { +pub(crate) enum ShreddedPathStep { /// Path step succeeded, return the new shredding state - Success(&'a ShreddingState), + Success(ShreddingState), /// The path element is not present in the `typed_value` column and there is no `value` column, /// so we we know it does not exist. It, and all paths under it, are all-NULL. Missing, @@ -46,11 +47,11 @@ pub(crate) enum ShreddedPathStep<'a> { /// level, or if `typed_value` is not a struct, or if the requested field name does not exist. /// /// TODO: Support `VariantPathElement::Index`? It wouldn't be easy, and maybe not even possible. -pub(crate) fn follow_shredded_path_element<'a>( - shredding_state: &'a ShreddingState, +pub(crate) fn follow_shredded_path_element( + shredding_state: &ShreddingState, path_element: &VariantPathElement<'_>, cast_options: &CastOptions, -) -> Result> { +) -> Result { // If the requested path element is not present in `typed_value`, and `value` is missing, then // we know it does not exist; it, and all paths under it, are all-NULL. let missing_path_step = || { @@ -87,20 +88,17 @@ pub(crate) fn follow_shredded_path_element<'a>( return Ok(missing_path_step()); }; - let field = field - .as_any() - .downcast_ref::() - .ok_or_else(|| { - // TODO: Should we blow up? 
Or just end the traversal and let the normal - // variant pathing code sort out the mess that it must anyway be - // prepared to handle? - ArrowError::InvalidArgumentError(format!( - "Expected a ShreddedVariantFieldArray, got {:?} instead", - field.data_type(), - )) - })?; - - Ok(ShreddedPathStep::Success(field.shredding_state())) + let struct_array = field.as_struct_opt().ok_or_else(|| { + // TODO: Should we blow up? Or just end the traversal and let the normal + // variant pathing code sort out the mess that it must anyway be + // prepared to handle? + ArrowError::InvalidArgumentError(format!( + "Expected Struct array while following path, got {}", + field.data_type(), + )) + })?; + + Ok(ShreddedPathStep::Success(struct_array.into())) } VariantPathElement::Index { .. } => { // TODO: Support array indexing. Among other things, it will require slicing not @@ -154,11 +152,11 @@ fn shredded_get_path( // Peel away the prefix of path elements that traverses the shredded parts of this variant // column. Shredding will traverse the rest of the path on a per-row basis. - let mut shredding_state = input.shredding_state(); + let mut shredding_state = input.shredding_state().clone(); let mut accumulated_nulls = input.inner().nulls().cloned(); let mut path_index = 0; for path_element in path { - match follow_shredded_path_element(shredding_state, path_element, cast_options)? { + match follow_shredded_path_element(&shredding_state, path_element, cast_options)? { ShreddedPathStep::Success(state) => { // Union nulls from the typed_value we just accessed if let Some(typed_value) = shredding_state.typed_value_field() { @@ -199,7 +197,7 @@ fn shredded_get_path( // If our caller did not request any specific type, we can just return whatever we landed on. let Some(as_field) = as_field else { - return Ok(Arc::new(target)); + return Ok(ArrayRef::from(target)); }; // Structs are special. 
Recurse into each field separately, hoping to follow the shredding even @@ -242,11 +240,7 @@ fn shredded_get_path( /// quickly become annoying (and inefficient) to call `variant_get` for each leaf value in a struct or /// list and then try to assemble the results. pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { - let variant_array: &VariantArray = input.as_any().downcast_ref().ok_or_else(|| { - ArrowError::InvalidArgumentError( - "expected a VariantArray as the input for variant_get".to_owned(), - ) - })?; + let variant_array = VariantArray::try_new(input)?; let GetOptions { as_type, @@ -254,7 +248,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { cast_options, } = options; - shredded_get_path(variant_array, &path, as_type.as_deref(), &cast_options) + shredded_get_path(&variant_array, &path, as_type.as_deref(), &cast_options) } /// Controls the action of the variant_get kernel. @@ -303,9 +297,9 @@ mod test { use std::sync::Arc; use arrow::array::{ - Array, ArrayRef, BinaryViewArray, Date32Array, Float16Array, Float32Array, Float64Array, - Int16Array, Int32Array, Int64Array, Int8Array, StringArray, StructArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, + Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, FixedSizeBinaryArray, + Float16Array, Float32Array, Float64Array, Int16Array, Int32Array, Int64Array, Int8Array, + StringArray, StructArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, }; use arrow::buffer::NullBuffer; use arrow::compute::CastOptions; @@ -322,8 +316,7 @@ mod test { fn single_variant_get_test(input_json: &str, path: VariantPath, expected_json: &str) { // Create input array from JSON string let input_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(input_json)])); - let input_variant_array_ref: ArrayRef = - Arc::new(json_to_variant(&input_array_ref).unwrap()); + let input_variant_array_ref = ArrayRef::from(json_to_variant(&input_array_ref).unwrap()); let 
result = variant_get(&input_variant_array_ref, GetOptions::new_with_path(path)).unwrap(); @@ -332,7 +325,7 @@ mod test { let expected_array_ref: ArrayRef = Arc::new(StringArray::from(vec![Some(expected_json)])); let expected_variant_array = json_to_variant(&expected_array_ref).unwrap(); - let result_array: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result_array = VariantArray::try_new(&result).unwrap(); assert_eq!( result_array.len(), 1, @@ -408,7 +401,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -487,7 +480,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -504,7 +497,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -521,7 +514,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -538,7 +531,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + 
let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -558,7 +551,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 4); // Expect the values are the same as the original values @@ -613,7 +606,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 3); // Expect the values are the same as the original values @@ -695,7 +688,7 @@ mod test { let result = variant_get(&array, options).unwrap(); // expect the result is a VariantArray - let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result = VariantArray::try_new(&result).unwrap(); assert_eq!(result.len(), 3); // All values should be null @@ -815,10 +808,9 @@ mod test { .with_field("typed_value", Arc::new(typed_value), true) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)) - .expect("should create variant array"), - ) + VariantArray::try_new(&struct_array) + .expect("should create variant array") + .into() } }; } @@ -946,10 +938,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)) - .expect("should create variant array"), - ) + Arc::new(struct_array) } }; } @@ -1037,7 +1026,7 @@ mod test { None, // row 3 is shredded, so no value ]); - let typed_value = arrow::array::BooleanArray::from(vec![ + let typed_value = BooleanArray::from(vec![ Some(true), // row 0 is shredded, so it has a value None, // row 1 is null, so no value None, // row 2 is a string, so no typed value @@ -1051,9 +1040,7 @@ mod test { .with_nulls(nulls) .build(); - 
Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents a partially "shredded" variant for fixed size binary @@ -1097,7 +1084,7 @@ mod test { false, // row 2 is string true, // row 3 has value ]); - let typed_value = arrow::array::FixedSizeBinaryArray::try_new( + let typed_value = FixedSizeBinaryArray::try_new( 3, // byte width arrow::buffer::Buffer::from(data), Some(typed_value_nulls), @@ -1111,9 +1098,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents a partially "shredded" variant for UTF8 @@ -1158,9 +1143,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents a partially "shredded" variant for Date32 @@ -1205,9 +1188,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents a partially "shredded" variant for BinaryView @@ -1252,9 +1233,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// Return a VariantArray that represents an "all null" variant @@ -1289,9 +1268,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new( - VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), - ) + Arc::new(struct_array) } /// This test manually constructs a shredded variant array representing objects /// like {"x": 1, "y": "foo"} and {"x": 42} and tests extracting the "x" field @@ -1304,7 +1281,7 @@ mod test { let options = 
GetOptions::new_with_path(VariantPath::from("x")); let result = variant_get(&array, options).unwrap(); - let result_variant: &VariantArray = result.as_any().downcast_ref().unwrap(); + let result_variant = VariantArray::try_new(&result).unwrap(); assert_eq!(result_variant.len(), 2); // Row 0: expect x=1 @@ -1381,7 +1358,7 @@ mod test { .build(); // Wrap the x field struct in a ShreddedVariantFieldArray - let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + let x_field_shredded = ShreddedVariantFieldArray::try_new(&x_field_struct) .expect("should create ShreddedVariantFieldArray"); // Create the main typed_value as a struct containing the "x" field @@ -1392,7 +1369,7 @@ mod test { )]); let typed_value_struct = StructArray::try_new( typed_value_fields, - vec![Arc::new(x_field_shredded)], + vec![ArrayRef::from(x_field_shredded)], None, // No nulls - both rows have the object structure ) .unwrap(); @@ -1404,7 +1381,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_struct), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(main_struct)).expect("should create variant array")) + Arc::new(main_struct) } /// Simple test to check if nested paths are supported by current implementation @@ -1647,7 +1624,7 @@ mod test { } } - Arc::new(builder.build()) + ArrayRef::from(builder.build()) } /// Create test data for depth 1 (single nested field) @@ -1677,7 +1654,7 @@ mod test { } } - Arc::new(builder.build()) + ArrayRef::from(builder.build()) } /// Create test data for depth 2 (double nested field) @@ -1718,7 +1695,7 @@ mod test { } } - Arc::new(builder.build()) + ArrayRef::from(builder.build()) } /// Create simple shredded test data for depth 0 using a simplified working pattern @@ -1760,7 +1737,7 @@ mod test { .with_field("typed_value", Arc::new(x_field_typed_value), true) .build(); - let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + let x_field_shredded = 
ShreddedVariantFieldArray::try_new(&x_field_struct) .expect("should create ShreddedVariantFieldArray"); // Create the main typed_value as a struct containing the "x" field @@ -1769,9 +1746,12 @@ mod test { x_field_shredded.data_type().clone(), true, )]); - let typed_value_struct = - StructArray::try_new(typed_value_fields, vec![Arc::new(x_field_shredded)], None) - .unwrap(); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![ArrayRef::from(x_field_shredded)], + None, + ) + .unwrap(); // Build final VariantArray let struct_array = StructArrayBuilder::new() @@ -1780,7 +1760,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_struct), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } /// Create working depth 1 shredded test data based on the existing working pattern @@ -1838,7 +1818,7 @@ mod test { let x_field_struct = StructArrayBuilder::new() .with_field("typed_value", Arc::new(x_typed_value), true) .build(); - let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + let x_field_shredded = ShreddedVariantFieldArray::try_new(&x_field_struct) .expect("should create ShreddedVariantFieldArray for x"); // Level 1: a field containing x field + value field for fallbacks @@ -1866,14 +1846,18 @@ mod test { .with_field( "typed_value", Arc::new( - StructArray::try_new(a_inner_fields, vec![Arc::new(x_field_shredded)], None) - .unwrap(), + StructArray::try_new( + a_inner_fields, + vec![ArrayRef::from(x_field_shredded)], + None, + ) + .unwrap(), ), true, ) .with_field("value", Arc::new(a_value_array), true) .build(); - let a_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(a_inner_struct)) + let a_field_shredded = ShreddedVariantFieldArray::try_new(&a_inner_struct) .expect("should create ShreddedVariantFieldArray for a"); // Level 0: main typed_value struct containing a field @@ -1882,9 +1866,12 @@ mod test { 
a_field_shredded.data_type().clone(), true, )]); - let typed_value_struct = - StructArray::try_new(typed_value_fields, vec![Arc::new(a_field_shredded)], None) - .unwrap(); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![ArrayRef::from(a_field_shredded)], + None, + ) + .unwrap(); // Build final VariantArray let struct_array = StructArrayBuilder::new() @@ -1893,7 +1880,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_struct), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } /// Create working depth 2 shredded test data for "a.b.x" paths @@ -1944,7 +1931,7 @@ mod test { let x_field_struct = StructArrayBuilder::new() .with_field("typed_value", Arc::new(x_typed_value), true) .build(); - let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + let x_field_shredded = ShreddedVariantFieldArray::try_new(&x_field_struct) .expect("should create ShreddedVariantFieldArray for x"); // Level 2: b field containing x field + value field @@ -1970,14 +1957,18 @@ mod test { .with_field( "typed_value", Arc::new( - StructArray::try_new(b_inner_fields, vec![Arc::new(x_field_shredded)], None) - .unwrap(), + StructArray::try_new( + b_inner_fields, + vec![ArrayRef::from(x_field_shredded)], + None, + ) + .unwrap(), ), true, ) .with_field("value", Arc::new(b_value_array), true) .build(); - let b_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(b_inner_struct)) + let b_field_shredded = ShreddedVariantFieldArray::try_new(&b_inner_struct) .expect("should create ShreddedVariantFieldArray for b"); // Level 1: a field containing b field + value field @@ -2003,14 +1994,18 @@ mod test { .with_field( "typed_value", Arc::new( - StructArray::try_new(a_inner_fields, vec![Arc::new(b_field_shredded)], None) - .unwrap(), + StructArray::try_new( + a_inner_fields, + vec![ArrayRef::from(b_field_shredded)], + None, + ) + .unwrap(), ), 
true, ) .with_field("value", Arc::new(a_value_array), true) .build(); - let a_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(a_inner_struct)) + let a_field_shredded = ShreddedVariantFieldArray::try_new(&a_inner_struct) .expect("should create ShreddedVariantFieldArray for a"); // Level 0: main typed_value struct containing a field @@ -2019,9 +2014,12 @@ mod test { a_field_shredded.data_type().clone(), true, )]); - let typed_value_struct = - StructArray::try_new(typed_value_fields, vec![Arc::new(a_field_shredded)], None) - .unwrap(); + let typed_value_struct = StructArray::try_new( + typed_value_fields, + vec![ArrayRef::from(a_field_shredded)], + None, + ) + .unwrap(); // Build final VariantArray let struct_array = StructArrayBuilder::new() @@ -2030,7 +2028,7 @@ mod test { .with_field("typed_value", Arc::new(typed_value_struct), true) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } #[test] @@ -2051,7 +2049,7 @@ mod test { cast_options: CastOptions::default(), // safe = true }; - let variant_array_ref: Arc = variant_array.clone(); + let variant_array_ref: Arc = variant_array.clone(); let result = variant_get(&variant_array_ref, safe_options); // Should succeed and return NULLs (safe behavior) assert!(result.is_ok()); @@ -2108,7 +2106,7 @@ mod test { cast_options: CastOptions::default(), }; - let variant_array_ref: Arc = variant_array.clone(); + let variant_array_ref: Arc = variant_array.clone(); let result = variant_get(&variant_array_ref, options).unwrap(); // Verify the result length matches input @@ -2124,10 +2122,7 @@ mod test { ); // Verify the actual values - let int32_result = result - .as_any() - .downcast_ref::() - .unwrap(); + let int32_result = result.as_any().downcast_ref::().unwrap(); assert_eq!(int32_result.value(0), 55); // The valid Int32 value } @@ -2167,26 +2162,23 @@ mod test { cast_options: CastOptions::default(), }; - let variant_array_ref: Arc = 
Arc::new(variant_array); + let variant_array_ref = ArrayRef::from(variant_array); let result = variant_get(&variant_array_ref, options).unwrap(); // Verify the result is a StructArray - let struct_result = result - .as_any() - .downcast_ref::() - .unwrap(); + let struct_result = result.as_struct(); assert_eq!(struct_result.len(), 3); // Get the individual field arrays let field_a = struct_result .column(0) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); let field_b = struct_result .column(1) .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); // Verify field values and nulls @@ -2248,13 +2240,13 @@ mod test { cast_options: CastOptions::default(), }; - let variant_array_ref: Arc = Arc::new(variant_array); + let variant_array_ref = ArrayRef::from(variant_array); let result_nullable = variant_get(&variant_array_ref, options_nullable).unwrap(); // Verify we get an Int32Array with nulls for cast failures let int32_result = result_nullable .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); assert_eq!(int32_result.len(), 9); @@ -2303,11 +2295,11 @@ mod test { // Create variant array again since we moved it let variant_array_2 = json_to_variant(&string_array).unwrap(); - let variant_array_ref_2: Arc = Arc::new(variant_array_2); + let variant_array_ref_2 = ArrayRef::from(variant_array_2); let result_non_nullable = variant_get(&variant_array_ref_2, options_non_nullable).unwrap(); let int32_result_2 = result_non_nullable .as_any() - .downcast_ref::() + .downcast_ref::() .unwrap(); // Even with a non-nullable field, safe casting should still produce nulls for failures @@ -2620,7 +2612,7 @@ mod test { cast_options: CastOptions::default(), }; - let variant_array_ref: Arc = Arc::new(variant_array); + let variant_array_ref = ArrayRef::from(variant_array); let result = variant_get(&variant_array_ref, options); // Should fail with NotYetImplemented when the row builder tries to handle struct type @@ -2656,7 +2648,7 @@ mod test { let a_field_struct = 
StructArrayBuilder::new() .with_field("typed_value", Arc::new(a_field_typed_value), true) .build(); - let a_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(a_field_struct)) + let a_field_shredded = ShreddedVariantFieldArray::try_new(&a_field_struct) .expect("should create ShreddedVariantFieldArray for a"); // Field "b": present in rows 0,2 (missing in rows 1,3,4) @@ -2664,7 +2656,7 @@ mod test { let b_field_struct = StructArrayBuilder::new() .with_field("typed_value", Arc::new(b_field_typed_value), true) .build(); - let b_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(b_field_struct)) + let b_field_shredded = ShreddedVariantFieldArray::try_new(&b_field_struct) .expect("should create ShreddedVariantFieldArray for b"); // Field "c": present in row 0 only (missing in all other rows) @@ -2672,7 +2664,7 @@ mod test { let c_field_struct = StructArrayBuilder::new() .with_field("typed_value", Arc::new(c_field_typed_value), true) .build(); - let c_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(c_field_struct)) + let c_field_shredded = ShreddedVariantFieldArray::try_new(&c_field_struct) .expect("should create ShreddedVariantFieldArray for c"); // Create main typed_value struct @@ -2684,9 +2676,9 @@ mod test { let typed_value_struct = StructArray::try_new( typed_value_fields, vec![ - Arc::new(a_field_shredded), - Arc::new(b_field_shredded), - Arc::new(c_field_shredded), + ArrayRef::from(a_field_shredded), + ArrayRef::from(b_field_shredded), + ArrayRef::from(c_field_shredded), ], None, ) @@ -2699,7 +2691,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } /// Create comprehensive nested shredded variant with diverse null patterns @@ -2713,7 +2705,7 @@ mod test { let inner = StructArrayBuilder::new() .with_field("typed_value", Arc::new(inner_typed_value), true) .build(); - let inner = 
ShreddedVariantFieldArray::try_new(Arc::new(inner)).unwrap(); + let inner = ShreddedVariantFieldArray::try_new(&inner).unwrap(); let outer_typed_value_nulls = NullBuffer::from(vec![ true, // row 0: inner struct exists with typed_value=42 @@ -2722,14 +2714,14 @@ mod test { false, // row 3: top-level NULL ]); let outer_typed_value = StructArrayBuilder::new() - .with_field("inner", Arc::new(inner), false) + .with_field("inner", ArrayRef::from(inner), false) .with_nulls(outer_typed_value_nulls) .build(); let outer = StructArrayBuilder::new() .with_field("typed_value", Arc::new(outer_typed_value), true) .build(); - let outer = ShreddedVariantFieldArray::try_new(Arc::new(outer)).unwrap(); + let outer = ShreddedVariantFieldArray::try_new(&outer).unwrap(); let typed_value_nulls = NullBuffer::from(vec![ true, // row 0: inner struct exists with typed_value=42 @@ -2738,7 +2730,7 @@ mod test { false, // row 3: top-level NULL ]); let typed_value = StructArrayBuilder::new() - .with_field("outer", Arc::new(outer), false) + .with_field("outer", ArrayRef::from(outer), false) .with_nulls(typed_value_nulls) .build(); @@ -2757,7 +2749,7 @@ mod test { .with_nulls(nulls) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } /// Create variant with mixed shredding (spec-compliant) including null scenarios @@ -2810,12 +2802,12 @@ mod test { let x_field_struct = StructArrayBuilder::new() .with_field("typed_value", Arc::new(x_field_typed_value), true) .build(); - let x_field_shredded = ShreddedVariantFieldArray::try_new(Arc::new(x_field_struct)) + let x_field_shredded = ShreddedVariantFieldArray::try_new(&x_field_struct) .expect("should create ShreddedVariantFieldArray for x"); // Create main typed_value struct (only contains shredded fields) let typed_value_struct = StructArrayBuilder::new() - .with_field("x", Arc::new(x_field_shredded), false) + .with_field("x", ArrayRef::from(x_field_shredded), false) .build(); 
// Build VariantArray with both value and typed_value (PartiallyShredded) @@ -2828,6 +2820,6 @@ mod test { .with_nulls(variant_nulls) .build(); - Arc::new(VariantArray::try_new(Arc::new(struct_array)).expect("should create VariantArray")) + Arc::new(struct_array) } } diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs index 12be4f0748e3..c1483b74bc5b 100644 --- a/parquet-variant-compute/src/variant_to_arrow.rs +++ b/parquet-variant-compute/src/variant_to_arrow.rs @@ -377,11 +377,13 @@ impl VariantToBinaryVariantArrowRowBuilder { } fn finish(mut self) -> Result { - Ok(Arc::new(VariantArray::from_parts( + let variant_array = VariantArray::from_parts( self.metadata, Some(self.builder.build()?), None, // no typed_value column self.nulls.finish(), - ))) + ); + + Ok(ArrayRef::from(variant_array)) } } diff --git a/parquet/tests/variant_integration.rs b/parquet/tests/variant_integration.rs index ebce056cc4ad..5e5c3d944c34 100644 --- a/parquet/tests/variant_integration.rs +++ b/parquet/tests/variant_integration.rs @@ -24,15 +24,12 @@ //! Inspired by the arrow-go implementation: use arrow::util::test_util::parquet_test_data; -use arrow_array::{Array, ArrayRef}; -use arrow_cast::cast; -use arrow_schema::{DataType, Fields}; use parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; use parquet_variant::{Variant, VariantMetadata}; use parquet_variant_compute::VariantArray; use serde::Deserialize; use std::path::Path; -use std::sync::{Arc, LazyLock}; +use std::sync::LazyLock; use std::{fs, path::PathBuf}; type Result = std::result::Result; @@ -398,57 +395,12 @@ impl VariantTestCase { .column_by_name("var") .unwrap_or_else(|| panic!("No 'var' column found in parquet file {path:?}")); - // the values are read as - // * StructArray - // but VariantArray needs them as - // * StructArray - // - // So cast them to get the right type. 
Hack Alert: the parquet reader - // should read them directly as BinaryView - let var = cast_to_binary_view_arrays(var); - VariantArray::try_new(var).unwrap_or_else(|e| { panic!("Error converting StructArray to VariantArray for {path:?}: {e}") }) } } -fn cast_to_binary_view_arrays(array: &ArrayRef) -> ArrayRef { - let new_type = map_type(array.data_type()); - cast(array, &new_type).unwrap_or_else(|e| { - panic!( - "Error casting array from {:?} to {:?}: {e}", - array.data_type(), - new_type - ) - }) -} - -/// replaces all instances of Binary with BinaryView in a DataType -fn map_type(data_type: &DataType) -> DataType { - match data_type { - DataType::Binary => DataType::BinaryView, - DataType::List(field) => { - let new_field = field - .as_ref() - .clone() - .with_data_type(map_type(field.data_type())); - DataType::List(Arc::new(new_field)) - } - DataType::Struct(fields) => { - let new_fields: Fields = fields - .iter() - .map(|f| { - let new_field = f.as_ref().clone().with_data_type(map_type(f.data_type())); - Arc::new(new_field) - }) - .collect(); - DataType::Struct(new_fields) - } - _ => data_type.clone(), - } -} - /// Variant value loaded from .variant.bin file #[derive(Debug, Clone)] struct ExpectedVariant {