diff --git a/parquet-variant-compute/src/arrow_to_variant.rs b/parquet-variant-compute/src/arrow_to_variant.rs index 26713ce8ee19..6f9a54bcb202 100644 --- a/parquet-variant-compute/src/arrow_to_variant.rs +++ b/parquet-variant-compute/src/arrow_to_variant.rs @@ -15,11 +15,10 @@ // specific language governing permissions and limitations // under the License. -use std::collections::HashMap; - use crate::type_conversion::{decimal_to_variant_decimal, CastOptions}; use arrow::array::{ - Array, AsArray, GenericBinaryArray, GenericStringArray, OffsetSizeTrait, PrimitiveArray, + Array, AsArray, FixedSizeListArray, GenericBinaryArray, GenericListArray, GenericListViewArray, + GenericStringArray, OffsetSizeTrait, PrimitiveArray, }; use arrow::compute::kernels::cast; use arrow::datatypes::{ @@ -36,6 +35,8 @@ use parquet_variant::{ ObjectFieldBuilder, Variant, VariantBuilderExt, VariantDecimal16, VariantDecimal4, VariantDecimal8, }; +use std::collections::HashMap; +use std::ops::Range; // ============================================================================ // Row-oriented builders for efficient Arrow-to-Variant conversion @@ -77,8 +78,11 @@ pub(crate) enum ArrowToVariantRowBuilder<'a> { Utf8(StringArrowToVariantBuilder<'a, i32>), LargeUtf8(StringArrowToVariantBuilder<'a, i64>), Utf8View(StringViewArrowToVariantBuilder<'a>), - List(ListArrowToVariantBuilder<'a, i32>), - LargeList(ListArrowToVariantBuilder<'a, i64>), + List(ListArrowToVariantBuilder<'a, GenericListArray>), + LargeList(ListArrowToVariantBuilder<'a, GenericListArray>), + ListView(ListArrowToVariantBuilder<'a, GenericListViewArray>), + LargeListView(ListArrowToVariantBuilder<'a, GenericListViewArray>), + FixedSizeList(ListArrowToVariantBuilder<'a, FixedSizeListArray>), Struct(StructArrowToVariantBuilder<'a>), Map(MapArrowToVariantBuilder<'a>), Union(UnionArrowToVariantBuilder<'a>), @@ -133,6 +137,9 @@ impl<'a> ArrowToVariantRowBuilder<'a> { Utf8View(b) => b.append_row(builder, index), List(b) => b.append_row(builder, index), LargeList(b) => b.append_row(builder, index), + ListView(b) => b.append_row(builder, index), + LargeListView(b) => b.append_row(builder, index), + FixedSizeList(b) => b.append_row(builder, index), Struct(b) => b.append_row(builder, index), Map(b) => b.append_row(builder, index), Union(b) => b.append_row(builder, index), @@ -238,8 +245,22 @@ pub(crate) fn make_arrow_to_variant_row_builder<'a>( DataType::Utf8 => Utf8(StringArrowToVariantBuilder::new(array)), DataType::LargeUtf8 => LargeUtf8(StringArrowToVariantBuilder::new(array)), DataType::Utf8View => Utf8View(StringViewArrowToVariantBuilder::new(array)), - DataType::List(_) => List(ListArrowToVariantBuilder::new(array, options)?), - DataType::LargeList(_) => LargeList(ListArrowToVariantBuilder::new(array, options)?), + DataType::List(_) => List(ListArrowToVariantBuilder::new(array.as_list(), options)?), + DataType::LargeList(_) => { + LargeList(ListArrowToVariantBuilder::new(array.as_list(), options)?) + } + DataType::ListView(_) => ListView(ListArrowToVariantBuilder::new( + array.as_list_view(), + options, + )?), + DataType::LargeListView(_) => LargeListView(ListArrowToVariantBuilder::new( + array.as_list_view(), + options, + )?), + DataType::FixedSizeList(_, _) => FixedSizeList(ListArrowToVariantBuilder::new( + array.as_fixed_size_list(), + options, + )?), DataType::Struct(_) => Struct(StructArrowToVariantBuilder::new( array.as_struct(), options, @@ -266,11 +287,6 @@ pub(crate) fn make_arrow_to_variant_row_builder<'a>( ))); } }, - dt => { - return Err(ArrowError::CastError(format!( - "Unsupported data type for casting to Variant: {dt:?}", - ))); - } }; Ok(builder) } @@ -425,7 +441,7 @@ define_row_builder!( options: &'a CastOptions, has_time_zone: bool, }, - |array| -> arrow::array::PrimitiveArray { array.as_primitive() }, + |array| -> PrimitiveArray { array.as_primitive() }, |value| -> Option<_> { // Convert using Arrow's temporal conversion functions as_datetime::(value).map(|naive_datetime| { @@ -508,21 +524,21 @@ impl NullArrowToVariantBuilder { } } -/// Generic list builder for List and LargeList types -pub(crate) struct ListArrowToVariantBuilder<'a, O: OffsetSizeTrait> { - list_array: &'a arrow::array::GenericListArray, +/// Generic list builder for ListLikeArray types including List, LargeList, ListView, LargeListView, +/// and FixedSizeList +pub(crate) struct ListArrowToVariantBuilder<'a, L: ListLikeArray> { + list_array: &'a L, values_builder: Box>, } -impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { - pub(crate) fn new(array: &'a dyn Array, options: &'a CastOptions) -> Result { - let list_array = array.as_list(); - let values = list_array.values(); +impl<'a, L: ListLikeArray> ListArrowToVariantBuilder<'a, L> { + pub(crate) fn new(array: &'a L, options: &'a CastOptions) -> Result { + let values = array.values(); let values_builder = - make_arrow_to_variant_row_builder(values.data_type(), values.as_ref(), options)?; + make_arrow_to_variant_row_builder(values.data_type(), values, options)?; Ok(Self { - list_array, + list_array: array, values_builder: Box::new(values_builder), }) } @@ -537,12 +553,10 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { return Ok(()); } - let offsets = self.list_array.offsets(); - let start = offsets[index].as_usize(); - let end = offsets[index + 1].as_usize(); + let range = self.list_array.element_range(index); let mut list_builder = builder.try_new_list()?; - for value_index in start..end { + for value_index in range { self.values_builder .append_row(&mut list_builder, value_index)?; } @@ -551,6 +565,54 @@ impl<'a, O: OffsetSizeTrait> ListArrowToVariantBuilder<'a, O> { } } +/// Trait for list-like arrays that can provide element ranges +pub(crate) trait ListLikeArray: Array { + /// Get the values array + fn values(&self) -> &dyn Array; + + /// Get the start and end indices for a list element + fn element_range(&self, index: usize) -> Range; +} + +impl ListLikeArray for GenericListArray { + fn values(&self) -> &dyn Array { + self.values() + } + + fn element_range(&self, index: usize) -> Range { + let offsets = self.offsets(); + let start = offsets[index].as_usize(); + let end = offsets[index + 1].as_usize(); + start..end + } +} + +impl ListLikeArray for GenericListViewArray { + fn values(&self) -> &dyn Array { + self.values() + } + + fn element_range(&self, index: usize) -> Range { + let offsets = self.value_offsets(); + let sizes = self.value_sizes(); + let offset = offsets[index].as_usize(); + let size = sizes[index].as_usize(); + offset..(offset + size) + } +} + +impl ListLikeArray for FixedSizeListArray { + fn values(&self) -> &dyn Array { + self.values() + } + + fn element_range(&self, index: usize) -> Range { + let value_length = self.value_length().as_usize(); + let offset = index * value_length; + offset..(offset + value_length) + } +} + /// Struct builder for StructArray pub(crate) struct StructArrowToVariantBuilder<'a> { struct_array: &'a arrow::array::StructArray, diff --git a/parquet-variant-compute/src/cast_to_variant.rs b/parquet-variant-compute/src/cast_to_variant.rs index 295019645f62..467e93014700 100644 --- a/parquet-variant-compute/src/cast_to_variant.rs +++ b/parquet-variant-compute/src/cast_to_variant.rs @@ -91,14 +91,14 @@ mod tests { ArrayRef, BinaryArray, BooleanArray, Date32Array, Date64Array, Decimal128Array, Decimal256Array, Decimal32Array, Decimal64Array, DictionaryArray, DurationMicrosecondArray, DurationMillisecondArray, DurationNanosecondArray, DurationSecondArray, - FixedSizeBinaryBuilder, Float16Array, Float32Array, Float64Array, GenericByteBuilder, - GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, + FixedSizeBinaryBuilder, FixedSizeListBuilder, Float16Array, Float32Array, Float64Array, + GenericByteBuilder, GenericByteViewBuilder, Int16Array, Int32Array, Int64Array, Int8Array, IntervalDayTimeArray, IntervalMonthDayNanoArray, IntervalYearMonthArray, LargeListArray, - LargeStringArray, ListArray, MapArray, NullArray, StringArray, StringRunBuilder, - StringViewArray, StructArray, Time32MillisecondArray, Time32SecondArray, - Time64MicrosecondArray, Time64NanosecondArray, TimestampMicrosecondArray, - TimestampMillisecondArray, TimestampNanosecondArray, TimestampSecondArray, UInt16Array, - UInt32Array, UInt64Array, UInt8Array, UnionArray, + LargeListViewBuilder, LargeStringArray, ListArray, ListViewBuilder, MapArray, NullArray, + StringArray, StringRunBuilder, StringViewArray, StructArray, Time32MillisecondArray, + Time32SecondArray, Time64MicrosecondArray, Time64NanosecondArray, + TimestampMicrosecondArray, TimestampMillisecondArray, TimestampNanosecondArray, + TimestampSecondArray, UInt16Array, UInt32Array, UInt64Array, UInt8Array, UnionArray, }; use arrow::buffer::{NullBuffer, OffsetBuffer, ScalarBuffer}; use arrow::datatypes::{ @@ -1258,6 +1258,209 @@ mod tests { ); } + #[test] + fn test_cast_to_variant_list_view() { + // Create a ListViewArray with some data + let mut builder = ListViewBuilder::new(Int32Array::builder(0)); + builder.append_value(&Int32Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int32Array::from(vec![Some(3), Some(4)])); + builder.append_null(); + let list_view_array = builder.finish(); + + // Expected values + let (metadata1, value1) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0i32); + list.append_value(1i32); + list.append_value(2i32); + list.finish(); + builder.finish() + }; + let variant1 = Variant::new(&metadata1, &value1); + + let (metadata2, value2) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i32); + list.append_value(4i32); + list.finish(); + builder.finish() + }; + let variant2 = Variant::new(&metadata2, &value2); + + run_test( + Arc::new(list_view_array), + vec![Some(variant1), Some(variant2), None], + ); + } + + #[test] + fn test_cast_to_variant_sliced_list_view() { + // Create a ListViewArray with some data + let mut builder = ListViewBuilder::new(Int32Array::builder(0)); + builder.append_value(&Int32Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int32Array::from(vec![Some(3), Some(4)])); + builder.append_null(); + let list_view_array = builder.finish(); + + // Expected value for slice(1, 2) - should get the second and third elements + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i32); + list.append_value(4i32); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test( + Arc::new(list_view_array.slice(1, 2)), + vec![Some(variant), None], + ); + } + + #[test] + fn test_cast_to_variant_large_list_view() { + // Create a LargeListViewArray with some data + let mut builder = LargeListViewBuilder::new(Int64Array::builder(0)); + builder.append_value(&Int64Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int64Array::from(vec![Some(3), Some(4)])); + builder.append_null(); + let large_list_view_array = builder.finish(); + + // Expected values + let (metadata1, value1) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0i64); + list.append_value(1i64); + list.append_value(2i64); + list.finish(); + builder.finish() + }; + let variant1 = Variant::new(&metadata1, &value1); + + let (metadata2, value2) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i64); + list.append_value(4i64); + list.finish(); + builder.finish() + }; + let variant2 = Variant::new(&metadata2, &value2); + + run_test( + Arc::new(large_list_view_array), + vec![Some(variant1), Some(variant2), None], + ); + } + + #[test] + fn test_cast_to_variant_sliced_large_list_view() { + // Create a LargeListViewArray with some data + let mut builder = LargeListViewBuilder::new(Int64Array::builder(0)); + builder.append_value(&Int64Array::from(vec![Some(0), Some(1), Some(2)])); + builder.append_value(&Int64Array::from(vec![Some(3), Some(4)])); + builder.append_null(); + let large_list_view_array = builder.finish(); + + // Expected value for slice(1, 2) - should get the second and third elements + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(3i64); + list.append_value(4i64); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test( + Arc::new(large_list_view_array.slice(1, 2)), + vec![Some(variant), None], + ); + } + + #[test] + fn test_cast_to_variant_fixed_size_list() { + let mut builder = FixedSizeListBuilder::new(Int32Array::builder(0), 2); + builder.values().append_value(0); + builder.values().append_value(1); + builder.append(true); // First list: [0, 1] + + builder.values().append_value(2); + builder.values().append_value(3); + builder.append(true); // Second list: [2, 3] + + builder.values().append_nulls(2); + builder.append(false); // Third list: null + + let fixed_size_list_array = builder.finish(); + + // Expected values + let (metadata1, value1) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(0i32); + list.append_value(1i32); + list.finish(); + builder.finish() + }; + let variant1 = Variant::new(&metadata1, &value1); + + let (metadata2, value2) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(2i32); + list.append_value(3i32); + list.finish(); + builder.finish() + }; + let variant2 = Variant::new(&metadata2, &value2); + + run_test( + Arc::new(fixed_size_list_array), + vec![Some(variant1), Some(variant2), None], + ); + } + + #[test] + fn test_cast_to_variant_sliced_fixed_size_list() { + // Create a FixedSizeListArray with size 2 + let mut builder = FixedSizeListBuilder::new(Int64Array::builder(0), 2); + builder.values().append_value(0); + builder.values().append_value(1); + builder.append(true); // First list: [0, 1] + + builder.values().append_value(2); + builder.values().append_value(3); + builder.append(true); // Second list: [2, 3] + + builder.values().append_nulls(2); + builder.append(false); // Third list: null + + let fixed_size_list_array = builder.finish(); + + // Expected value for slice(1, 2) - should get the second and third elements + let (metadata, value) = { + let mut builder = VariantBuilder::new(); + let mut list = builder.new_list(); + list.append_value(2i64); + list.append_value(3i64); + list.finish(); + builder.finish() + }; + let variant = Variant::new(&metadata, &value); + + run_test( + Arc::new(fixed_size_list_array.slice(1, 2)), + vec![Some(variant), None], + ); + } + #[test] fn test_cast_to_variant_struct() { // Test a simple struct with two fields: id (int64) and age (int32)