From 4c5b64421c0edcfdfbce9641215e30af9b50cbd0 Mon Sep 17 00:00:00 2001 From: Richard Baah Date: Wed, 18 Jun 2025 23:55:03 -0400 Subject: [PATCH 1/4] Implemented casting for RunEnd Encoding --- arrow-cast/src/cast/mod.rs | 275 ++++++++++++++++++++++++ arrow-cast/src/cast/run_array.rs | 357 +++++++++++++++++++++++++++++++ 2 files changed, 632 insertions(+) create mode 100644 arrow-cast/src/cast/run_array.rs diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index b317dabd5dda..5683b8ba4981 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -41,11 +41,13 @@ mod decimal; mod dictionary; mod list; mod map; +mod run_array; mod string; use crate::cast::decimal::*; use crate::cast::dictionary::*; use crate::cast::list::*; use crate::cast::map::*; +use crate::cast::run_array::{cast_to_run_end_encoded, run_end_encoded_cast}; use crate::cast::string::*; use arrow_buffer::IntervalMonthDayNano; @@ -137,6 +139,10 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { can_cast_types(from_value_type, to_value_type) } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), + (RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type), + (_, RunEndEncoded(_, _value_type)) => true, + + (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) @@ -739,6 +745,28 @@ pub fn cast_with_options( | Map(_, _) | Dictionary(_, _), ) => Ok(new_null_array(to_type, array.len())), + (RunEndEncoded(index_type, _), _) => match index_type.data_type() { + Int16 => run_end_encoded_cast::(array, to_type, cast_options), + Int32 => run_end_encoded_cast::(array, to_type, cast_options), + Int64 => run_end_encoded_cast::(array, to_type, cast_options), + _ => Err(ArrowError::CastError(format!( + "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", + ))), + }, + (_, RunEndEncoded(index_type, value_type)) => match index_type.data_type() { + Int16 => { + cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) + } + Int32 => { + cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) + } + Int64 => { + cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) + } + _ => Err(ArrowError::CastError(format!( + "Casting from type {from_type:?} to run end encoded type {to_type:?} not supported", + ))), + }, (Dictionary(index_type, _), _) => match **index_type { Int8 => dictionary_cast::(array, to_type, cast_options), Int16 => dictionary_cast::(array, to_type, cast_options), @@ -10684,4 +10712,251 @@ mod tests { )) as ArrayRef; assert_eq!(*fixed_array, *r); } + #[cfg(test)] + mod run_end_encoded_tests { + use super::*; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + /// Test casting FROM RunEndEncoded to primitive types + #[test] + fn test_run_end_encoded_to_primitive() { + // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] + let run_ends = Int32Array::from(vec![2, 5, 6]); + let values = Int32Array::from(vec![1, 2, 3]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to Int64 + let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); + + // Verify the result is a RunArray with Int64 values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were cast to Int64 + assert_eq!(result_run_array.values().data_type(), &DataType::Int64); + + // Check that run structure is preserved + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + + // Check that values are correct + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]); + } + + /// Test casting FROM RunEndEncoded to string + #[test] + fn test_run_end_encoded_to_string() { + // Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![10, 20, 30]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to String + let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); + + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were cast to String + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + + // Check that run structure is preserved + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + + // Check that values are correct + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "10"); + assert_eq!(values_array.value(1), "20"); + assert_eq!(values_array.value(2), "30"); + } + + /// Test casting TO RunEndEncoded from primitive types + #[test] + fn test_primitive_to_run_end_encoded() { + // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check run structure: runs should end at positions [2, 5, 6] + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + + // Check values: should be [1, 2, 3] + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1, 2, 3]); + } + + /// Test casting TO RunEndEncoded from string + #[test] + fn test_string_to_run_end_encoded() { + // Create a String array with repeated values: ["a", "a", "b", "c", "c"] + let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check run structure: runs should end at positions [2, 3, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + + // Check values: should be ["a", "b", "c"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "a"); + assert_eq!(values_array.value(1), "b"); + assert_eq!(values_array.value(2), "c"); + } + + /// Test casting with type conversion (Int32 -> RunEndEncoded) + #[test] + fn test_cast_with_type_conversion() { + // Create an Int32 array: [1, 1, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded (values get converted to strings) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were converted to strings + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + + // Check run structure: runs should end at positions [2, 4, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); + + // Check values: should be ["1", "2", "3"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert_eq!(values_array.value(1), "2"); + assert_eq!(values_array.value(2), "3"); + } + + /// Test casting empty array to RunEndEncoded + #[test] + fn test_empty_array_to_run_end_encoded() { + // Create an empty Int32 array + let source_array = Int32Array::from(Vec::::new()); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + + // Verify the result is an empty RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that both run_ends and values are empty + assert_eq!(result_run_array.run_ends().len(), 0); + assert_eq!(result_run_array.values().len(), 0); + } + + /// Test casting RunEndEncoded with nulls + #[test] + fn test_run_end_encoded_with_nulls() { + // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![Some(1), None, Some(2)]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to String + let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); + + // Verify the result preserves nulls + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert!(values_array.is_null(1)); + assert_eq!(values_array.value(2), "2"); + } + + /// Test different index types (Int16, Int64) + #[test] + fn test_different_index_types() { + // Test with Int16 index type + let source_array = Int32Array::from(vec![1, 1, 2, 3, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + assert_eq!(cast_result.data_type(), &target_type); + + // Test with Int64 index type + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int64, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + assert_eq!(cast_result.data_type(), &target_type); + } + #[test] + fn test_unsupported_cast_to_run_end_encoded() { + // Create a Struct array - complex nested type that might not be supported + let field = Field::new("item", DataType::Int32, false); + let struct_array = StructArray::from(vec![( + Arc::new(field), + Arc::new(Int32Array::from(vec![1, 2, 3])) as ArrayRef, + )]); + let array_ref = Arc::new(struct_array) as ArrayRef; + + // This should fail because: + // 1. The target type is not RunEndEncoded + // 2. The target type is not supported for casting from StructArray + let cast_result = cast(&array_ref, &DataType::FixedSizeBinary(10)); + + // Expect this to fail + assert!(cast_result.is_err()); + } + } } diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs new file mode 100644 index 000000000000..99ec54559fce --- /dev/null +++ b/arrow-cast/src/cast/run_array.rs @@ -0,0 +1,357 @@ +use crate::cast::*; + +pub(crate) fn run_end_encoded_cast( + array: &dyn Array, + to_type: &DataType, + cast_options: &CastOptions, +) -> Result { + match array.data_type() { + DataType::RunEndEncoded(_run_end_field, _values_field) => { + let run_array = array.as_any().downcast_ref::>().unwrap(); + + let values = run_array.values(); + + // Cast the values to the target type + let cast_values = cast_with_options(values, to_type, cast_options)?; + + // Create a PrimitiveArray from the run_ends buffer + let run_ends_buffer = run_array.run_ends(); + let run_ends_array = + PrimitiveArray::::from_iter_values(run_ends_buffer.values().iter().copied()); + + // Create new RunArray with the same run_ends but cast values + let new_run_array = RunArray::::try_new(&run_ends_array, cast_values.as_ref())?; + + Ok(Arc::new(new_run_array)) + } + _ => Err(ArrowError::CastError(format!( + "Cannot cast array of type {:?} to RunEndEncodedArray", + array.data_type() + ))), + } +} + +/// Attempts to cast an array to a RunEndEncoded array with the specified index type K +/// and value type. This function performs run-length encoding on the input array. +/// +/// # Arguments +/// * `array` - The input array to be run-length encoded +/// * `value_type` - The target data type for the values in the RunEndEncoded array +/// * `cast_options` - Options controlling the casting behavior +/// +/// # Returns +/// A `Result` containing the new `RunArray` or an `ArrowError` if casting fails +/// +/// # Process +/// 1. Cast the input array to the target value type if needed +/// 2. Iterate through the array to identify runs of consecutive equal values +/// 3. Build run_ends array indicating where each run terminates +/// 4. Build values array containing the unique values for each run +/// 5. Construct and return the RunArray +pub(crate) fn cast_to_run_end_encoded( + array: &dyn Array, + value_type: &DataType, + cast_options: &CastOptions, +) -> Result { + // Step 1: Cast the input array to the target value type if necessary + let cast_array = if array.data_type() == value_type { + // No casting needed, use the array as-is + make_array(array.to_data()) + } else { + // Cast to the target value type + cast_with_options(array, value_type, cast_options)? + }; + + // Step 2: Run-length encode the cast array + // We'll use a builder to construct the RunArray efficiently + let mut run_ends_builder = PrimitiveBuilder::::new(); + + if cast_array.len() == 0 { + // Handle empty array case + let empty_run_ends = run_ends_builder.finish(); + let empty_values = make_array(ArrayData::new_empty(value_type)); + return Ok(Arc::new(RunArray::::try_new( + &empty_run_ends, + empty_values.as_ref(), + )?)); + } + + // Step 3: Use a simpler approach - use existing Arrow builders for run-length encoding + // This is a more robust implementation that handles all data types correctly + + // For now, we'll use a basic approach that works with the existing builder infrastructure + // In a production implementation, you'd want to use type-specific comparison logic + + // Create a temporary builder to construct the run array + // We'll iterate through and build runs by comparing adjacent elements + let mut run_ends_vec = Vec::new(); + let mut values_indices = Vec::new(); + + let mut current_run_end = 1usize; + + // Add the first element as the start of the first run + values_indices.push(0); + + for i in 1..cast_array.len() { + // For simplicity, we'll use a basic comparison approach + // In practice, you'd want more sophisticated comparison based on data type + let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { + (true, true) => true, // Both null + (false, false) => { + // Both non-null - use slice comparison as a basic approach + // This is a simplified implementation + cast_array.slice(i, 1).to_data() == cast_array.slice(i - 1, 1).to_data() + } + _ => false, // One null, one not null + }; + + if !values_equal { + // End current run, start new run + run_ends_vec.push(current_run_end); + values_indices.push(i); + } + + current_run_end += 1; + } + + // Add the final run end + run_ends_vec.push(current_run_end); + + // Step 4: Build the run_ends array + for &run_end in &run_ends_vec { + run_ends_builder.append_value(K::Native::from_usize(run_end).unwrap()); + } + let run_ends_array = run_ends_builder.finish(); + + // Step 5: Build the values array by taking elements at the run start positions + let indices = PrimitiveArray::::from_iter_values( + values_indices.iter().map(|&idx| idx as u32), + ); + let values_array = take(&cast_array, &indices, None)?; + + // Step 7: Create and return the RunArray + let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; + Ok(Arc::new(run_array)) +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_array::*; + use arrow_schema::DataType; + use std::sync::Arc; + + /// Test casting FROM RunEndEncoded to other types + #[test] + fn test_run_end_encoded_to_primitive() { + // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] + let run_ends = Int32Array::from(vec![2, 5, 6]); + let values = Int32Array::from(vec![1, 2, 3]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to Int64 + let cast_result = run_end_encoded_cast::( + array_ref.as_ref(), + &DataType::Int64, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray with Int64 values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were cast to Int64 + assert_eq!(result_run_array.values().data_type(), &DataType::Int64); + + // Check that run structure is preserved + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + + // Check that values are correct + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]); + } + + #[test] + fn test_run_end_encoded_to_string() { + // Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![10, 20, 30]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to String + let cast_result = run_end_encoded_cast::( + array_ref.as_ref(), + &DataType::Utf8, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were cast to String + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + + // Check that run structure is preserved + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + + // Check that values are correct + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "10"); + assert_eq!(values_array.value(1), "20"); + assert_eq!(values_array.value(2), "30"); + } + + /// Test casting TO RunEndEncoded from other types + #[test] + fn test_primitive_to_run_end_encoded() { + // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let cast_result = cast_to_run_end_encoded::( + array_ref.as_ref(), + &DataType::Int32, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check run structure: runs should end at positions [2, 5, 6] + assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); + + // Check values: should be [1, 2, 3] + let values_array = result_run_array.values().as_primitive::(); + assert_eq!(values_array.values(), &[1, 2, 3]); + } + + #[test] + fn test_string_to_run_end_encoded() { + // Create a String array with repeated values: ["a", "a", "b", "c", "c"] + let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let cast_result = cast_to_run_end_encoded::( + array_ref.as_ref(), + &DataType::Utf8, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check run structure: runs should end at positions [2, 3, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); + + // Check values: should be ["a", "b", "c"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "a"); + assert_eq!(values_array.value(1), "b"); + assert_eq!(values_array.value(2), "c"); + } + + #[test] + fn test_cast_with_type_conversion() { + // Create an Int32 array: [1, 1, 2, 2, 3] + let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded (values get converted to strings) + let cast_result = cast_to_run_end_encoded::( + array_ref.as_ref(), + &DataType::Utf8, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is a RunArray with String values + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that values were converted to strings + assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); + + // Check run structure: runs should end at positions [2, 4, 5] + assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); + + // Check values: should be ["1", "2", "3"] + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert_eq!(values_array.value(1), "2"); + assert_eq!(values_array.value(2), "3"); + } + + #[test] + fn test_empty_array_to_run_end_encoded() { + // Create an empty Int32 array + let source_array = Int32Array::from(Vec::::new()); + let array_ref = Arc::new(source_array) as ArrayRef; + + // Cast to RunEndEncoded + let cast_result = cast_to_run_end_encoded::( + array_ref.as_ref(), + &DataType::Int32, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result is an empty RunArray + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + // Check that both run_ends and values are empty + assert_eq!(result_run_array.run_ends().len(), 0); + assert_eq!(result_run_array.values().len(), 0); + } + + #[test] + fn test_run_end_encoded_with_nulls() { + // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] + let run_ends = Int32Array::from(vec![2, 3, 5]); + let values = Int32Array::from(vec![Some(1), None, Some(2)]); + let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(run_array) as ArrayRef; + + // Cast to String + let cast_result = run_end_encoded_cast::( + array_ref.as_ref(), + &DataType::Utf8, + &CastOptions::default(), + ) + .unwrap(); + + // Verify the result preserves nulls + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + + let values_array = result_run_array.values().as_string::(); + assert_eq!(values_array.value(0), "1"); + assert!(values_array.is_null(1)); + assert_eq!(values_array.value(2), "2"); + } +} From 5307851706c9965e4bc6cce4c334dd438a7332c0 Mon Sep 17 00:00:00 2001 From: Richard Baah Date: Wed, 18 Jun 2025 23:55:03 -0400 Subject: [PATCH 2/4] Implemented casting for RunEnd Encoding --- arrow-cast/src/cast/mod.rs | 45 ++--- arrow-cast/src/cast/run_array.rs | 277 ++++--------------------------- 2 files changed, 49 insertions(+), 273 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 5683b8ba4981..cb50a7384328 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -10726,25 +10726,16 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); let array_ref = Arc::new(run_array) as ArrayRef; - + println!("1"); // Cast to Int64 let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); - + println!("2"); // Verify the result is a RunArray with Int64 values let result_run_array = cast_result .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); - - // Check that values were cast to Int64 - assert_eq!(result_run_array.values().data_type(), &DataType::Int64); - - // Check that run structure is preserved - assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); - - // Check that values are correct - let values_array = result_run_array.values().as_primitive::(); - assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]); + assert_eq!(result_run_array.values(), &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]); } /// Test casting FROM RunEndEncoded to string @@ -10760,22 +10751,14 @@ mod tests { let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); // Verify the result is a RunArray with String values - let result_run_array = cast_result + let result_array = cast_result .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); - - // Check that values were cast to String - assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); - - // Check that run structure is preserved - assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); - // Check that values are correct - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "10"); - assert_eq!(values_array.value(1), "20"); - assert_eq!(values_array.value(2), "30"); + assert_eq!(result_array.value(0), "10"); + assert_eq!(result_array.value(1), "10"); + assert_eq!(result_array.value(2), "20"); } /// Test casting TO RunEndEncoded from primitive types @@ -10909,13 +10892,11 @@ mod tests { // Verify the result preserves nulls let result_run_array = cast_result .as_any() - .downcast_ref::>() + .downcast_ref::() .unwrap(); - - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "1"); - assert!(values_array.is_null(1)); - assert_eq!(values_array.value(2), "2"); + assert_eq!(result_run_array.value(0), "1"); + assert!(result_run_array.is_null(2)); + assert_eq!(result_run_array.value(4), "2"); } /// Test different index types (Int16, Int64) diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 99ec54559fce..72b0aff586a0 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -7,23 +7,46 @@ pub(crate) fn run_end_encoded_cast( ) -> Result { match array.data_type() { DataType::RunEndEncoded(_run_end_field, _values_field) => { - let run_array = array.as_any().downcast_ref::>().unwrap(); + let run_array = array + .as_any() + .downcast_ref::>() + .ok_or_else(|| ArrowError::CastError("Expected RunArray".to_string()))?; let values = run_array.values(); - // Cast the values to the target type - let cast_values = cast_with_options(values, to_type, cast_options)?; - - // Create a PrimitiveArray from the run_ends buffer - let run_ends_buffer = run_array.run_ends(); - let run_ends_array = - PrimitiveArray::::from_iter_values(run_ends_buffer.values().iter().copied()); - - // Create new RunArray with the same run_ends but cast values - let new_run_array = RunArray::::try_new(&run_ends_array, cast_values.as_ref())?; - - Ok(Arc::new(new_run_array)) + match to_type { + // CASE 1: Stay as RunEndEncoded, cast only the values + DataType::RunEndEncoded(_target_run_end_field, target_value_field) => { + let cast_values = + cast_with_options(values, target_value_field.data_type(), cast_options)?; + + let run_ends_array = PrimitiveArray::::from_iter_values( + run_array.run_ends().values().iter().copied(), + ); + + let new_run_array = + RunArray::::try_new(&run_ends_array, cast_values.as_ref())?; + Ok(Arc::new(new_run_array)) + } + + // CASE 2: Expand to logical form + _ => { + let total_len = run_array.len(); + let indices = Int32Array::from_iter_values( + (0..total_len).map(|i| run_array.get_physical_index(i) as i32), + ); + + let taken = take(values.as_ref(), &indices, None)?; + + if taken.data_type() != to_type { + cast_with_options(taken.as_ref(), to_type, cast_options) + } else { + Ok(taken) + } + } + } } + _ => Err(ArrowError::CastError(format!( "Cannot cast array of type {:?} to RunEndEncodedArray", array.data_type() @@ -76,12 +99,6 @@ pub(crate) fn cast_to_run_end_encoded( )?)); } - // Step 3: Use a simpler approach - use existing Arrow builders for run-length encoding - // This is a more robust implementation that handles all data types correctly - - // For now, we'll use a basic approach that works with the existing builder infrastructure - // In a production implementation, you'd want to use type-specific comparison logic - // Create a temporary builder to construct the run array // We'll iterate through and build runs by comparing adjacent elements let mut run_ends_vec = Vec::new(); @@ -133,225 +150,3 @@ pub(crate) fn cast_to_run_end_encoded( let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; Ok(Arc::new(run_array)) } - -#[cfg(test)] -mod tests { - use super::*; - use arrow_array::*; - use arrow_schema::DataType; - use std::sync::Arc; - - /// Test casting FROM RunEndEncoded to other types - #[test] - fn test_run_end_encoded_to_primitive() { - // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] - let run_ends = Int32Array::from(vec![2, 5, 6]); - let values = Int32Array::from(vec![1, 2, 3]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; - - // Cast to Int64 - let cast_result = run_end_encoded_cast::( - array_ref.as_ref(), - &DataType::Int64, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray with Int64 values - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that values were cast to Int64 - assert_eq!(result_run_array.values().data_type(), &DataType::Int64); - - // Check that run structure is preserved - assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); - - // Check that values are correct - let values_array = result_run_array.values().as_primitive::(); - assert_eq!(values_array.values(), &[1i64, 2i64, 3i64]); - } - - #[test] - fn test_run_end_encoded_to_string() { - // Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30] - let run_ends = Int32Array::from(vec![2, 3, 5]); - let values = Int32Array::from(vec![10, 20, 30]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; - - // Cast to String - let cast_result = run_end_encoded_cast::( - array_ref.as_ref(), - &DataType::Utf8, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray with String values - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that values were cast to String - assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); - - // Check that run structure is preserved - assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); - - // Check that values are correct - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "10"); - assert_eq!(values_array.value(1), "20"); - assert_eq!(values_array.value(2), "30"); - } - - /// Test casting TO RunEndEncoded from other types - #[test] - fn test_primitive_to_run_end_encoded() { - // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] - let source_array = Int32Array::from(vec![1, 1, 2, 2, 2, 3]); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded - let cast_result = cast_to_run_end_encoded::( - array_ref.as_ref(), - &DataType::Int32, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check run structure: runs should end at positions [2, 5, 6] - assert_eq!(result_run_array.run_ends().values(), &[2, 5, 6]); - - // Check values: should be [1, 2, 3] - let values_array = result_run_array.values().as_primitive::(); - assert_eq!(values_array.values(), &[1, 2, 3]); - } - - #[test] - fn test_string_to_run_end_encoded() { - // Create a String array with repeated values: ["a", "a", "b", "c", "c"] - let source_array = StringArray::from(vec!["a", "a", "b", "c", "c"]); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded - let cast_result = cast_to_run_end_encoded::( - array_ref.as_ref(), - &DataType::Utf8, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check run structure: runs should end at positions [2, 3, 5] - assert_eq!(result_run_array.run_ends().values(), &[2, 3, 5]); - - // Check values: should be ["a", "b", "c"] - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "a"); - assert_eq!(values_array.value(1), "b"); - assert_eq!(values_array.value(2), "c"); - } - - #[test] - fn test_cast_with_type_conversion() { - // Create an Int32 array: [1, 1, 2, 2, 3] - let source_array = Int32Array::from(vec![1, 1, 2, 2, 3]); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded (values get converted to strings) - let cast_result = cast_to_run_end_encoded::( - array_ref.as_ref(), - &DataType::Utf8, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is a RunArray with String values - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that values were converted to strings - assert_eq!(result_run_array.values().data_type(), &DataType::Utf8); - - // Check run structure: runs should end at positions [2, 4, 5] - assert_eq!(result_run_array.run_ends().values(), &[2, 4, 5]); - - // Check values: should be ["1", "2", "3"] - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "1"); - assert_eq!(values_array.value(1), "2"); - assert_eq!(values_array.value(2), "3"); - } - - #[test] - fn test_empty_array_to_run_end_encoded() { - // Create an empty Int32 array - let source_array = Int32Array::from(Vec::::new()); - let array_ref = Arc::new(source_array) as ArrayRef; - - // Cast to RunEndEncoded - let cast_result = cast_to_run_end_encoded::( - array_ref.as_ref(), - &DataType::Int32, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result is an empty RunArray - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - // Check that both run_ends and values are empty - assert_eq!(result_run_array.run_ends().len(), 0); - assert_eq!(result_run_array.values().len(), 0); - } - - #[test] - fn test_run_end_encoded_with_nulls() { - // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] - let run_ends = Int32Array::from(vec![2, 3, 5]); - let values = Int32Array::from(vec![Some(1), None, Some(2)]); - let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); - let array_ref = Arc::new(run_array) as ArrayRef; - - // Cast to String - let cast_result = run_end_encoded_cast::( - array_ref.as_ref(), - &DataType::Utf8, - &CastOptions::default(), - ) - .unwrap(); - - // Verify the result preserves nulls - let result_run_array = cast_result - .as_any() - .downcast_ref::>() - .unwrap(); - - let values_array = result_run_array.values().as_string::(); - assert_eq!(values_array.value(0), "1"); - assert!(values_array.is_null(1)); - assert_eq!(values_array.value(2), "2"); - } -} From 0452360ce42123a2c395f820cfd7474e2b2479d1 Mon Sep 17 00:00:00 2001 From: Richard Baah Date: Mon, 23 Jun 2025 14:42:24 -0400 Subject: [PATCH 3/4] feat: Add Run-End Encoded array casting with overflow protection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement casting between REE arrays and other Arrow types. REE-to-REE casting validates run-end upcasts only (Int16→Int32, Int16→Int64, Int32→Int64) to prevent invalid sequences. --- arrow-cast/src/cast/mod.rs | 176 ++++++++++++++++++++++++++----- arrow-cast/src/cast/run_array.rs | 155 +++++++++++++++++++++++---- 2 files changed, 284 insertions(+), 47 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index cb50a7384328..6b3d6b88f7f2 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -47,7 +47,9 @@ use crate::cast::decimal::*; use crate::cast::dictionary::*; use crate::cast::list::*; use crate::cast::map::*; -use crate::cast::run_array::{cast_to_run_end_encoded, run_end_encoded_cast}; +use crate::cast::run_array::{ + can_cast_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast, +}; use crate::cast::string::*; use arrow_buffer::IntervalMonthDayNano; @@ -140,9 +142,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), (RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type), - (_, RunEndEncoded(_, _value_type)) => true, - - + (_, RunEndEncoded(_, _value_type)) => can_cast_run_end_encoded(from_type, to_type), (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) @@ -745,14 +745,18 @@ pub fn cast_with_options( | Map(_, _) | Dictionary(_, _), ) => Ok(new_null_array(to_type, array.len())), - (RunEndEncoded(index_type, _), _) => match index_type.data_type() { - Int16 => run_end_encoded_cast::(array, to_type, cast_options), - Int32 => run_end_encoded_cast::(array, to_type, cast_options), - Int64 => run_end_encoded_cast::(array, to_type, cast_options), - _ => Err(ArrowError::CastError(format!( - "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", - ))), - }, + (RunEndEncoded(index_type, _), _) => { + let mut new_cast_options = cast_options.clone(); + new_cast_options.safe = false; + match index_type.data_type() { + Int16 => run_end_encoded_cast::(array, to_type, &new_cast_options), + Int32 => run_end_encoded_cast::(array, to_type, &new_cast_options), + Int64 => run_end_encoded_cast::(array, to_type, &new_cast_options), + _ => Err(ArrowError::CastError(format!( + "Casting from run end encoded type {from_type:?} to {to_type:?} not supported", + ))), + } + } (_, RunEndEncoded(index_type, value_type)) => match index_type.data_type() { Int16 => { cast_to_run_end_encoded::(array, value_type.data_type(), cast_options) @@ -10726,16 +10730,14 @@ mod tests { let values = Int32Array::from(vec![1, 2, 3]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); let array_ref = Arc::new(run_array) as ArrayRef; - println!("1"); // Cast to Int64 let cast_result = cast(&array_ref, &DataType::Int64).unwrap(); - println!("2"); // Verify the result is a RunArray with Int64 values - let result_run_array = cast_result - .as_any() - .downcast_ref::() - .unwrap(); - assert_eq!(result_run_array.values(), &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64]); + let result_run_array = cast_result.as_any().downcast_ref::().unwrap(); + assert_eq!( + result_run_array.values(), + &[1i64, 1i64, 2i64, 2i64, 2i64, 3i64] + ); } /// Test casting FROM RunEndEncoded to string @@ -10751,10 +10753,7 @@ mod tests { let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); // Verify the result is a RunArray with String values - let result_array = cast_result - .as_any() - .downcast_ref::() - .unwrap(); + let result_array = cast_result.as_any().downcast_ref::().unwrap(); // Check that values are correct assert_eq!(result_array.value(0), "10"); assert_eq!(result_array.value(1), "10"); @@ -10890,10 +10889,7 @@ mod tests { let cast_result = cast(&array_ref, &DataType::Utf8).unwrap(); // Verify the result preserves nulls - let result_run_array = cast_result - .as_any() - .downcast_ref::() - .unwrap(); + let result_run_array = cast_result.as_any().downcast_ref::().unwrap(); assert_eq!(result_run_array.value(0), "1"); assert!(result_run_array.is_null(2)); assert_eq!(result_run_array.value(4), "2"); @@ -10939,5 +10935,131 @@ mod tests { // Expect this to fail assert!(cast_result.is_err()); } + #[test] + fn test_cast_run_end_encoded_int64_to_int16_should_fail() { + use arrow_array::{Int64Array, RunArray, StringArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + // Construct a valid REE array with Int64 run-ends + let run_ends = Int64Array::from(vec![100_000, 400_000, 700_000]); // values too large for Int16 + let values = StringArray::from(vec!["a", "b", "c"]); + + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; + + // Attempt to cast to RunEndEncoded + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, // This should make it fail instead of returning nulls + format_options: FormatOptions::default(), + }; + + // This should fail due to run-end overflow + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + + match result { + Err(e) => { + assert!(e + .to_string() + .contains("Cast error: Can't cast value 100000 to type Int16")); + } + Ok(_array_ref) => { + panic!("This should not happen"); + } + } + } + #[test] + fn test_cast_run_end_encoded_int16_to_int64_should_succeed() { + use arrow_array::{Int16Array, RunArray, StringArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + // Construct a valid REE array with Int16 run-ends + let run_ends = Int16Array::from(vec![2, 5, 8]); // values that fit in Int16 + let values = StringArray::from(vec!["a", "b", "c"]); + + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; + + // Attempt to cast to RunEndEncoded (upcast should succeed) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int64, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + + // This should succeed due to valid upcast + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + + match result { + Ok(array_ref) => { + // Downcast to RunArray + let run_array = array_ref + .as_any() + .downcast_ref::>() + .unwrap(); + + // Verify the cast worked correctly + // Assert the values were cast correctly + assert_eq!(run_array.run_ends().values(), &[2i64, 5i64, 8i64]); + assert_eq!(run_array.values().as_string::().value(0), "a"); + assert_eq!(run_array.values().as_string::().value(1), "b"); + assert_eq!(run_array.values().as_string::().value(2), "c"); + } + Err(e) => { + panic!("Cast should have succeeded but failed: {}", e); + } + } + } + + #[test] + fn test_cast_run_end_encoded_int32_to_int16_should_fail() { + use arrow_array::{Int32Array, RunArray, StringArray}; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + // Construct a valid REE array with Int32 run-ends + let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16 + let values = StringArray::from(vec!["x", "y", "z"]); + + println!("Original run_ends null count: {}", run_ends.null_count()); + println!("Original run_ends values: {:?}", run_ends.values()); + + let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); + let array_ref = Arc::new(ree_array) as ArrayRef; + + // Attempt to cast to RunEndEncoded (downcast should fail) + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Utf8, true)), + ); + let cast_options = CastOptions { + safe: false, + format_options: FormatOptions::default(), + }; + + // This should fail due to run-end overflow + let result: Result, ArrowError> = + cast_with_options(&array_ref, &target_type, &cast_options); + + match result { + Ok(_) => { + panic!("Cast should have failed due to overflow but succeeded"); + } + Err(e) => { + // Verify the error is about overflow/out of range + assert!(e.to_string().contains("Can't cast value")); + } + } + } } } diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 72b0aff586a0..3d0c5ec6a4d0 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -1,12 +1,54 @@ use crate::cast::*; - +/// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE +/// and REE-to-other type conversions with proper validation and error handling. +/// +/// # Arguments +/// * `array` - The input Run-End Encoded array to be cast +/// * `to_type` - The target data type for the casting operation +/// * `cast_options` - Options controlling the casting behavior (e.g., safe vs unsafe) +/// +/// # Returns +/// A `Result` containing the new `ArrayRef` or an `ArrowError` if casting fails +/// +/// # Behavior +/// This function handles two main casting scenarios: +/// +/// ## Case 1: REE-to-REE Casting +/// When casting to another Run-End Encoded type: +/// - Casts both the `values` and `run_ends` to their target types +/// - Validates that run-end casting only allows upcasts (Int16→Int32, Int16→Int64, Int32→Int64) +/// - Preserves the REE structure while updating both fields +/// - Returns a new `RunArray` with the appropriate run-end type (Int16, Int32, or Int64) +/// +/// ## Case 2: REE-to-Other Casting +/// When casting to a non-REE type: +/// - Expands the REE array to its logical form by unpacking all values +/// - Applies the target type casting to the expanded array +/// - Returns a regular array of the target type (e.g., StringArray, Int64Array) +/// +/// # Error Handling, error occurs if: +/// - the input array is not a Run-End Encoded array +/// - run-end downcasting would cause overflow +/// - the target run-end type is unsupported +/// - Propagates errors from underlying casting operations +/// +/// # Safety Considerations +/// - Run-end casting uses `safe: false` to prevent silent overflow +/// - Only upcasts are allowed for run-ends to maintain valid REE structure +/// - Unpacking preserves null values and array length +/// - Type validation ensures only supported run-end types (Int16, Int32, Int64) +/// +/// # Performance Notes +/// - REE-to-REE casting is efficient as it operates on the compressed structure +/// - REE-to-other casting requires full unpacking, which may be expensive for large arrays +/// - Run-end validation adds minimal overhead for safety pub(crate) fn run_end_encoded_cast( array: &dyn Array, to_type: &DataType, cast_options: &CastOptions, ) -> Result { match array.data_type() { - DataType::RunEndEncoded(_run_end_field, _values_field) => { + DataType::RunEndEncoded(_, _) => { let run_array = array .as_any() .downcast_ref::>() @@ -16,16 +58,37 @@ pub(crate) fn run_end_encoded_cast( match to_type { // CASE 1: Stay as RunEndEncoded, cast only the values - DataType::RunEndEncoded(_target_run_end_field, target_value_field) => { + DataType::RunEndEncoded(target_index_field, target_value_field) => { let cast_values = cast_with_options(values, target_value_field.data_type(), cast_options)?; let run_ends_array = PrimitiveArray::::from_iter_values( run_array.run_ends().values().iter().copied(), ); - - let new_run_array = - RunArray::::try_new(&run_ends_array, cast_values.as_ref())?; + let cast_run_ends = cast_with_options( + &run_ends_array, + target_index_field.data_type(), + cast_options, + )?; + let new_run_array: ArrayRef = match target_index_field.data_type() { + DataType::Int16 => { + let re = cast_run_ends.as_primitive::(); + Arc::new(RunArray::::try_new(re, cast_values.as_ref())?) + } + DataType::Int32 => { + let re = cast_run_ends.as_primitive::(); + Arc::new(RunArray::::try_new(re, cast_values.as_ref())?) + } + DataType::Int64 => { + let re = cast_run_ends.as_primitive::(); + Arc::new(RunArray::::try_new(re, cast_values.as_ref())?) + } + _ => { + return Err(ArrowError::CastError( + "Run-end type must be i16, i32, or i64".to_string(), + )) + } + }; Ok(Arc::new(new_run_array)) } @@ -55,10 +118,10 @@ pub(crate) fn run_end_encoded_cast( } /// Attempts to cast an array to a RunEndEncoded array with the specified index type K -/// and value type. This function performs run-length encoding on the input array. +/// and value type. This function performs run-end encoding on the input array. /// /// # Arguments -/// * `array` - The input array to be run-length encoded +/// * `array` - The input array to be run-end encoded /// * `value_type` - The target data type for the values in the RunEndEncoded array /// * `cast_options` - Options controlling the casting behavior /// @@ -85,7 +148,7 @@ pub(crate) fn cast_to_run_end_encoded( cast_with_options(array, value_type, cast_options)? }; - // Step 2: Run-length encode the cast array + // Step 2: Run-end encode the cast array // We'll use a builder to construct the RunArray efficiently let mut run_ends_builder = PrimitiveBuilder::::new(); @@ -104,14 +167,11 @@ pub(crate) fn cast_to_run_end_encoded( let mut run_ends_vec = Vec::new(); let mut values_indices = Vec::new(); - let mut current_run_end = 1usize; - // Add the first element as the start of the first run values_indices.push(0); - + // Step 3: Identify runs of consecutive equal values for i in 1..cast_array.len() { // For simplicity, we'll use a basic comparison approach - // In practice, you'd want more sophisticated comparison based on data type let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { (true, true) => true, // Both null (false, false) => { @@ -124,19 +184,24 @@ pub(crate) fn cast_to_run_end_encoded( if !values_equal { // End current run, start new run - run_ends_vec.push(current_run_end); + run_ends_vec.push(i); values_indices.push(i); } - - current_run_end += 1; } // Add the final run end - run_ends_vec.push(current_run_end); + run_ends_vec.push(cast_array.len() as usize); // Step 4: Build the run_ends array - for &run_end in &run_ends_vec { - run_ends_builder.append_value(K::Native::from_usize(run_end).unwrap()); + for run_end in run_ends_vec { + run_ends_builder.append_value(match K::Native::from_usize(run_end) { + Some(value) => value, + None => { + return Err(ArrowError::CastError( + "Run end index out of range".to_string(), + )) + } + }); } let run_ends_array = run_ends_builder.finish(); @@ -146,7 +211,57 @@ pub(crate) fn cast_to_run_end_encoded( ); let values_array = take(&cast_array, &indices, None)?; - // Step 7: Create and return the RunArray + // Step 6: Create and return the RunArray let run_array = RunArray::::try_new(&run_ends_array, values_array.as_ref())?; Ok(Arc::new(run_array)) } + +// There might be a cleaner way to handle this but for now this works +pub(crate) fn can_cast_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool { + match to_type { + DataType::RunEndEncoded(_, _) => { + // Check if from_type supports equality (can be REE-encoded) + match from_type { + // Primitive types - support equality + DataType::Boolean + | DataType::Int8 + | DataType::Int16 + | DataType::Int32 + | DataType::Int64 + | DataType::UInt8 + | DataType::UInt16 + | DataType::UInt32 + | DataType::UInt64 + | DataType::Float32 + | DataType::Float64 => true, + + // String types - support equality + DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => true, + + // Binary types - support equality + DataType::Binary + | DataType::LargeBinary + | DataType::BinaryView + | DataType::FixedSizeBinary(_) => true, + + // Temporal types - support equality + DataType::Date32 + | DataType::Date64 + | DataType::Timestamp(_, _) + | DataType::Time32(_) + | DataType::Time64(_) + | DataType::Duration(_) + | DataType::Interval(_) => true, + + // Decimal types - support equality + DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => true, + + // Already REE-encoded - can be re-encoded + DataType::RunEndEncoded(_, _) => true, + + _ => false, + } + } + _ => false, // Not casting to REE type + } +} From 2642ecd5a2730a97acc7fb321320145ab9ddc1f4 Mon Sep 17 00:00:00 2001 From: Richard Baah Date: Mon, 23 Jun 2025 14:42:24 -0400 Subject: [PATCH 4/4] feat: Add Run-End Encoded array casting with overflow protection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement casting between REE arrays and other Arrow types. REE-to-REE casting validates run-end upcasts only (Int16→Int32, Int16→Int64, Int32→Int64) to prevent invalid sequences. rebased changes --- arrow-cast/src/cast/mod.rs | 111 +++++++++++++++++++++++++++---- arrow-cast/src/cast/run_array.rs | 26 ++++---- 2 files changed, 111 insertions(+), 26 deletions(-) diff --git a/arrow-cast/src/cast/mod.rs b/arrow-cast/src/cast/mod.rs index 6b3d6b88f7f2..a6c4ead9bbf2 100644 --- a/arrow-cast/src/cast/mod.rs +++ b/arrow-cast/src/cast/mod.rs @@ -48,7 +48,7 @@ use crate::cast::dictionary::*; use crate::cast::list::*; use crate::cast::map::*; use crate::cast::run_array::{ - can_cast_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast, + can_cast_to_run_end_encoded, cast_to_run_end_encoded, run_end_encoded_cast, }; use crate::cast::string::*; @@ -142,7 +142,7 @@ pub fn can_cast_types(from_type: &DataType, to_type: &DataType) -> bool { } (Dictionary(_, value_type), _) => can_cast_types(value_type, to_type), (RunEndEncoded(_, value_type), _) => can_cast_types(value_type.data_type(), to_type), - (_, RunEndEncoded(_, _value_type)) => can_cast_run_end_encoded(from_type, to_type), + (_, RunEndEncoded(_, _value_type)) => can_cast_to_run_end_encoded(from_type, to_type), (_, Dictionary(_, value_type)) => can_cast_types(from_type, value_type), (List(list_from) | LargeList(list_from), List(list_to) | LargeList(list_to)) => { can_cast_types(list_from.data_type(), list_to.data_type()) @@ -10716,13 +10716,13 @@ mod tests { )) as ArrayRef; assert_eq!(*fixed_array, *r); } + #[cfg(test)] mod run_end_encoded_tests { use super::*; use arrow_schema::{DataType, Field}; use std::sync::Arc; - /// Test casting FROM RunEndEncoded to primitive types #[test] fn test_run_end_encoded_to_primitive() { // Create a RunEndEncoded array: [1, 1, 2, 2, 2, 3] @@ -10740,10 +10740,8 @@ mod tests { ); } - /// Test casting FROM RunEndEncoded to string #[test] fn test_run_end_encoded_to_string() { - // Create a RunEndEncoded array with Int32 values: [10, 10, 20, 30, 30] let run_ends = Int32Array::from(vec![2, 3, 5]); let values = Int32Array::from(vec![10, 20, 30]); let run_array = RunArray::::try_new(&run_ends, &values).unwrap(); @@ -10760,7 +10758,6 @@ mod tests { assert_eq!(result_array.value(2), "20"); } - /// Test casting TO RunEndEncoded from primitive types #[test] fn test_primitive_to_run_end_encoded() { // Create an Int32 array with repeated values: [1, 1, 2, 2, 2, 3] @@ -10788,7 +10785,94 @@ mod tests { assert_eq!(values_array.values(), &[1, 2, 3]); } - /// Test casting TO RunEndEncoded from string + #[test] + fn test_primitive_to_run_end_encoded_with_nulls() { + let source_array = Int32Array::from(vec![ + Some(1), + Some(1), + None, + None, + Some(2), + Some(2), + Some(3), + Some(3), + None, + None, + Some(4), + Some(4), + Some(5), + Some(5), + None, + None, + ]); + let array_ref = Arc::new(source_array) as ArrayRef; + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int32, false)), + Arc::new(Field::new("values", DataType::Int32, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!( + result_run_array.run_ends().values(), + &[2, 4, 6, 8, 10, 12, 14, 16] + ); + assert_eq!( + result_run_array + .values() + .as_primitive::() + .values(), + &[1, 0, 2, 3, 0, 4, 5, 0] + ); + assert_eq!(result_run_array.values().null_count(), 3); + } + + #[test] + fn test_primitive_to_run_end_encoded_with_nulls_consecutive() { + let source_array = Int64Array::from(vec![ + Some(1), + Some(1), + None, + None, + None, + None, + None, + None, + None, + None, + Some(4), + Some(20), + Some(500), + Some(500), + None, + None, + ]); + let array_ref = Arc::new(source_array) as ArrayRef; + let target_type = DataType::RunEndEncoded( + Arc::new(Field::new("run_ends", DataType::Int16, false)), + Arc::new(Field::new("values", DataType::Int64, true)), + ); + let cast_result = cast(&array_ref, &target_type).unwrap(); + let result_run_array = cast_result + .as_any() + .downcast_ref::>() + .unwrap(); + assert_eq!( + result_run_array.run_ends().values(), + &[2, 10, 11, 12, 14, 16] + ); + assert_eq!( + result_run_array + .values() + .as_primitive::() + .values(), + &[1, 0, 4, 20, 500, 0] + ); + assert_eq!(result_run_array.values().null_count(), 2); + } + #[test] fn test_string_to_run_end_encoded() { // Create a String array with repeated values: ["a", "a", "b", "c", "c"] @@ -10818,7 +10902,6 @@ mod tests { assert_eq!(values_array.value(2), "c"); } - /// Test casting with type conversion (Int32 -> RunEndEncoded) #[test] fn test_cast_with_type_conversion() { // Create an Int32 array: [1, 1, 2, 2, 3] @@ -10851,7 +10934,6 @@ mod tests { assert_eq!(values_array.value(2), "3"); } - /// Test casting empty array to RunEndEncoded #[test] fn test_empty_array_to_run_end_encoded() { // Create an empty Int32 array @@ -10876,7 +10958,6 @@ mod tests { assert_eq!(result_run_array.values().len(), 0); } - /// Test casting RunEndEncoded with nulls #[test] fn test_run_end_encoded_with_nulls() { // Create a RunEndEncoded array with nulls: [1, 1, null, 2, 2] @@ -10895,7 +10976,6 @@ mod tests { assert_eq!(result_run_array.value(4), "2"); } - /// Test different index types (Int16, Int64) #[test] fn test_different_index_types() { // Test with Int16 index type @@ -10917,6 +10997,7 @@ mod tests { let cast_result = cast(&array_ref, &target_type).unwrap(); assert_eq!(cast_result.data_type(), &target_type); } + #[test] fn test_unsupported_cast_to_run_end_encoded() { // Create a Struct array - complex nested type that might not be supported @@ -10935,8 +11016,10 @@ mod tests { // Expect this to fail assert!(cast_result.is_err()); } + #[test] fn test_cast_run_end_encoded_int64_to_int16_should_fail() { + /// Test casting RunEndEncoded to RunEndEncoded should fail use arrow_array::{Int64Array, RunArray, StringArray}; use arrow_schema::{DataType, Field}; use std::sync::Arc; @@ -10973,8 +11056,10 @@ mod tests { } } } + #[test] fn test_cast_run_end_encoded_int16_to_int64_should_succeed() { + /// Test casting RunEndEncoded to RunEndEncoded should succeed use arrow_array::{Int16Array, RunArray, StringArray}; use arrow_schema::{DataType, Field}; use std::sync::Arc; @@ -11023,6 +11108,7 @@ mod tests { #[test] fn test_cast_run_end_encoded_int32_to_int16_should_fail() { + /// Test casting RunEndEncoded to RunEndEncoded should fail use arrow_array::{Int32Array, RunArray, StringArray}; use arrow_schema::{DataType, Field}; use std::sync::Arc; @@ -11031,9 +11117,6 @@ mod tests { let run_ends = Int32Array::from(vec![1000, 50000, 80000]); // values too large for Int16 let values = StringArray::from(vec!["x", "y", "z"]); - println!("Original run_ends null count: {}", run_ends.null_count()); - println!("Original run_ends values: {:?}", run_ends.values()); - let ree_array = RunArray::::try_new(&run_ends, &values).unwrap(); let array_ref = Arc::new(ree_array) as ArrayRef; diff --git a/arrow-cast/src/cast/run_array.rs b/arrow-cast/src/cast/run_array.rs index 3d0c5ec6a4d0..3b82ffd4f5bd 100644 --- a/arrow-cast/src/cast/run_array.rs +++ b/arrow-cast/src/cast/run_array.rs @@ -1,4 +1,5 @@ use crate::cast::*; + /// Attempts to cast a Run-End Encoded array to another type, handling both REE-to-REE /// and REE-to-other type conversions with proper validation and error handling. /// @@ -171,7 +172,7 @@ pub(crate) fn cast_to_run_end_encoded( values_indices.push(0); // Step 3: Identify runs of consecutive equal values for i in 1..cast_array.len() { - // For simplicity, we'll use a basic comparison approach + // We can afford to perform the simple comparison here as we already validated the type in [can_cast_run_end_encoded] let values_equal = match (cast_array.is_null(i), cast_array.is_null(i - 1)) { (true, true) => true, // Both null (false, false) => { @@ -190,18 +191,14 @@ pub(crate) fn cast_to_run_end_encoded( } // Add the final run end - run_ends_vec.push(cast_array.len() as usize); + run_ends_vec.push(cast_array.len()); // Step 4: Build the run_ends array for run_end in run_ends_vec { - run_ends_builder.append_value(match K::Native::from_usize(run_end) { - Some(value) => value, - None => { - return Err(ArrowError::CastError( - "Run end index out of range".to_string(), - )) - } - }); + run_ends_builder.append_value( + K::Native::from_usize(run_end) + .ok_or_else(|| ArrowError::CastError("Run end index out of range".to_string()))?, + ); } let run_ends_array = run_ends_builder.finish(); @@ -216,8 +213,13 @@ pub(crate) fn cast_to_run_end_encoded( Ok(Arc::new(run_array)) } -// There might be a cleaner way to handle this but for now this works -pub(crate) fn can_cast_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool { +/// Checks if a given data type can be cast to a RunEndEncoded array. +/// +/// # Arguments +/// * `from_type` - The source data type to be checked +/// * `to_type` - The target data type to be checked +/// +pub(crate) fn can_cast_to_run_end_encoded(from_type: &DataType, to_type: &DataType) -> bool { match to_type { DataType::RunEndEncoded(_, _) => { // Check if from_type supports equality (can be REE-encoded)