From 48a22b5ee949e97a29fcecdbe0d6ee98e6ca90f2 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 9 Dec 2025 12:50:31 -0600 Subject: [PATCH 1/7] add specialized InList implementations for common scalar types --- .../physical-expr/src/expressions/in_list.rs | 1175 +++++++++++++++-- 1 file changed, 1087 insertions(+), 88 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index fba8f4fbe4d9..22f0d18deb1d 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -26,7 +26,7 @@ use crate::PhysicalExpr; use crate::physical_expr::physical_exprs_bag_equal; use arrow::array::*; -use arrow::buffer::BooleanBuffer; +use arrow::buffer::{BooleanBuffer, NullBuffer}; use arrow::compute::kernels::boolean::{not, or_kleene}; use arrow::compute::{SortOptions, take}; use arrow::datatypes::*; @@ -91,7 +91,12 @@ impl StaticFilter for ArrayStaticFilter { if v.data_type() == &DataType::Null || self.in_array.data_type() == &DataType::Null { - return Ok(BooleanArray::from(vec![None; v.len()])); + // return Ok(BooleanArray::new(vec![None; v.len()])); + let nulls = NullBuffer::new_null(v.len()); + return Ok(BooleanArray::new( + BooleanBuffer::new_unset(v.len()), + Some(nulls), + )); } downcast_dictionary_array! { @@ -138,9 +143,17 @@ fn instantiate_static_filter( in_array: ArrayRef, ) -> Result> { match in_array.data_type() { + // Integer primitive types + DataType::Int8 => Ok(Arc::new(Int8StaticFilter::try_new(&in_array)?)), + DataType::Int16 => Ok(Arc::new(Int16StaticFilter::try_new(&in_array)?)), DataType::Int32 => Ok(Arc::new(Int32StaticFilter::try_new(&in_array)?)), + DataType::Int64 => Ok(Arc::new(Int64StaticFilter::try_new(&in_array)?)), + DataType::UInt8 => Ok(Arc::new(UInt8StaticFilter::try_new(&in_array)?)), + DataType::UInt16 => Ok(Arc::new(UInt16StaticFilter::try_new(&in_array)?)), + DataType::UInt32 => Ok(Arc::new(UInt32StaticFilter::try_new(&in_array)?)), + DataType::UInt64 => Ok(Arc::new(UInt64StaticFilter::try_new(&in_array)?)), _ => { - /* fall through to generic implementation */ + /* fall through to generic implementation for unsupported types (Float32/Float64, Struct, etc.) */ Ok(Arc::new(ArrayStaticFilter::try_new(in_array)?)) } } @@ -198,99 +211,127 @@ impl ArrayStaticFilter { } } -struct Int32StaticFilter { - null_count: usize, - values: HashSet, -} - -impl Int32StaticFilter { - fn try_new(in_array: &ArrayRef) -> Result { - let in_array = in_array - .as_primitive_opt::() - .ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?; - - let mut values = HashSet::with_capacity(in_array.len()); - let null_count = in_array.null_count(); - - for v in in_array.iter().flatten() { - values.insert(v); +// Macro to generate specialized StaticFilter implementations for primitive types +macro_rules! primitive_static_filter { + ($Name:ident, $ArrowType:ty) => { + struct $Name { + null_count: usize, + values: HashSet<<$ArrowType as ArrowPrimitiveType>::Native>, } - Ok(Self { null_count, values }) - } -} + impl $Name { + fn try_new(in_array: &ArrayRef) -> Result { + let in_array = in_array + .as_primitive_opt::<$ArrowType>() + .ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?; -impl StaticFilter for Int32StaticFilter { - fn null_count(&self) -> usize { - self.null_count - } + let mut values = HashSet::with_capacity(in_array.len()); + let null_count = in_array.null_count(); - fn contains(&self, v: &dyn Array, negated: bool) -> Result { - // Handle dictionary arrays by recursing on the values - downcast_dictionary_array! { - v => { - let values_contains = self.contains(v.values().as_ref(), negated)?; - let result = take(&values_contains, v.keys(), None)?; - return Ok(downcast_array(result.as_ref())) + for v in in_array.iter().flatten() { + values.insert(v); + } + + Ok(Self { null_count, values }) } - _ => {} } - let v = v - .as_primitive_opt::() - .ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?; - - let haystack_has_nulls = self.null_count > 0; - let has_nulls = v.null_count() > 0 || haystack_has_nulls; - - let result = match (has_nulls, negated) { - (true, false) => { - // needle has nulls, not negated - BooleanArray::from_iter(v.iter().map(|value| match value { - None => None, - Some(v) => { - if self.values.contains(&v) { - Some(true) - } else if haystack_has_nulls { - None - } else { - Some(false) + impl StaticFilter for $Name { + fn null_count(&self) -> usize { + self.null_count + } + + fn contains(&self, v: &dyn Array, negated: bool) -> Result { + // Handle dictionary arrays by recursing on the values + downcast_dictionary_array! { + v => { + let values_contains = self.contains(v.values().as_ref(), negated)?; + let result = take(&values_contains, v.keys(), None)?; + return Ok(downcast_array(result.as_ref())) + } + _ => {} + } + + let v = v + .as_primitive_opt::<$ArrowType>() + .ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?; + + let haystack_has_nulls = self.null_count > 0; + + let result = match (v.null_count() > 0, haystack_has_nulls, negated) { + (true, _, false) | (false, true, false) => { + // Either needle or haystack has nulls, not negated + BooleanArray::from_iter(v.iter().map(|value| { + match value { + // SQL three-valued logic: null IN (...) is always null + None => None, + Some(v) => { + if self.values.contains(&v) { + Some(true) + } else if haystack_has_nulls { + // value not in set, but set has nulls -> null + None + } else { + Some(false) + } + } + } + })) + } + (true, _, true) | (false, true, true) => { + // Either needle or haystack has nulls, negated + BooleanArray::from_iter(v.iter().map(|value| { + match value { + // SQL three-valued logic: null NOT IN (...) is always null + None => None, + Some(v) => { + if self.values.contains(&v) { + Some(false) + } else if haystack_has_nulls { + // value not in set, but set has nulls -> null + None + } else { + Some(true) + } + } + } + })) + } + (false, false, false) => { + // no nulls anywhere, not negated + let values = v.values(); + let mut builder = BooleanBufferBuilder::new(values.len()); + for value in values.iter() { + builder.append(self.values.contains(value)); } + BooleanArray::new(builder.finish(), None) } - })) - } - (true, true) => { - // needle has nulls, negated - BooleanArray::from_iter(v.iter().map(|value| match value { - None => None, - Some(v) => { - if self.values.contains(&v) { - Some(false) - } else if haystack_has_nulls { - None - } else { - Some(true) + (false, false, true) => { + let values = v.values(); + let mut builder = BooleanBufferBuilder::new(values.len()); + for value in values.iter() { + builder.append(!self.values.contains(value)); } + BooleanArray::new(builder.finish(), None) } - })) - } - (false, false) => { - // No nulls anywhere, not negated - BooleanArray::from_iter( - v.values().iter().map(|value| self.values.contains(value)), - ) + }; + Ok(result) } - (false, true) => { - // No nulls anywhere, negated - BooleanArray::from_iter( - v.values().iter().map(|value| !self.values.contains(value)), - ) - } - }; - Ok(result) - } + } + }; } +// Generate specialized filters for all integer primitive types +// Note: Float32 and Float64 are excluded because they don't implement Hash/Eq due to NaN +primitive_static_filter!(Int8StaticFilter, Int8Type); +primitive_static_filter!(Int16StaticFilter, Int16Type); +primitive_static_filter!(Int32StaticFilter, Int32Type); +primitive_static_filter!(Int64StaticFilter, Int64Type); +primitive_static_filter!(UInt8StaticFilter, UInt8Type); +primitive_static_filter!(UInt16StaticFilter, UInt16Type); +primitive_static_filter!(UInt32StaticFilter, UInt32Type); +primitive_static_filter!(UInt64StaticFilter, UInt64Type); + /// Evaluates the list of expressions into an array, flattening any dictionaries fn evaluate_list( list: &[Arc], @@ -500,8 +541,12 @@ impl PhysicalExpr for InListExpr { if scalar.is_null() { // SQL three-valued logic: null IN (...) is always null // The code below would handle this correctly but this is a faster path + let nulls = NullBuffer::new_null(num_rows); return Ok(ColumnarValue::Array(Arc::new( - BooleanArray::from(vec![None; num_rows]), + BooleanArray::new( + BooleanBuffer::new_unset(num_rows), + Some(nulls), + ), ))); } // Use a 1 row array to avoid code duplication/branching @@ -512,12 +557,15 @@ impl PhysicalExpr for InListExpr { // Broadcast the single result to all rows // Must check is_null() to preserve NULL values (SQL three-valued logic) if result_array.is_null(0) { - BooleanArray::from(vec![None; num_rows]) + let nulls = NullBuffer::new_null(num_rows); + BooleanArray::new( + BooleanBuffer::new_unset(num_rows), + Some(nulls), + ) + } else if result_array.value(0) { + BooleanArray::new(BooleanBuffer::new_set(num_rows), None) } else { - BooleanArray::from_iter(std::iter::repeat_n( - result_array.value(0), - num_rows, - )) + BooleanArray::new(BooleanBuffer::new_unset(num_rows), None) } } } @@ -1257,6 +1305,957 @@ mod tests { Ok(()) } + #[test] + fn in_list_int8() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Int8, true)]); + let a = Int8Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![lit(0i8), lit(1i8)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![lit(0i8), lit(1i8)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![lit(0i8), lit(1i8), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![lit(0i8), lit(1i8), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_int16() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Int16, true)]); + let a = Int16Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![lit(0i16), lit(1i16)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![lit(0i16), lit(1i16)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![lit(0i16), lit(1i16), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![lit(0i16), lit(1i16), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_int32() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); + let a = Int32Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![lit(0i32), lit(1i32)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![lit(0i32), lit(1i32)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![lit(0i32), lit(1i32), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![lit(0i32), lit(1i32), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_uint8() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::UInt8, true)]); + let a = UInt8Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![lit(0u8), lit(1u8)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![lit(0u8), lit(1u8)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![lit(0u8), lit(1u8), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![lit(0u8), lit(1u8), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_uint16() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::UInt16, true)]); + let a = UInt16Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![lit(0u16), lit(1u16)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![lit(0u16), lit(1u16)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![lit(0u16), lit(1u16), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![lit(0u16), lit(1u16), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_uint32() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::UInt32, true)]); + let a = UInt32Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![lit(0u32), lit(1u32)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![lit(0u32), lit(1u32)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![lit(0u32), lit(1u32), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![lit(0u32), lit(1u32), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_uint64() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::UInt64, true)]); + let a = UInt64Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![lit(0u64), lit(1u64)]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![lit(0u64), lit(1u64)]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![lit(0u64), lit(1u64), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![lit(0u64), lit(1u64), lit(ScalarValue::Null)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_large_utf8() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::LargeUtf8, true)]); + let a = LargeStringArray::from(vec![Some("a"), Some("d"), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in ("a", "b")" + let list = vec![lit("a"), lit("b")]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in ("a", "b")" + let list = vec![lit("a"), lit("b")]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in ("a", "b", null)" + let list = vec![lit("a"), lit("b"), lit(ScalarValue::LargeUtf8(None))]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in ("a", "b", null)" + let list = vec![lit("a"), lit("b"), lit(ScalarValue::LargeUtf8(None))]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_utf8_view() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Utf8View, true)]); + let a = StringViewArray::from(vec![Some("a"), Some("d"), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in ("a", "b")" + let list = vec![lit("a"), lit("b")]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in ("a", "b")" + let list = vec![lit("a"), lit("b")]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in ("a", "b", null)" + let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8View(None))]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in ("a", "b", null)" + let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8View(None))]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_large_binary() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::LargeBinary, true)]); + let a = LargeBinaryArray::from(vec![ + Some([1, 2, 3].as_slice()), + Some([1, 2, 2].as_slice()), + None, + ]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in ([1, 2, 3], [4, 5, 6])" + let list = vec![lit([1, 2, 3].as_slice()), lit([4, 5, 6].as_slice())]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in ([1, 2, 3], [4, 5, 6])" + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in ([1, 2, 3], [4, 5, 6], null)" + let list = vec![ + lit([1, 2, 3].as_slice()), + lit([4, 5, 6].as_slice()), + lit(ScalarValue::LargeBinary(None)), + ]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in ([1, 2, 3], [4, 5, 6], null)" + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_binary_view() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::BinaryView, true)]); + let a = BinaryViewArray::from(vec![ + Some([1, 2, 3].as_slice()), + Some([1, 2, 2].as_slice()), + None, + ]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in ([1, 2, 3], [4, 5, 6])" + let list = vec![lit([1, 2, 3].as_slice()), lit([4, 5, 6].as_slice())]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in ([1, 2, 3], [4, 5, 6])" + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in ([1, 2, 3], [4, 5, 6], null)" + let list = vec![ + lit([1, 2, 3].as_slice()), + lit([4, 5, 6].as_slice()), + lit(ScalarValue::BinaryView(None)), + ]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in ([1, 2, 3], [4, 5, 6], null)" + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_date64() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Date64, true)]); + let a = Date64Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![ + lit(ScalarValue::Date64(Some(0))), + lit(ScalarValue::Date64(Some(1))), + ]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![ + lit(ScalarValue::Date64(Some(0))), + lit(ScalarValue::Date64(Some(1))), + ]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![ + lit(ScalarValue::Date64(Some(0))), + lit(ScalarValue::Date64(Some(1))), + lit(ScalarValue::Null), + ]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![ + lit(ScalarValue::Date64(Some(0))), + lit(ScalarValue::Date64(Some(1))), + lit(ScalarValue::Null), + ]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_date32() -> Result<()> { + let schema = Schema::new(vec![Field::new("a", DataType::Date32, true)]); + let a = Date32Array::from(vec![Some(0), Some(2), None]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + // expression: "a in (0, 1)" + let list = vec![ + lit(ScalarValue::Date32(Some(0))), + lit(ScalarValue::Date32(Some(1))), + ]; + in_list!( + batch, + list, + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1)" + let list = vec![ + lit(ScalarValue::Date32(Some(0))), + lit(ScalarValue::Date32(Some(1))), + ]; + in_list!( + batch, + list, + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (0, 1, NULL)" + let list = vec![ + lit(ScalarValue::Date32(Some(0))), + lit(ScalarValue::Date32(Some(1))), + lit(ScalarValue::Null), + ]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (0, 1, NULL)" + let list = vec![ + lit(ScalarValue::Date32(Some(0))), + lit(ScalarValue::Date32(Some(1))), + lit(ScalarValue::Null), + ]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_decimal_type_coercion() -> Result<()> { + // Now, we can check the NULL type + let schema = + Schema::new(vec![Field::new("a", DataType::Decimal128(13, 4), true)]); + let array = vec![Some(100_0000_i128), None, Some(200_5000_i128)] + .into_iter() + .collect::(); + let array = array.with_precision_and_scale(13, 4).unwrap(); + let col_a = col("a", &schema)?; + let batch = + RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)])?; + + // expression: "a in (100,200), the data type of list is INT32 + let list = vec![lit(100i32), lit(200i32)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, Some(false)], + Arc::clone(&col_a), + &schema + ); + // expression: "a not in (100,200) + let list = vec![lit(100i32), lit(200i32)]; + in_list!( + batch, + list, + &true, + vec![Some(false), None, Some(true)], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (200,NULL), the data type of list is INT32 AND NULL + let list = vec![lit(ScalarValue::Int32(Some(100))), lit(ScalarValue::Null)]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), None, None], + Arc::clone(&col_a), + &schema + ); + // expression: "a not in (200,NULL), the data type of list is INT32 AND NULL + in_list!( + batch, + list, + &true, + vec![Some(false), None, None], + Arc::clone(&col_a), + &schema + ); + + // expression: "a in (200.5, 100), the data type of list is FLOAT32 and INT32 + let list = vec![lit(200.50f32), lit(100i32)]; + in_list!( + batch, + list, + &false, + vec![Some(true), None, Some(true)], + Arc::clone(&col_a), + &schema + ); + + // expression: "a not in (200.5, 100), the data type of list is FLOAT32 and INT32 + let list = vec![lit(200.50f32), lit(101i32)]; + in_list!( + batch, + list, + &true, + vec![Some(true), None, Some(false)], + Arc::clone(&col_a), + &schema + ); + + // test the optimization: set + // expression: "a in (99..300), the data type of list is INT32 + let list = (99i32..300).map(lit).collect::>(); + + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), None, Some(false)], + Arc::clone(&col_a), + &schema + ); + + in_list!( + batch, + list, + &true, + vec![Some(false), None, Some(true)], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + + #[test] + fn in_list_timestamp() -> Result<()> { + let schema = Schema::new(vec![Field::new( + "a", + DataType::Timestamp(TimeUnit::Microsecond, None), + true, + )]); + let a = TimestampMicrosecondArray::from(vec![ + Some(1388588401000000000), + Some(1288588501000000000), + None, + ]); + let col_a = col("a", &schema)?; + let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; + + let list = vec![ + lit(ScalarValue::TimestampMicrosecond( + Some(1388588401000000000), + None, + )), + lit(ScalarValue::TimestampMicrosecond( + Some(1388588401000000001), + None, + )), + lit(ScalarValue::TimestampMicrosecond( + Some(1388588401000000002), + None, + )), + ]; + + in_list!( + batch, + list.clone(), + &false, + vec![Some(true), Some(false), None], + Arc::clone(&col_a), + &schema + ); + + in_list!( + batch, + list.clone(), + &true, + vec![Some(false), Some(true), None], + Arc::clone(&col_a), + &schema + ); + Ok(()) + } + + #[test] + fn in_expr_with_multiple_element_in_list() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Float64, true), + Field::new("b", DataType::Float64, true), + Field::new("c", DataType::Float64, true), + ]); + let a = Float64Array::from(vec![ + Some(0.0), + Some(1.0), + Some(2.0), + Some(f64::NAN), + Some(-f64::NAN), + ]); + let b = Float64Array::from(vec![ + Some(8.0), + Some(1.0), + Some(5.0), + Some(f64::NAN), + Some(3.0), + ]); + let c = Float64Array::from(vec![ + Some(6.0), + Some(7.0), + None, + Some(5.0), + Some(-f64::NAN), + ]); + let col_a = col("a", &schema)?; + let col_b = col("b", &schema)?; + let col_c = col("c", &schema)?; + let batch = RecordBatch::try_new( + Arc::new(schema.clone()), + vec![Arc::new(a), Arc::new(b), Arc::new(c)], + )?; + + let list = vec![Arc::clone(&col_b), Arc::clone(&col_c)]; + in_list!( + batch, + list.clone(), + &false, + vec![Some(false), Some(true), None, Some(true), Some(true)], + Arc::clone(&col_a), + &schema + ); + + in_list!( + batch, + list, + &true, + vec![Some(true), Some(false), None, Some(false), Some(false)], + Arc::clone(&col_a), + &schema + ); + + Ok(()) + } + macro_rules! test_nullable { ($COL:expr, $LIST:expr, $SCHEMA:expr, $EXPECTED:expr) => {{ let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?; From d11d7ae6d79acc1cc4e815fd01e37bce6afae0ad Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Tue, 9 Dec 2025 15:55:57 -0600 Subject: [PATCH 2/7] remove test diff --- .../physical-expr/src/expressions/in_list.rs | 951 ------------------ 1 file changed, 951 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 22f0d18deb1d..fa9f1d6de031 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -1305,957 +1305,6 @@ mod tests { Ok(()) } - #[test] - fn in_list_int8() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Int8, true)]); - let a = Int8Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![lit(0i8), lit(1i8)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![lit(0i8), lit(1i8)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![lit(0i8), lit(1i8), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![lit(0i8), lit(1i8), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_int16() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Int16, true)]); - let a = Int16Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![lit(0i16), lit(1i16)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![lit(0i16), lit(1i16)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![lit(0i16), lit(1i16), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![lit(0i16), lit(1i16), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_int32() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]); - let a = Int32Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![lit(0i32), lit(1i32)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![lit(0i32), lit(1i32)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![lit(0i32), lit(1i32), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![lit(0i32), lit(1i32), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_uint8() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::UInt8, true)]); - let a = UInt8Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![lit(0u8), lit(1u8)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![lit(0u8), lit(1u8)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![lit(0u8), lit(1u8), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![lit(0u8), lit(1u8), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_uint16() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::UInt16, true)]); - let a = UInt16Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![lit(0u16), lit(1u16)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![lit(0u16), lit(1u16)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![lit(0u16), lit(1u16), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![lit(0u16), lit(1u16), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_uint32() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::UInt32, true)]); - let a = UInt32Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![lit(0u32), lit(1u32)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![lit(0u32), lit(1u32)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![lit(0u32), lit(1u32), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![lit(0u32), lit(1u32), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_uint64() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::UInt64, true)]); - let a = UInt64Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![lit(0u64), lit(1u64)]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![lit(0u64), lit(1u64)]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![lit(0u64), lit(1u64), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![lit(0u64), lit(1u64), lit(ScalarValue::Null)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_large_utf8() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::LargeUtf8, true)]); - let a = LargeStringArray::from(vec![Some("a"), Some("d"), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in ("a", "b")" - let list = vec![lit("a"), lit("b")]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in ("a", "b")" - let list = vec![lit("a"), lit("b")]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in ("a", "b", null)" - let list = vec![lit("a"), lit("b"), lit(ScalarValue::LargeUtf8(None))]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in ("a", "b", null)" - let list = vec![lit("a"), lit("b"), lit(ScalarValue::LargeUtf8(None))]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_utf8_view() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Utf8View, true)]); - let a = StringViewArray::from(vec![Some("a"), Some("d"), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in ("a", "b")" - let list = vec![lit("a"), lit("b")]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in ("a", "b")" - let list = vec![lit("a"), lit("b")]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in ("a", "b", null)" - let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8View(None))]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in ("a", "b", null)" - let list = vec![lit("a"), lit("b"), lit(ScalarValue::Utf8View(None))]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_large_binary() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::LargeBinary, true)]); - let a = LargeBinaryArray::from(vec![ - Some([1, 2, 3].as_slice()), - Some([1, 2, 2].as_slice()), - None, - ]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in ([1, 2, 3], [4, 5, 6])" - let list = vec![lit([1, 2, 3].as_slice()), lit([4, 5, 6].as_slice())]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in ([1, 2, 3], [4, 5, 6])" - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in ([1, 2, 3], [4, 5, 6], null)" - let list = vec![ - lit([1, 2, 3].as_slice()), - lit([4, 5, 6].as_slice()), - lit(ScalarValue::LargeBinary(None)), - ]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in ([1, 2, 3], [4, 5, 6], null)" - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_binary_view() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::BinaryView, true)]); - let a = BinaryViewArray::from(vec![ - Some([1, 2, 3].as_slice()), - Some([1, 2, 2].as_slice()), - None, - ]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in ([1, 2, 3], [4, 5, 6])" - let list = vec![lit([1, 2, 3].as_slice()), lit([4, 5, 6].as_slice())]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in ([1, 2, 3], [4, 5, 6])" - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in ([1, 2, 3], [4, 5, 6], null)" - let list = vec![ - lit([1, 2, 3].as_slice()), - lit([4, 5, 6].as_slice()), - lit(ScalarValue::BinaryView(None)), - ]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in ([1, 2, 3], [4, 5, 6], null)" - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_date64() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Date64, true)]); - let a = Date64Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![ - lit(ScalarValue::Date64(Some(0))), - lit(ScalarValue::Date64(Some(1))), - ]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![ - lit(ScalarValue::Date64(Some(0))), - lit(ScalarValue::Date64(Some(1))), - ]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![ - lit(ScalarValue::Date64(Some(0))), - lit(ScalarValue::Date64(Some(1))), - lit(ScalarValue::Null), - ]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![ - lit(ScalarValue::Date64(Some(0))), - lit(ScalarValue::Date64(Some(1))), - lit(ScalarValue::Null), - ]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_date32() -> Result<()> { - let schema = Schema::new(vec![Field::new("a", DataType::Date32, true)]); - let a = Date32Array::from(vec![Some(0), Some(2), None]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - // expression: "a in (0, 1)" - let list = vec![ - lit(ScalarValue::Date32(Some(0))), - lit(ScalarValue::Date32(Some(1))), - ]; - in_list!( - batch, - list, - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1)" - let list = vec![ - lit(ScalarValue::Date32(Some(0))), - lit(ScalarValue::Date32(Some(1))), - ]; - in_list!( - batch, - list, - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (0, 1, NULL)" - let list = vec![ - lit(ScalarValue::Date32(Some(0))), - lit(ScalarValue::Date32(Some(1))), - lit(ScalarValue::Null), - ]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (0, 1, NULL)" - let list = vec![ - lit(ScalarValue::Date32(Some(0))), - lit(ScalarValue::Date32(Some(1))), - lit(ScalarValue::Null), - ]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_decimal_type_coercion() -> Result<()> { - // Now, we can check the NULL type - let schema = - Schema::new(vec![Field::new("a", DataType::Decimal128(13, 4), true)]); - let array = vec![Some(100_0000_i128), None, Some(200_5000_i128)] - .into_iter() - .collect::(); - let array = array.with_precision_and_scale(13, 4).unwrap(); - let col_a = col("a", &schema)?; - let batch = - RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(array)])?; - - // expression: "a in (100,200), the data type of list is INT32 - let list = vec![lit(100i32), lit(200i32)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, Some(false)], - Arc::clone(&col_a), - &schema - ); - // expression: "a not in (100,200) - let list = vec![lit(100i32), lit(200i32)]; - in_list!( - batch, - list, - &true, - vec![Some(false), None, Some(true)], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (200,NULL), the data type of list is INT32 AND NULL - let list = vec![lit(ScalarValue::Int32(Some(100))), lit(ScalarValue::Null)]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), None, None], - Arc::clone(&col_a), - &schema - ); - // expression: "a not in (200,NULL), the data type of list is INT32 AND NULL - in_list!( - batch, - list, - &true, - vec![Some(false), None, None], - Arc::clone(&col_a), - &schema - ); - - // expression: "a in (200.5, 100), the data type of list is FLOAT32 and INT32 - let list = vec![lit(200.50f32), lit(100i32)]; - in_list!( - batch, - list, - &false, - vec![Some(true), None, Some(true)], - Arc::clone(&col_a), - &schema - ); - - // expression: "a not in (200.5, 100), the data type of list is FLOAT32 and INT32 - let list = vec![lit(200.50f32), lit(101i32)]; - in_list!( - batch, - list, - &true, - vec![Some(true), None, Some(false)], - Arc::clone(&col_a), - &schema - ); - - // test the optimization: set - // expression: "a in (99..300), the data type of list is INT32 - let list = (99i32..300).map(lit).collect::>(); - - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), None, Some(false)], - Arc::clone(&col_a), - &schema - ); - - in_list!( - batch, - list, - &true, - vec![Some(false), None, Some(true)], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - - #[test] - fn in_list_timestamp() -> Result<()> { - let schema = Schema::new(vec![Field::new( - "a", - DataType::Timestamp(TimeUnit::Microsecond, None), - true, - )]); - let a = TimestampMicrosecondArray::from(vec![ - Some(1388588401000000000), - Some(1288588501000000000), - None, - ]); - let col_a = col("a", &schema)?; - let batch = RecordBatch::try_new(Arc::new(schema.clone()), vec![Arc::new(a)])?; - - let list = vec![ - lit(ScalarValue::TimestampMicrosecond( - Some(1388588401000000000), - None, - )), - lit(ScalarValue::TimestampMicrosecond( - Some(1388588401000000001), - None, - )), - lit(ScalarValue::TimestampMicrosecond( - Some(1388588401000000002), - None, - )), - ]; - - in_list!( - batch, - list.clone(), - &false, - vec![Some(true), Some(false), None], - Arc::clone(&col_a), - &schema - ); - - in_list!( - batch, - list.clone(), - &true, - vec![Some(false), Some(true), None], - Arc::clone(&col_a), - &schema - ); - Ok(()) - } - - #[test] - fn in_expr_with_multiple_element_in_list() -> Result<()> { - let schema = Schema::new(vec![ - Field::new("a", DataType::Float64, true), - Field::new("b", DataType::Float64, true), - Field::new("c", DataType::Float64, true), - ]); - let a = Float64Array::from(vec![ - Some(0.0), - Some(1.0), - Some(2.0), - Some(f64::NAN), - Some(-f64::NAN), - ]); - let b = Float64Array::from(vec![ - Some(8.0), - Some(1.0), - Some(5.0), - Some(f64::NAN), - Some(3.0), - ]); - let c = Float64Array::from(vec![ - Some(6.0), - Some(7.0), - None, - Some(5.0), - Some(-f64::NAN), - ]); - let col_a = col("a", &schema)?; - let col_b = col("b", &schema)?; - let col_c = col("c", &schema)?; - let batch = RecordBatch::try_new( - Arc::new(schema.clone()), - vec![Arc::new(a), Arc::new(b), Arc::new(c)], - )?; - - let list = vec![Arc::clone(&col_b), Arc::clone(&col_c)]; - in_list!( - batch, - list.clone(), - &false, - vec![Some(false), Some(true), None, Some(true), Some(true)], - Arc::clone(&col_a), - &schema - ); - - in_list!( - batch, - list, - &true, - vec![Some(true), Some(false), None, Some(false), Some(false)], - Arc::clone(&col_a), - &schema - ); - - Ok(()) - } - macro_rules! test_nullable { ($COL:expr, $LIST:expr, $SCHEMA:expr, $EXPECTED:expr) => {{ let (cast_expr, cast_list_exprs) = in_list_cast($COL, $LIST, $SCHEMA)?; From c1a29a251c7c7710cb45650e409160437ecc3e7f Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 10 Dec 2025 08:46:47 -0600 Subject: [PATCH 3/7] try to optimize --- .../physical-expr/src/expressions/in_list.rs | 119 ++++++++++-------- 1 file changed, 66 insertions(+), 53 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index fa9f1d6de031..362ee59dd09f 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -258,64 +258,77 @@ macro_rules! primitive_static_filter { let haystack_has_nulls = self.null_count > 0; - let result = match (v.null_count() > 0, haystack_has_nulls, negated) { - (true, _, false) | (false, true, false) => { - // Either needle or haystack has nulls, not negated - BooleanArray::from_iter(v.iter().map(|value| { - match value { - // SQL three-valued logic: null IN (...) is always null - None => None, - Some(v) => { - if self.values.contains(&v) { - Some(true) - } else if haystack_has_nulls { - // value not in set, but set has nulls -> null - None - } else { - Some(false) - } - } - } - })) + let needle_values = v.values(); + let needle_nulls = v.nulls(); + let needle_has_nulls = v.null_count() > 0; + + // Compute the "contains" result using collect_bool (fast batched approach) + // This ignores nulls - we handle them separately + let contains_buffer = if negated { + BooleanBuffer::collect_bool(needle_values.len(), |i| { + !self.values.contains(&needle_values[i]) + }) + } else { + BooleanBuffer::collect_bool(needle_values.len(), |i| { + self.values.contains(&needle_values[i]) + }) + }; + + // Compute the null mask + // Output is null when: + // 1. needle value is null, OR + // 2. needle value is not in set AND haystack has nulls + let result_nulls = match (needle_has_nulls, haystack_has_nulls) { + (false, false) => { + // No nulls anywhere + None } - (true, _, true) | (false, true, true) => { - // Either needle or haystack has nulls, negated - BooleanArray::from_iter(v.iter().map(|value| { - match value { - // SQL three-valued logic: null NOT IN (...) is always null - None => None, - Some(v) => { - if self.values.contains(&v) { - Some(false) - } else if haystack_has_nulls { - // value not in set, but set has nulls -> null - None - } else { - Some(true) - } - } - } - })) + (true, false) => { + // Only needle has nulls - just use needle's null mask + needle_nulls.cloned() } - (false, false, false) => { - // no nulls anywhere, not negated - let values = v.values(); - let mut builder = BooleanBufferBuilder::new(values.len()); - for value in values.iter() { - builder.append(self.values.contains(value)); - } - BooleanArray::new(builder.finish(), None) + (false, true) => { + // Only haystack has nulls - null where not-in-set + // For IN: null where contains is false + // For NOT IN: null where contains is true (before negation, i.e., where original contains was false) + // Since we already negated contains_buffer for NOT IN, we need to handle this: + // - IN (negated=false): null where !contains_buffer + // - NOT IN (negated=true): null where contains_buffer (which is !original_contains) + // Actually both cases: null where the "not found" condition is true + // For IN: not found = !contains_buffer + // For NOT IN: not found = contains_buffer (since contains_buffer = !original_contains) + // So the validity mask (valid = not null) is: + // - IN: contains_buffer (found = valid) + // - NOT IN: !contains_buffer (found in original = valid, but contains_buffer is negated) + let validity = if negated { + // For NOT IN: we want valid where original contains was true + // contains_buffer = !original_contains, so validity = !contains_buffer + !&contains_buffer + } else { + // For IN: valid where contains is true + contains_buffer.clone() + }; + Some(NullBuffer::new(validity)) } - (false, false, true) => { - let values = v.values(); - let mut builder = BooleanBufferBuilder::new(values.len()); - for value in values.iter() { - builder.append(!self.values.contains(value)); - } - BooleanArray::new(builder.finish(), None) + (true, true) => { + // Both have nulls - combine needle nulls with haystack-induced nulls + let needle_validity = needle_nulls.map(|n| n.inner().clone()) + .unwrap_or_else(|| BooleanBuffer::new_set(needle_values.len())); + + // Haystack-induced validity (same logic as above) + let haystack_validity = if negated { + !&contains_buffer + } else { + contains_buffer.clone() + }; + + // Combined validity: valid only where both are valid + let combined_validity = &needle_validity & &haystack_validity; + Some(NullBuffer::new(combined_validity)) } }; - Ok(result) + + Ok(BooleanArray::new(contains_buffer, result_nulls)) } } }; From a430ef82c3fd5ca97dab917659f43d7e663b6fa6 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 10 Dec 2025 11:56:04 -0600 Subject: [PATCH 4/7] remove comment --- datafusion/physical-expr/src/expressions/in_list.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 362ee59dd09f..2b9207c8693d 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -91,7 +91,6 @@ impl StaticFilter for ArrayStaticFilter { if v.data_type() == &DataType::Null || self.in_array.data_type() == &DataType::Null { - // return Ok(BooleanArray::new(vec![None; v.len()])); let nulls = NullBuffer::new_null(v.len()); return Ok(BooleanArray::new( BooleanBuffer::new_unset(v.len()), From 2f2c60da1b59f314ebb18690f0003220c0ec10b9 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Wed, 10 Dec 2025 12:27:08 -0600 Subject: [PATCH 5/7] add float implementations --- .../physical-expr/src/expressions/in_list.rs | 169 +++++++++++++++++- 1 file changed, 167 insertions(+), 2 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 2b9207c8693d..d3c62718d700 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -151,8 +151,11 @@ fn instantiate_static_filter( DataType::UInt16 => Ok(Arc::new(UInt16StaticFilter::try_new(&in_array)?)), DataType::UInt32 => Ok(Arc::new(UInt32StaticFilter::try_new(&in_array)?)), DataType::UInt64 => Ok(Arc::new(UInt64StaticFilter::try_new(&in_array)?)), + // Float primitive types (use ordered wrappers for Hash/Eq) + DataType::Float32 => Ok(Arc::new(Float32StaticFilter::try_new(&in_array)?)), + DataType::Float64 => Ok(Arc::new(Float64StaticFilter::try_new(&in_array)?)), _ => { - /* fall through to generic implementation for unsupported types (Float32/Float64, Struct, etc.) */ + /* fall through to generic implementation for unsupported types (Struct, etc.) */ Ok(Arc::new(ArrayStaticFilter::try_new(in_array)?)) } } @@ -210,6 +213,56 @@ impl ArrayStaticFilter { } } +/// Wrapper for f32 that implements Hash and Eq using IEEE 754 total ordering. +/// This treats NaN values as equal to each other (using total_cmp). +#[derive(Clone, Copy)] +struct OrderedFloat32(f32); + +impl Hash for OrderedFloat32 { + fn hash(&self, state: &mut H) { + self.0.to_ne_bytes().hash(state); + } +} + +impl PartialEq for OrderedFloat32 { + fn eq(&self, other: &Self) -> bool { + self.0.total_cmp(&other.0).is_eq() + } +} + +impl Eq for OrderedFloat32 {} + +impl From for OrderedFloat32 { + fn from(v: f32) -> Self { + Self(v) + } +} + +/// Wrapper for f64 that implements Hash and Eq using IEEE 754 total ordering. +/// This treats NaN values as equal to each other (using total_cmp). +#[derive(Clone, Copy)] +struct OrderedFloat64(f64); + +impl Hash for OrderedFloat64 { + fn hash(&self, state: &mut H) { + self.0.to_ne_bytes().hash(state); + } +} + +impl PartialEq for OrderedFloat64 { + fn eq(&self, other: &Self) -> bool { + self.0.total_cmp(&other.0).is_eq() + } +} + +impl Eq for OrderedFloat64 {} + +impl From for OrderedFloat64 { + fn from(v: f64) -> Self { + Self(v) + } +} + // Macro to generate specialized StaticFilter implementations for primitive types macro_rules! primitive_static_filter { ($Name:ident, $ArrowType:ty) => { @@ -334,7 +387,6 @@ macro_rules! primitive_static_filter { } // Generate specialized filters for all integer primitive types -// Note: Float32 and Float64 are excluded because they don't implement Hash/Eq due to NaN primitive_static_filter!(Int8StaticFilter, Int8Type); primitive_static_filter!(Int16StaticFilter, Int16Type); primitive_static_filter!(Int32StaticFilter, Int32Type); @@ -344,6 +396,119 @@ primitive_static_filter!(UInt16StaticFilter, UInt16Type); primitive_static_filter!(UInt32StaticFilter, UInt32Type); primitive_static_filter!(UInt64StaticFilter, UInt64Type); +// Macro to generate specialized StaticFilter implementations for float types +// Floats require a wrapper type (OrderedFloat*) to implement Hash/Eq due to NaN semantics +macro_rules! float_static_filter { + ($Name:ident, $ArrowType:ty, $OrderedType:ty) => { + struct $Name { + null_count: usize, + values: HashSet<$OrderedType>, + } + + impl $Name { + fn try_new(in_array: &ArrayRef) -> Result { + let in_array = in_array + .as_primitive_opt::<$ArrowType>() + .ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?; + + let mut values = HashSet::with_capacity(in_array.len()); + let null_count = in_array.null_count(); + + for v in in_array.iter().flatten() { + values.insert(<$OrderedType>::from(v)); + } + + Ok(Self { null_count, values }) + } + } + + impl StaticFilter for $Name { + fn null_count(&self) -> usize { + self.null_count + } + + fn contains(&self, v: &dyn Array, negated: bool) -> Result { + // Handle dictionary arrays by recursing on the values + downcast_dictionary_array! { + v => { + let values_contains = self.contains(v.values().as_ref(), negated)?; + let result = take(&values_contains, v.keys(), None)?; + return Ok(downcast_array(result.as_ref())) + } + _ => {} + } + + let v = v + .as_primitive_opt::<$ArrowType>() + .ok_or_else(|| exec_datafusion_err!("Failed to downcast an array to a '{}' array", stringify!($ArrowType)))?; + + let haystack_has_nulls = self.null_count > 0; + + let needle_values = v.values(); + let needle_nulls = v.nulls(); + let needle_has_nulls = v.null_count() > 0; + + // Compute the "contains" result using collect_bool (fast batched approach) + // This ignores nulls - we handle them separately + let contains_buffer = if negated { + BooleanBuffer::collect_bool(needle_values.len(), |i| { + !self.values.contains(&<$OrderedType>::from(needle_values[i])) + }) + } else { + BooleanBuffer::collect_bool(needle_values.len(), |i| { + self.values.contains(&<$OrderedType>::from(needle_values[i])) + }) + }; + + // Compute the null mask + // Output is null when: + // 1. needle value is null, OR + // 2. needle value is not in set AND haystack has nulls + let result_nulls = match (needle_has_nulls, haystack_has_nulls) { + (false, false) => { + // No nulls anywhere + None + } + (true, false) => { + // Only needle has nulls - just use needle's null mask + needle_nulls.cloned() + } + (false, true) => { + // Only haystack has nulls - null where not-in-set + let validity = if negated { + !&contains_buffer + } else { + contains_buffer.clone() + }; + Some(NullBuffer::new(validity)) + } + (true, true) => { + // Both have nulls - combine needle nulls with haystack-induced nulls + let needle_validity = needle_nulls.map(|n| n.inner().clone()) + .unwrap_or_else(|| BooleanBuffer::new_set(needle_values.len())); + + let haystack_validity = if negated { + !&contains_buffer + } else { + contains_buffer.clone() + }; + + // Combined validity: valid only where both are valid + let combined_validity = &needle_validity & &haystack_validity; + Some(NullBuffer::new(combined_validity)) + } + }; + + Ok(BooleanArray::new(contains_buffer, result_nulls)) + } + } + }; +} + +// Generate specialized filters for float types using ordered wrappers +float_static_filter!(Float32StaticFilter, Float32Type, OrderedFloat32); +float_static_filter!(Float64StaticFilter, Float64Type, OrderedFloat64); + /// Evaluates the list of expressions into an array, flattening any dictionaries fn evaluate_list( list: &[Arc], From 0b5a9c9bdcc8a49bbf21719fa56016021c4a2421 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:07:43 -0600 Subject: [PATCH 6/7] Add truth table --- .../physical-expr/src/expressions/in_list.rs | 57 +++++++++++++------ 1 file changed, 40 insertions(+), 17 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index d3c62718d700..b539704518f5 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -314,6 +314,22 @@ macro_rules! primitive_static_filter { let needle_nulls = v.nulls(); let needle_has_nulls = v.null_count() > 0; + // Truth table for `value [NOT] IN (set)` with SQL three-valued logic: + // ("-" means the value doesn't affect the result) + // + // | needle_null | haystack_null | negated | in set? | result | + // |-------------|---------------|---------|---------|--------| + // | true | - | false | - | null | + // | true | - | true | - | null | + // | false | true | false | yes | true | + // | false | true | false | no | null | + // | false | true | true | yes | false | + // | false | true | true | no | null | + // | false | false | false | yes | true | + // | false | false | false | no | false | + // | false | false | true | yes | false | + // | false | false | true | no | true | + // Compute the "contains" result using collect_bool (fast batched approach) // This ignores nulls - we handle them separately let contains_buffer = if negated { @@ -340,24 +356,12 @@ macro_rules! primitive_static_filter { needle_nulls.cloned() } (false, true) => { - // Only haystack has nulls - null where not-in-set - // For IN: null where contains is false - // For NOT IN: null where contains is true (before negation, i.e., where original contains was false) - // Since we already negated contains_buffer for NOT IN, we need to handle this: - // - IN (negated=false): null where !contains_buffer - // - NOT IN (negated=true): null where contains_buffer (which is !original_contains) - // Actually both cases: null where the "not found" condition is true - // For IN: not found = !contains_buffer - // For NOT IN: not found = contains_buffer (since contains_buffer = !original_contains) - // So the validity mask (valid = not null) is: - // - IN: contains_buffer (found = valid) - // - NOT IN: !contains_buffer (found in original = valid, but contains_buffer is negated) + // Only haystack has nulls - result is null when value not in set + // Valid (not null) when original "in set" is true + // For NOT IN: contains_buffer = !original, so validity = !contains_buffer let validity = if negated { - // For NOT IN: we want valid where original contains was true - // contains_buffer = !original_contains, so validity = !contains_buffer !&contains_buffer } else { - // For IN: valid where contains is true contains_buffer.clone() }; Some(NullBuffer::new(validity)) @@ -367,7 +371,7 @@ macro_rules! primitive_static_filter { let needle_validity = needle_nulls.map(|n| n.inner().clone()) .unwrap_or_else(|| BooleanBuffer::new_set(needle_values.len())); - // Haystack-induced validity (same logic as above) + // Valid when original "in set" is true (see above) let haystack_validity = if negated { !&contains_buffer } else { @@ -448,6 +452,22 @@ macro_rules! float_static_filter { let needle_nulls = v.nulls(); let needle_has_nulls = v.null_count() > 0; + // Truth table for `value [NOT] IN (set)` with SQL three-valued logic: + // ("-" means the value doesn't affect the result) + // + // | needle_null | haystack_null | negated | in set? | result | + // |-------------|---------------|---------|---------|--------| + // | true | - | false | - | null | + // | true | - | true | - | null | + // | false | true | false | yes | true | + // | false | true | false | no | null | + // | false | true | true | yes | false | + // | false | true | true | no | null | + // | false | false | false | yes | true | + // | false | false | false | no | false | + // | false | false | true | yes | false | + // | false | false | true | no | true | + // Compute the "contains" result using collect_bool (fast batched approach) // This ignores nulls - we handle them separately let contains_buffer = if negated { @@ -474,7 +494,9 @@ macro_rules! float_static_filter { needle_nulls.cloned() } (false, true) => { - // Only haystack has nulls - null where not-in-set + // Only haystack has nulls - result is null when value not in set + // Valid (not null) when original "in set" is true + // For NOT IN: contains_buffer = !original, so validity = !contains_buffer let validity = if negated { !&contains_buffer } else { @@ -487,6 +509,7 @@ macro_rules! float_static_filter { let needle_validity = needle_nulls.map(|n| n.inner().clone()) .unwrap_or_else(|| BooleanBuffer::new_set(needle_values.len())); + // Valid when original "in set" is true (see above) let haystack_validity = if negated { !&contains_buffer } else { From b6404e5bcd119782d08853095c1bd7f9943d5893 Mon Sep 17 00:00:00 2001 From: Adrian Garcia Badaracco <1755071+adriangb@users.noreply.github.com> Date: Thu, 11 Dec 2025 13:10:31 -0600 Subject: [PATCH 7/7] use to_bits() --- datafusion/physical-expr/src/expressions/in_list.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index b539704518f5..b6b67c85c488 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -213,8 +213,8 @@ impl ArrayStaticFilter { } } -/// Wrapper for f32 that implements Hash and Eq using IEEE 754 total ordering. -/// This treats NaN values as equal to each other (using total_cmp). +/// Wrapper for f32 that implements Hash and Eq using bit comparison. +/// This treats NaN values as equal to each other when they have the same bit pattern. #[derive(Clone, Copy)] struct OrderedFloat32(f32); @@ -226,7 +226,7 @@ impl Hash for OrderedFloat32 { impl PartialEq for OrderedFloat32 { fn eq(&self, other: &Self) -> bool { - self.0.total_cmp(&other.0).is_eq() + self.0.to_bits() == other.0.to_bits() } } @@ -238,8 +238,8 @@ impl From for OrderedFloat32 { } } -/// Wrapper for f64 that implements Hash and Eq using IEEE 754 total ordering. -/// This treats NaN values as equal to each other (using total_cmp). +/// Wrapper for f64 that implements Hash and Eq using bit comparison. +/// This treats NaN values as equal to each other when they have the same bit pattern. #[derive(Clone, Copy)] struct OrderedFloat64(f64); @@ -251,7 +251,7 @@ impl Hash for OrderedFloat64 { impl PartialEq for OrderedFloat64 { fn eq(&self, other: &Self) -> bool { - self.0.total_cmp(&other.0).is_eq() + self.0.to_bits() == other.0.to_bits() } }