@@ -419,34 +419,54 @@ impl StaticFilter for BooleanStaticFilter {
419419 }
420420}
421421
422- // Macro to generate static filter implementations for string and binary types
423- // This eliminates ~550 lines of duplicated code across 6 implementations
424- macro_rules! define_static_filter {
425- (
426- $name: ident,
427- $value_type: ty,
428- |$arr_param: ident| $downcast: expr,
429- $convert: ident
430- ) => {
422+ // Macro to generate hash-based static filter implementations for string and binary types
423+ // This avoids copying string/binary data by storing only the original array and hash indices
424+ macro_rules! define_hash_based_static_filter {
425+ ( $name: ident, |$arr_param: ident| $downcast: expr) => {
431426 struct $name {
427+ in_array: ArrayRef ,
428+ state: RandomState ,
429+ map: HashMap <usize , ( ) , ( ) >,
432430 null_count: usize ,
433- values: HashSet <$value_type>,
434431 }
435432
436433 impl $name {
437434 fn try_new( in_array: & ArrayRef ) -> Result <Self > {
438- let $arr_param = in_array;
439- let in_array = $downcast
440- . ok_or_else( || exec_datafusion_err!( "Failed to downcast array" ) ) ?;
441-
442- let mut values = HashSet :: with_capacity( in_array. len( ) ) ;
443435 let null_count = in_array. null_count( ) ;
436+ let in_array_clone = Arc :: clone( in_array) ;
437+ let state = RandomState :: new( ) ;
438+ let mut map: HashMap <usize , ( ) , ( ) > = HashMap :: with_hasher( ( ) ) ;
439+
440+ with_hashes( [ in_array. as_ref( ) ] , & state, |hashes| -> Result <( ) > {
441+ let cmp = make_comparator( in_array, in_array, SortOptions :: default ( ) ) ?;
442+
443+ let insert_value = |idx| {
444+ let hash = hashes[ idx] ;
445+ if let RawEntryMut :: Vacant ( v) = map
446+ . raw_entry_mut( )
447+ . from_hash( hash, |x| cmp( * x, idx) . is_eq( ) )
448+ {
449+ v. insert_with_hasher( hash, idx, ( ) , |x| hashes[ * x] ) ;
450+ }
451+ } ;
444452
445- for v in in_array. iter( ) . flatten( ) {
446- values. insert( v. $convert( ) ) ;
447- }
453+ match in_array. nulls( ) {
454+ Some ( nulls) => {
455+ BitIndexIterator :: new( nulls. validity( ) , nulls. offset( ) , nulls. len( ) )
456+ . for_each( insert_value)
457+ }
458+ None => ( 0 ..in_array. len( ) ) . for_each( insert_value) ,
459+ }
448460
449- Ok ( Self { null_count, values } )
461+ Ok ( ( ) )
462+ } ) ?;
463+
464+ Ok ( Self {
465+ in_array: in_array_clone,
466+ state,
467+ map,
468+ null_count,
469+ } )
450470 }
451471 }
452472
@@ -466,105 +486,45 @@ macro_rules! define_static_filter {
466486 _ => { }
467487 }
468488
469- let $arr_param = v;
470- let v = $downcast
471- . ok_or_else( || exec_datafusion_err!( "Failed to downcast array" ) ) ?;
472-
473489 let haystack_has_nulls = self . null_count > 0 ;
474490
475- let result = match ( v. null_count( ) > 0 , haystack_has_nulls, negated) {
476- ( true , _, false ) | ( false , true , false ) => {
477- BooleanArray :: from_iter( v. iter( ) . map( |value| {
478- match value {
479- None => None ,
480- Some ( v) => {
481- if self . values. contains( v) {
482- Some ( true )
483- } else if haystack_has_nulls {
484- None
485- } else {
486- Some ( false )
487- }
488- }
489- }
490- } ) )
491- }
492- ( true , _, true ) | ( false , true , true ) => {
493- BooleanArray :: from_iter( v. iter( ) . map( |value| {
494- match value {
495- None => None ,
496- Some ( v) => {
497- if self . values. contains( v) {
498- Some ( false )
499- } else if haystack_has_nulls {
500- None
501- } else {
502- Some ( true )
503- }
504- }
505- }
506- } ) )
507- }
508- ( false , false , false ) => {
509- BooleanArray :: from_iter(
510- v. iter( ) . map( |value| self . values. contains( value. expect( "null_count is 0" ) ) ) ,
511- )
512- }
513- ( false , false , true ) => {
514- BooleanArray :: from_iter(
515- v. iter( ) . map( |value| !self . values. contains( value. expect( "null_count is 0" ) ) ) ,
516- )
517- }
518- } ;
519- Ok ( result)
491+ // Use hash-based lookup with verification
492+ with_hashes( [ v] , & self . state, |hashes| {
493+ let cmp = make_comparator( v, & self . in_array, SortOptions :: default ( ) ) ?;
494+
495+ Ok ( BooleanArray :: from_iter( ( 0 ..v. len( ) ) . map( |i| {
496+ if v. is_null( i) {
497+ return None ;
498+ }
499+
500+ let hash = hashes[ i] ;
501+ let contains = self
502+ . map
503+ . raw_entry( )
504+ . from_hash( hash, |idx| cmp( i, * idx) . is_eq( ) )
505+ . is_some( ) ;
506+
507+ match contains {
508+ true => Some ( !negated) ,
509+ false if haystack_has_nulls => None ,
510+ false => Some ( negated) ,
511+ }
512+ } ) ) )
513+ } )
520514 }
521515 }
522516 } ;
523517}
524518
525519// String static filters
526- define_static_filter ! (
527- Utf8StaticFilter ,
528- String ,
529- |arr| arr. as_string_opt:: <i32 >( ) ,
530- to_string
531- ) ;
532-
533- define_static_filter ! (
534- LargeUtf8StaticFilter ,
535- String ,
536- |arr| arr. as_string_opt:: <i64 >( ) ,
537- to_string
538- ) ;
539-
540- define_static_filter ! (
541- Utf8ViewStaticFilter ,
542- String ,
543- |arr| arr. as_string_view_opt( ) ,
544- to_string
545- ) ;
520+ define_hash_based_static_filter ! ( Utf8StaticFilter , |arr| arr. as_string_opt:: <i32 >( ) ) ;
521+ define_hash_based_static_filter ! ( LargeUtf8StaticFilter , |arr| arr. as_string_opt:: <i64 >( ) ) ;
522+ define_hash_based_static_filter ! ( Utf8ViewStaticFilter , |arr| arr. as_string_view_opt( ) ) ;
546523
547524// Binary static filters
548- define_static_filter ! (
549- BinaryStaticFilter ,
550- Vec <u8 >,
551- |arr| arr. as_binary_opt:: <i32 >( ) ,
552- to_vec
553- ) ;
554-
555- define_static_filter ! (
556- LargeBinaryStaticFilter ,
557- Vec <u8 >,
558- |arr| arr. as_binary_opt:: <i64 >( ) ,
559- to_vec
560- ) ;
561-
562- define_static_filter ! (
563- BinaryViewStaticFilter ,
564- Vec <u8 >,
565- |arr| arr. as_binary_view_opt( ) ,
566- to_vec
567- ) ;
525+ define_hash_based_static_filter ! ( BinaryStaticFilter , |arr| arr. as_binary_opt:: <i32 >( ) ) ;
526+ define_hash_based_static_filter ! ( LargeBinaryStaticFilter , |arr| arr. as_binary_opt:: <i64 >( ) ) ;
527+ define_hash_based_static_filter ! ( BinaryViewStaticFilter , |arr| arr. as_binary_view_opt( ) ) ;
568528
569529/// Evaluates the list of expressions into an array, flattening any dictionaries
570530fn evaluate_list (
0 commit comments