Skip to content

Commit 4e33dfa

Browse files
committed
store hashes
1 parent 2feab11 commit 4e33dfa

File tree

1 file changed

+68
-108
lines changed
  • datafusion/physical-expr/src/expressions

1 file changed

+68
-108
lines changed

datafusion/physical-expr/src/expressions/in_list.rs

Lines changed: 68 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -419,34 +419,54 @@ impl StaticFilter for BooleanStaticFilter {
419419
}
420420
}
421421

422-
// Macro to generate static filter implementations for string and binary types
423-
// This eliminates ~550 lines of duplicated code across 6 implementations
424-
macro_rules! define_static_filter {
425-
(
426-
$name:ident,
427-
$value_type:ty,
428-
|$arr_param:ident| $downcast:expr,
429-
$convert:ident
430-
) => {
422+
// Macro to generate hash-based static filter implementations for string and binary types
423+
// This avoids copying string/binary data by storing only the original array and hash indices
424+
macro_rules! define_hash_based_static_filter {
425+
($name:ident, |$arr_param:ident| $downcast:expr) => {
431426
struct $name {
427+
in_array: ArrayRef,
428+
state: RandomState,
429+
map: HashMap<usize, (), ()>,
432430
null_count: usize,
433-
values: HashSet<$value_type>,
434431
}
435432

436433
impl $name {
437434
fn try_new(in_array: &ArrayRef) -> Result<Self> {
438-
let $arr_param = in_array;
439-
let in_array = $downcast
440-
.ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?;
441-
442-
let mut values = HashSet::with_capacity(in_array.len());
443435
let null_count = in_array.null_count();
436+
let in_array_clone = Arc::clone(in_array);
437+
let state = RandomState::new();
438+
let mut map: HashMap<usize, (), ()> = HashMap::with_hasher(());
439+
440+
with_hashes([in_array.as_ref()], &state, |hashes| -> Result<()> {
441+
let cmp = make_comparator(in_array, in_array, SortOptions::default())?;
442+
443+
let insert_value = |idx| {
444+
let hash = hashes[idx];
445+
if let RawEntryMut::Vacant(v) = map
446+
.raw_entry_mut()
447+
.from_hash(hash, |x| cmp(*x, idx).is_eq())
448+
{
449+
v.insert_with_hasher(hash, idx, (), |x| hashes[*x]);
450+
}
451+
};
444452

445-
for v in in_array.iter().flatten() {
446-
values.insert(v.$convert());
447-
}
453+
match in_array.nulls() {
454+
Some(nulls) => {
455+
BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len())
456+
.for_each(insert_value)
457+
}
458+
None => (0..in_array.len()).for_each(insert_value),
459+
}
448460

449-
Ok(Self { null_count, values })
461+
Ok(())
462+
})?;
463+
464+
Ok(Self {
465+
in_array: in_array_clone,
466+
state,
467+
map,
468+
null_count,
469+
})
450470
}
451471
}
452472

@@ -466,105 +486,45 @@ macro_rules! define_static_filter {
466486
_ => {}
467487
}
468488

469-
let $arr_param = v;
470-
let v = $downcast
471-
.ok_or_else(|| exec_datafusion_err!("Failed to downcast array"))?;
472-
473489
let haystack_has_nulls = self.null_count > 0;
474490

475-
let result = match (v.null_count() > 0, haystack_has_nulls, negated) {
476-
(true, _, false) | (false, true, false) => {
477-
BooleanArray::from_iter(v.iter().map(|value| {
478-
match value {
479-
None => None,
480-
Some(v) => {
481-
if self.values.contains(v) {
482-
Some(true)
483-
} else if haystack_has_nulls {
484-
None
485-
} else {
486-
Some(false)
487-
}
488-
}
489-
}
490-
}))
491-
}
492-
(true, _, true) | (false, true, true) => {
493-
BooleanArray::from_iter(v.iter().map(|value| {
494-
match value {
495-
None => None,
496-
Some(v) => {
497-
if self.values.contains(v) {
498-
Some(false)
499-
} else if haystack_has_nulls {
500-
None
501-
} else {
502-
Some(true)
503-
}
504-
}
505-
}
506-
}))
507-
}
508-
(false, false, false) => {
509-
BooleanArray::from_iter(
510-
v.iter().map(|value| self.values.contains(value.expect("null_count is 0"))),
511-
)
512-
}
513-
(false, false, true) => {
514-
BooleanArray::from_iter(
515-
v.iter().map(|value| !self.values.contains(value.expect("null_count is 0"))),
516-
)
517-
}
518-
};
519-
Ok(result)
491+
// Use hash-based lookup with verification
492+
with_hashes([v], &self.state, |hashes| {
493+
let cmp = make_comparator(v, &self.in_array, SortOptions::default())?;
494+
495+
Ok(BooleanArray::from_iter((0..v.len()).map(|i| {
496+
if v.is_null(i) {
497+
return None;
498+
}
499+
500+
let hash = hashes[i];
501+
let contains = self
502+
.map
503+
.raw_entry()
504+
.from_hash(hash, |idx| cmp(i, *idx).is_eq())
505+
.is_some();
506+
507+
match contains {
508+
true => Some(!negated),
509+
false if haystack_has_nulls => None,
510+
false => Some(negated),
511+
}
512+
})))
513+
})
520514
}
521515
}
522516
};
523517
}
524518

525519
// String static filters
526-
define_static_filter!(
527-
Utf8StaticFilter,
528-
String,
529-
|arr| arr.as_string_opt::<i32>(),
530-
to_string
531-
);
532-
533-
define_static_filter!(
534-
LargeUtf8StaticFilter,
535-
String,
536-
|arr| arr.as_string_opt::<i64>(),
537-
to_string
538-
);
539-
540-
define_static_filter!(
541-
Utf8ViewStaticFilter,
542-
String,
543-
|arr| arr.as_string_view_opt(),
544-
to_string
545-
);
520+
define_hash_based_static_filter!(Utf8StaticFilter, |arr| arr.as_string_opt::<i32>());
521+
define_hash_based_static_filter!(LargeUtf8StaticFilter, |arr| arr.as_string_opt::<i64>());
522+
define_hash_based_static_filter!(Utf8ViewStaticFilter, |arr| arr.as_string_view_opt());
546523

547524
// Binary static filters
548-
define_static_filter!(
549-
BinaryStaticFilter,
550-
Vec<u8>,
551-
|arr| arr.as_binary_opt::<i32>(),
552-
to_vec
553-
);
554-
555-
define_static_filter!(
556-
LargeBinaryStaticFilter,
557-
Vec<u8>,
558-
|arr| arr.as_binary_opt::<i64>(),
559-
to_vec
560-
);
561-
562-
define_static_filter!(
563-
BinaryViewStaticFilter,
564-
Vec<u8>,
565-
|arr| arr.as_binary_view_opt(),
566-
to_vec
567-
);
525+
define_hash_based_static_filter!(BinaryStaticFilter, |arr| arr.as_binary_opt::<i32>());
526+
define_hash_based_static_filter!(LargeBinaryStaticFilter, |arr| arr.as_binary_opt::<i64>());
527+
define_hash_based_static_filter!(BinaryViewStaticFilter, |arr| arr.as_binary_view_opt());
568528

569529
/// Evaluates the list of expressions into an array, flattening any dictionaries
570530
fn evaluate_list(

0 commit comments

Comments
 (0)