Skip to content

Commit d299a91

Browse files
Short InList Optimization (#46)
1 parent 1e4782f commit d299a91

File tree

2 files changed

+547
-165
lines changed

2 files changed

+547
-165
lines changed

datafusion/physical-expr/benches/in_list.rs

Lines changed: 112 additions & 57 deletions
Original file line number | Diff line number | Diff line change
@@ -15,17 +15,21 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow::array::{Array, ArrayRef, Float32Array, Int32Array, StringArray};
18+
use arrow::array::{
19+
Array, ArrayRef, Float32Array, Int32Array, StringArray, StringViewArray,
20+
};
1921
use arrow::datatypes::{Field, Schema};
2022
use arrow::record_batch::RecordBatch;
2123
use criterion::{criterion_group, criterion_main, Criterion};
2224
use datafusion_common::ScalarValue;
2325
use datafusion_physical_expr::expressions::{col, in_list, lit};
2426
use rand::distr::Alphanumeric;
2527
use rand::prelude::*;
28+
use std::any::TypeId;
2629
use std::hint::black_box;
2730
use std::sync::Arc;
2831

32+
/// Measures how long `in_list(col("a"), exprs)` takes to evaluate against a single RecordBatch.
2933
fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValue]) {
3034
let schema = Schema::new(vec![Field::new("a", values.data_type().clone(), true)]);
3135
let exprs = exprs.iter().map(|s| lit(s.clone())).collect();
@@ -37,78 +41,129 @@ fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValu
3741
});
3842
}
3943

44+
/// Generates a random alphanumeric string of the specified length.
4045
fn random_string(rng: &mut StdRng, len: usize) -> String {
4146
let value = rng.sample_iter(&Alphanumeric).take(len).collect();
4247
String::from_utf8(value).unwrap()
4348
}
4449

45-
fn do_benches(
46-
c: &mut Criterion,
47-
array_length: usize,
48-
in_list_length: usize,
49-
null_percent: f64,
50-
) {
51-
let mut rng = StdRng::seed_from_u64(120320);
52-
for string_length in [5, 10, 20] {
53-
let values: StringArray = (0..array_length)
54-
.map(|_| {
55-
rng.random_bool(null_percent)
56-
.then(|| random_string(&mut rng, string_length))
57-
})
58-
.collect();
59-
60-
let in_list: Vec<_> = (0..in_list_length)
61-
.map(|_| ScalarValue::from(random_string(&mut rng, string_length)))
62-
.collect();
63-
64-
do_bench(
65-
c,
66-
&format!(
67-
"in_list_utf8({string_length}) ({array_length}, {null_percent}) IN ({in_list_length}, 0)"
68-
),
69-
Arc::new(values),
70-
&in_list,
71-
)
50+
/// IN-list sizes to benchmark (how many literal values are generated for each `in_list` call).
const IN_LIST_LENGTHS: [usize; 3] = [3, 8, 100];
51+
/// Null ratios to benchmark, expressed as fractions (0.2 is rendered as `20%` in benchmark names).
const NULL_PERCENTS: [f64; 2] = [0., 0.2];
52+
/// Lengths of the random strings generated for the string-array benchmarks.
const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
53+
/// Number of rows generated for each input array.
const ARRAY_LENGTH: usize = 1024;
54+
55+
/// Returns a friendly type name for the array type.
56+
fn array_type_name<A: 'static>() -> &'static str {
57+
let id = TypeId::of::<A>();
58+
if id == TypeId::of::<StringArray>() {
59+
"Utf8"
60+
} else if id == TypeId::of::<StringViewArray>() {
61+
"Utf8View"
62+
} else if id == TypeId::of::<Float32Array>() {
63+
"Float32"
64+
} else if id == TypeId::of::<Int32Array>() {
65+
"Int32"
66+
} else {
67+
"Unknown"
7268
}
69+
}
7370

74-
let values: Float32Array = (0..array_length)
75-
.map(|_| rng.random_bool(null_percent).then(|| rng.random()))
76-
.collect();
71+
/// Builds a benchmark name from array type, list size, and null percentage.
72+
fn bench_name<A: 'static>(in_list_length: usize, null_percent: f64) -> String {
73+
format!(
74+
"in_list/{}/list={in_list_length}/nulls={}%",
75+
array_type_name::<A>(),
76+
(null_percent * 100.0) as u32
77+
)
78+
}
7779

78-
let in_list: Vec<_> = (0..in_list_length)
79-
.map(|_| ScalarValue::Float32(Some(rng.random())))
80-
.collect();
80+
/// Runs in_list benchmarks for a string array type across all list-size × null-ratio × string-length combinations.
///
/// * `c` — Criterion instance forwarded to `do_bench`.
/// * `rng` — generator used for the null decisions and for the random strings.
/// * `make_scalar` — wraps each generated IN-list string in a `ScalarValue`.
///
/// `A` is the array type collected from `Option<String>` rows; it also selects the label via `bench_name::<A>`.
81+
fn bench_string_type<A>(
82+
c: &mut Criterion,
83+
rng: &mut StdRng,
84+
make_scalar: fn(String) -> ScalarValue,
85+
) where
86+
A: Array + FromIterator<Option<String>> + 'static,
87+
{
88+
for in_list_length in IN_LIST_LENGTHS {
89+
for null_percent in NULL_PERCENTS {
90+
for string_length in STRING_LENGTHS {
91+
// Column data: ARRAY_LENGTH rows, each either a random string or null.
let values: A = (0..ARRAY_LENGTH)
92+
.map(|_| {
93+
// A string is generated only when `random_bool(1.0 - null_percent)` is true;
// otherwise `.then` yields `None`, i.e. a null row.
rng.random_bool(1.0 - null_percent)
94+
.then(|| random_string(rng, string_length))
95+
})
96+
.collect();
8197

// NOTE(review): lines whose marker ends in `-` are the pre-change side of this diff,
// not part of this function.
82-
do_bench(
83-
c,
84-
&format!("in_list_f32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"),
85-
Arc::new(values),
86-
&in_list,
87-
);
98+
// IN-list literals: `in_list_length` fresh random strings of the same length as the column values.
let in_list: Vec<_> = (0..in_list_length)
99+
.map(|_| make_scalar(random_string(rng, string_length)))
100+
.collect();
88101

89-
let values: Int32Array = (0..array_length)
90-
.map(|_| rng.random_bool(null_percent).then(|| rng.random()))
91-
.collect();
102+
// The benchmark label is the shared `bench_name` prefix plus the string length.
do_bench(
103+
c,
104+
&format!(
105+
"{}/str={string_length}",
106+
bench_name::<A>(in_list_length, null_percent)
107+
),
108+
Arc::new(values),
109+
&in_list,
110+
)
111+
}
112+
}
113+
}
114+
}
92115

93-
let in_list: Vec<_> = (0..in_list_length)
94-
.map(|_| ScalarValue::Int32(Some(rng.random())))
95-
.collect();
116+
/// Runs in_list benchmarks for a numeric array type across all list-size × null-ratio combinations.
///
/// * `c` — Criterion instance forwarded to `do_bench`.
/// * `rng` — generator used for the null decisions and passed to `gen_value`.
/// * `gen_value` — produces one value of type `T`; called for column rows and for IN-list entries.
/// * `make_scalar` — wraps each generated IN-list value in a `ScalarValue`.
///
/// `A` is the array type collected from `Option<T>` rows; it also selects the label via `bench_name::<A>`.
117+
fn bench_numeric_type<T, A>(
118+
c: &mut Criterion,
119+
rng: &mut StdRng,
120+
mut gen_value: impl FnMut(&mut StdRng) -> T,
121+
make_scalar: fn(T) -> ScalarValue,
122+
) where
123+
A: Array + FromIterator<Option<T>> + 'static,
124+
{
125+
for in_list_length in IN_LIST_LENGTHS {
126+
for null_percent in NULL_PERCENTS {
127+
// Column data: ARRAY_LENGTH rows, each either a generated value or null.
let values: A = (0..ARRAY_LENGTH)
128+
// A value is generated only when `random_bool(1.0 - null_percent)` is true;
// otherwise `.then` yields `None`, i.e. a null row.
.map(|_| rng.random_bool(1.0 - null_percent).then(|| gen_value(rng)))
129+
.collect();
96130

// NOTE(review): lines whose marker ends in `-` are the pre-change side of this diff,
// not part of this function.
97-
do_bench(
98-
c,
99-
&format!("in_list_i32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"),
100-
Arc::new(values),
101-
&in_list,
102-
)
103-
}
131+
// IN-list literals: `in_list_length` freshly generated values wrapped by `make_scalar`.
let in_list: Vec<_> = (0..in_list_length)
132+
.map(|_| make_scalar(gen_value(rng)))
133+
.collect();
104134

105-
fn criterion_benchmark(c: &mut Criterion) {
106-
for in_list_length in [1, 3, 10, 100] {
107-
for null_percent in [0., 0.2] {
108-
do_benches(c, 1024, in_list_length, null_percent)
135+
// Numeric benchmarks use the `bench_name` label as-is (no extra suffix).
do_bench(
136+
c,
137+
&bench_name::<A>(in_list_length, null_percent),
138+
Arc::new(values),
139+
&in_list,
140+
);
109141
}
110142
}
111143
}
112144

145+
/// Entry point: registers in_list benchmarks for Utf8, Utf8View, Float32, and Int32 arrays.
146+
fn criterion_benchmark(c: &mut Criterion) {
147+
let mut rng = StdRng::seed_from_u64(120320);
148+
149+
// Benchmarks for string array types (Utf8, Utf8View)
150+
bench_string_type::<StringArray>(c, &mut rng, |s| ScalarValue::Utf8(Some(s)));
151+
bench_string_type::<StringViewArray>(c, &mut rng, |s| ScalarValue::Utf8View(Some(s)));
152+
153+
// Benchmarks for numeric types
154+
bench_numeric_type::<f32, Float32Array>(
155+
c,
156+
&mut rng,
157+
|rng| rng.random(),
158+
|v| ScalarValue::Float32(Some(v)),
159+
);
160+
bench_numeric_type::<i32, Int32Array>(
161+
c,
162+
&mut rng,
163+
|rng| rng.random(),
164+
|v| ScalarValue::Int32(Some(v)),
165+
);
166+
}
167+
113168
// Criterion harness macros: `criterion_group!` is given the group name `benches` and the
// `criterion_benchmark` function; `criterion_main!` is then given that `benches` group.
criterion_group!(benches, criterion_benchmark);
114169
criterion_main!(benches);

0 commit comments

Comments
 (0)