Skip to content

Commit d8f6d45

Browse files
committed
Revert "Short InList Optimization (#46)"
This reverts commit d299a91.
1 parent: d299a91 · commit: d8f6d45

File tree

2 files changed: 165 additions & 547 deletions

2 files changed: 165 additions & 547 deletions

datafusion/physical-expr/benches/in_list.rs

Lines changed: 57 additions & 112 deletions
Original file line number | Diff line number | Diff line change
@@ -15,21 +15,17 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow::array::{
19-
Array, ArrayRef, Float32Array, Int32Array, StringArray, StringViewArray,
20-
};
18+
use arrow::array::{Array, ArrayRef, Float32Array, Int32Array, StringArray};
2119
use arrow::datatypes::{Field, Schema};
2220
use arrow::record_batch::RecordBatch;
2321
use criterion::{criterion_group, criterion_main, Criterion};
2422
use datafusion_common::ScalarValue;
2523
use datafusion_physical_expr::expressions::{col, in_list, lit};
2624
use rand::distr::Alphanumeric;
2725
use rand::prelude::*;
28-
use std::any::TypeId;
2926
use std::hint::black_box;
3027
use std::sync::Arc;
3128

32-
/// Measures how long `in_list(col("a"), exprs)` takes to evaluate against a single RecordBatch.
3329
fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValue]) {
3430
let schema = Schema::new(vec![Field::new("a", values.data_type().clone(), true)]);
3531
let exprs = exprs.iter().map(|s| lit(s.clone())).collect();
@@ -41,128 +37,77 @@ fn do_bench(c: &mut Criterion, name: &str, values: ArrayRef, exprs: &[ScalarValu
4137
});
4238
}
4339

44-
/// Generates a random alphanumeric string of the specified length.
4540
fn random_string(rng: &mut StdRng, len: usize) -> String {
4641
let value = rng.sample_iter(&Alphanumeric).take(len).collect();
4742
String::from_utf8(value).unwrap()
4843
}
4944

50-
const IN_LIST_LENGTHS: [usize; 3] = [3, 8, 100];
51-
const NULL_PERCENTS: [f64; 2] = [0., 0.2];
52-
const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
53-
const ARRAY_LENGTH: usize = 1024;
54-
55-
/// Returns a friendly type name for the array type.
56-
fn array_type_name<A: 'static>() -> &'static str {
57-
let id = TypeId::of::<A>();
58-
if id == TypeId::of::<StringArray>() {
59-
"Utf8"
60-
} else if id == TypeId::of::<StringViewArray>() {
61-
"Utf8View"
62-
} else if id == TypeId::of::<Float32Array>() {
63-
"Float32"
64-
} else if id == TypeId::of::<Int32Array>() {
65-
"Int32"
66-
} else {
67-
"Unknown"
45+
fn do_benches(
46+
c: &mut Criterion,
47+
array_length: usize,
48+
in_list_length: usize,
49+
null_percent: f64,
50+
) {
51+
let mut rng = StdRng::seed_from_u64(120320);
52+
for string_length in [5, 10, 20] {
53+
let values: StringArray = (0..array_length)
54+
.map(|_| {
55+
rng.random_bool(null_percent)
56+
.then(|| random_string(&mut rng, string_length))
57+
})
58+
.collect();
59+
60+
let in_list: Vec<_> = (0..in_list_length)
61+
.map(|_| ScalarValue::from(random_string(&mut rng, string_length)))
62+
.collect();
63+
64+
do_bench(
65+
c,
66+
&format!(
67+
"in_list_utf8({string_length}) ({array_length}, {null_percent}) IN ({in_list_length}, 0)"
68+
),
69+
Arc::new(values),
70+
&in_list,
71+
)
6872
}
69-
}
70-
71-
/// Builds a benchmark name from array type, list size, and null percentage.
72-
fn bench_name<A: 'static>(in_list_length: usize, null_percent: f64) -> String {
73-
format!(
74-
"in_list/{}/list={in_list_length}/nulls={}%",
75-
array_type_name::<A>(),
76-
(null_percent * 100.0) as u32
77-
)
78-
}
7973

80-
/// Runs in_list benchmarks for a string array type across all list-size × null-ratio × string-length combinations.
81-
fn bench_string_type<A>(
82-
c: &mut Criterion,
83-
rng: &mut StdRng,
84-
make_scalar: fn(String) -> ScalarValue,
85-
) where
86-
A: Array + FromIterator<Option<String>> + 'static,
87-
{
88-
for in_list_length in IN_LIST_LENGTHS {
89-
for null_percent in NULL_PERCENTS {
90-
for string_length in STRING_LENGTHS {
91-
let values: A = (0..ARRAY_LENGTH)
92-
.map(|_| {
93-
rng.random_bool(1.0 - null_percent)
94-
.then(|| random_string(rng, string_length))
95-
})
96-
.collect();
74+
let values: Float32Array = (0..array_length)
75+
.map(|_| rng.random_bool(null_percent).then(|| rng.random()))
76+
.collect();
9777

98-
let in_list: Vec<_> = (0..in_list_length)
99-
.map(|_| make_scalar(random_string(rng, string_length)))
100-
.collect();
78+
let in_list: Vec<_> = (0..in_list_length)
79+
.map(|_| ScalarValue::Float32(Some(rng.random())))
80+
.collect();
10181

102-
do_bench(
103-
c,
104-
&format!(
105-
"{}/str={string_length}",
106-
bench_name::<A>(in_list_length, null_percent)
107-
),
108-
Arc::new(values),
109-
&in_list,
110-
)
111-
}
112-
}
113-
}
114-
}
82+
do_bench(
83+
c,
84+
&format!("in_list_f32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"),
85+
Arc::new(values),
86+
&in_list,
87+
);
11588

116-
/// Runs in_list benchmarks for a numeric array type across all list-size × null-ratio combinations.
117-
fn bench_numeric_type<T, A>(
118-
c: &mut Criterion,
119-
rng: &mut StdRng,
120-
mut gen_value: impl FnMut(&mut StdRng) -> T,
121-
make_scalar: fn(T) -> ScalarValue,
122-
) where
123-
A: Array + FromIterator<Option<T>> + 'static,
124-
{
125-
for in_list_length in IN_LIST_LENGTHS {
126-
for null_percent in NULL_PERCENTS {
127-
let values: A = (0..ARRAY_LENGTH)
128-
.map(|_| rng.random_bool(1.0 - null_percent).then(|| gen_value(rng)))
129-
.collect();
89+
let values: Int32Array = (0..array_length)
90+
.map(|_| rng.random_bool(null_percent).then(|| rng.random()))
91+
.collect();
13092

131-
let in_list: Vec<_> = (0..in_list_length)
132-
.map(|_| make_scalar(gen_value(rng)))
133-
.collect();
93+
let in_list: Vec<_> = (0..in_list_length)
94+
.map(|_| ScalarValue::Int32(Some(rng.random())))
95+
.collect();
13496

135-
do_bench(
136-
c,
137-
&bench_name::<A>(in_list_length, null_percent),
138-
Arc::new(values),
139-
&in_list,
140-
);
141-
}
142-
}
97+
do_bench(
98+
c,
99+
&format!("in_list_i32 ({array_length}, {null_percent}) IN ({in_list_length}, 0)"),
100+
Arc::new(values),
101+
&in_list,
102+
)
143103
}
144104

145-
/// Entry point: registers in_list benchmarks for Utf8, Utf8View, Float32, and Int32 arrays.
146105
fn criterion_benchmark(c: &mut Criterion) {
147-
let mut rng = StdRng::seed_from_u64(120320);
148-
149-
// Benchmarks for string array types (Utf8, Utf8View)
150-
bench_string_type::<StringArray>(c, &mut rng, |s| ScalarValue::Utf8(Some(s)));
151-
bench_string_type::<StringViewArray>(c, &mut rng, |s| ScalarValue::Utf8View(Some(s)));
152-
153-
// Benchmarks for numeric types
154-
bench_numeric_type::<f32, Float32Array>(
155-
c,
156-
&mut rng,
157-
|rng| rng.random(),
158-
|v| ScalarValue::Float32(Some(v)),
159-
);
160-
bench_numeric_type::<i32, Int32Array>(
161-
c,
162-
&mut rng,
163-
|rng| rng.random(),
164-
|v| ScalarValue::Int32(Some(v)),
165-
);
106+
for in_list_length in [1, 3, 10, 100] {
107+
for null_percent in [0., 0.2] {
108+
do_benches(c, 1024, in_list_length, null_percent)
109+
}
110+
}
166111
}
167112

168113
criterion_group!(benches, criterion_benchmark);

0 commit comments

Comments
 (0)