-
Notifications
You must be signed in to change notification settings - Fork 2k
bench: Add IN list benchmarks for non-constant list expressions #20444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,6 +23,7 @@ use arrow::datatypes::{Field, Schema}; | |
| use arrow::record_batch::RecordBatch; | ||
| use criterion::{Criterion, criterion_group, criterion_main}; | ||
| use datafusion_common::ScalarValue; | ||
| use datafusion_physical_expr::PhysicalExpr; | ||
| use datafusion_physical_expr::expressions::{col, in_list, lit}; | ||
| use rand::distr::Alphanumeric; | ||
| use rand::prelude::*; | ||
|
|
@@ -50,7 +51,9 @@ fn random_string(rng: &mut StdRng, len: usize) -> String { | |
| } | ||
|
|
||
| const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100]; | ||
| const DYNAMIC_LIST_LENGTHS: [usize; 3] = [3, 8, 28]; | ||
| const NULL_PERCENTS: [f64; 2] = [0., 0.2]; | ||
| const MATCH_PERCENTS: [f64; 3] = [0.0, 0.5, 1.0]; | ||
| const STRING_LENGTHS: [usize; 3] = [3, 12, 100]; | ||
| const ARRAY_LENGTH: usize = 8192; | ||
|
|
||
|
|
@@ -219,6 +222,144 @@ fn bench_realistic_mixed_strings<A>( | |
| } | ||
| } | ||
|
|
||
| /// Benchmarks evaluation of `a IN (b0, b1, ...)` on a single record batch, where `a` is `values` and every | ||
| /// list entry is a column reference built from `list_cols`; column refs prevent static filter creation. | ||
|
||
| fn do_bench_dynamic( | ||
| c: &mut Criterion, | ||
| name: &str, | ||
| values: ArrayRef, | ||
| list_cols: &[ArrayRef], | ||
| ) { | ||
| let mut fields = vec![Field::new("a", values.data_type().clone(), true)]; | ||
| let mut columns: Vec<ArrayRef> = vec![values]; | ||
|
|
||
| // Register each list column in the schema as b{i}; the IN list holds only column refs (forces dynamic path) | ||
| let schema_fields: Vec<Field> = list_cols | ||
| .iter() | ||
| .enumerate() | ||
| .map(|(i, col_arr)| { | ||
| let name = format!("b{i}"); | ||
| fields.push(Field::new(&name, col_arr.data_type().clone(), true)); | ||
| columns.push(Arc::clone(col_arr)); | ||
| Field::new(&name, col_arr.data_type().clone(), true) | ||
| }) | ||
| .collect(); | ||
|
|
||
| let schema = Schema::new(fields); | ||
| let list_exprs: Vec<Arc<dyn PhysicalExpr>> = schema_fields | ||
| .iter() | ||
| .map(|f| col(f.name(), &schema).unwrap()) | ||
| .collect(); | ||
|
|
||
| let expr = in_list(col("a", &schema).unwrap(), list_exprs, &false, &schema).unwrap(); | ||
| let batch = RecordBatch::try_new(Arc::new(schema), columns).unwrap(); | ||
|
|
||
| c.bench_function(name, |b| { | ||
| b.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap())) | ||
| }); | ||
| } | ||
|
|
||
| /// Benchmarks the dynamic IN list path for Int32 arrays with column references, once per list size / match rate / null rate combination. | ||
|
||
| fn bench_dynamic_int32(c: &mut Criterion) { | ||
| let mut rng = StdRng::seed_from_u64(42); | ||
|
|
||
| for list_size in DYNAMIC_LIST_LENGTHS { | ||
| for match_percent in MATCH_PERCENTS { | ||
| for null_percent in NULL_PERCENTS { | ||
| // Generate the "needle" column: values in 0..1000, null with probability null_percent | ||
| let values: Int32Array = (0..ARRAY_LENGTH) | ||
| .map(|_| { | ||
| rng.random_bool(1.0 - null_percent) | ||
| .then(|| rng.random_range(0..1000)) | ||
| }) | ||
| .collect(); | ||
|
|
||
| // Generate list_size list columns; each non-null cell copies the needle with probability match_percent | ||
| let list_cols: Vec<ArrayRef> = (0..list_size) | ||
| .map(|_| { | ||
| let col: Int32Array = (0..ARRAY_LENGTH) | ||
| .map(|row| { | ||
| if rng.random_bool(1.0 - null_percent) { | ||
| if rng.random_bool(match_percent) { | ||
| // Copy from values to create a match (a null needle gets a random 0..1000 value instead) | ||
| if values.is_null(row) { | ||
| Some(rng.random_range(0..1000)) | ||
| } else { | ||
| Some(values.value(row)) | ||
| } | ||
| } else { | ||
| // Random value in 1000..2000: outside the needle range 0..1000, so it never matches | ||
| Some(rng.random_range(1000..2000)) | ||
| } | ||
| } else { | ||
| None | ||
| } | ||
| }) | ||
| .collect(); | ||
| Arc::new(col) as ArrayRef | ||
| }) | ||
| .collect(); | ||
|
|
||
| do_bench_dynamic( | ||
| c, | ||
| &format!( | ||
| "in_list_dynamic/Int32/list={}/match={}%/nulls={}%", | ||
| list_size, | ||
| (match_percent * 100.0) as u32, | ||
| (null_percent * 100.0) as u32 | ||
| ), | ||
| Arc::new(values), | ||
| &list_cols, | ||
| ); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Benchmarks the dynamic IN list path for Utf8 arrays with column references, once per list size / match rate combination. | ||
| fn bench_dynamic_utf8(c: &mut Criterion) { | ||
| let mut rng = StdRng::seed_from_u64(99); | ||
|
|
||
| for list_size in DYNAMIC_LIST_LENGTHS { | ||
| for match_percent in MATCH_PERCENTS { | ||
| // Generate the "needle" column: random_string(.., 12) values, null with probability 0.2 | ||
| let value_strings: Vec<Option<String>> = (0..ARRAY_LENGTH) | ||
| .map(|_| rng.random_bool(0.8).then(|| random_string(&mut rng, 12))) | ||
| .collect(); | ||
| let values: StringArray = | ||
| value_strings.iter().map(|s| s.as_deref()).collect(); | ||
|
|
||
| // Generate list_size list columns; each cell copies the needle with probability match_percent | ||
| let list_cols: Vec<ArrayRef> = (0..list_size) | ||
| .map(|_| { | ||
| let col: StringArray = (0..ARRAY_LENGTH) | ||
| .map(|row| { | ||
| if rng.random_bool(match_percent) { | ||
| // Copy from values to create a match (a null needle is copied as a null) | ||
| value_strings[row].as_deref() | ||
| } else { | ||
| Some("no_match_value_xyz") | ||
| } | ||
| }) | ||
| .collect(); | ||
| Arc::new(col) as ArrayRef | ||
| }) | ||
| .collect(); | ||
|
|
||
| do_bench_dynamic( | ||
| c, | ||
| &format!( | ||
| "in_list_dynamic/Utf8/list={}/match={}%", | ||
| list_size, | ||
| (match_percent * 100.0) as u32, | ||
| ), | ||
| Arc::new(values), | ||
| &list_cols, | ||
| ); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| /// Entry point: registers in_list benchmarks for string and numeric array types. | ||
| fn criterion_benchmark(c: &mut Criterion) { | ||
| let mut rng = StdRng::seed_from_u64(120320); | ||
|
|
@@ -266,6 +407,10 @@ fn criterion_benchmark(c: &mut Criterion) { | |
| |rng| rng.random(), | ||
| |v| ScalarValue::TimestampNanosecond(Some(v), None), | ||
| ); | ||
|
|
||
| // Dynamic path benchmarks (non-constant list expressions) | ||
| bench_dynamic_int32(c); | ||
| bench_dynamic_utf8(c); | ||
| } | ||
|
|
||
| criterion_group! { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does `('a', 1, 123.24)` also force this "dynamic" path? If so, I would use the term "heterogeneous" for that. If not, and it's only columns that trigger this code path, I would use the term `LIST_WITH_COLUMNS_LENGTHS`.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No, only column references trigger this code path. Heterogeneous literals like 1 IN ('a', 1, 123.24) are type-coerced and still go through the static (HashSet) path. Renamed to LIST_WITH_COLUMNS_LENGTHS, and also renamed all related functions/benchmark names to remove the "dynamic" terminology.