Skip to content
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
145 changes: 145 additions & 0 deletions datafusion/physical-expr/benches/in_list.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ use arrow::datatypes::{Field, Schema};
use arrow::record_batch::RecordBatch;
use criterion::{Criterion, criterion_group, criterion_main};
use datafusion_common::ScalarValue;
use datafusion_physical_expr::PhysicalExpr;
use datafusion_physical_expr::expressions::{col, in_list, lit};
use rand::distr::Alphanumeric;
use rand::prelude::*;
Expand Down Expand Up @@ -50,7 +51,9 @@ fn random_string(rng: &mut StdRng, len: usize) -> String {
}

/// IN-list sizes for benchmarks defined outside the visible part of this file
/// (presumably the literal-list benchmarks — TODO confirm against their callers).
const IN_LIST_LENGTHS: [usize; 4] = [3, 8, 28, 100];
/// Number of list columns generated per benchmark case by
/// `bench_dynamic_int32` and `bench_dynamic_utf8`, i.e. how many column
/// references appear inside the `IN (...)` list.
const DYNAMIC_LIST_LENGTHS: [usize; 3] = [3, 8, 28];
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does ('a', 1, 123.24) also force this "dynamic" path? If so, I would use the term "heterogeneous" for that. If not, and it's only columns that trigger this code path, I would use the term "LIST_WITH_COLUMNS_LENGTHS".

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, only column references trigger this code path. Heterogeneous literals like 1 IN ('a', 1, 123.24) are type-coerced and still go through the static (HashSet) path. Renamed to LIST_WITH_COLUMNS_LENGTHS, and also renamed all related functions/benchmark names to remove the "dynamic" terminology.

/// Null rates to benchmark. Despite the name these are fractions in `0.0..=1.0`
/// (they are passed to `random_bool` as `1.0 - null_percent` and multiplied by
/// 100 only when building benchmark names).
const NULL_PERCENTS: [f64; 2] = [0., 0.2];
/// Probabilities (as fractions, not percentages) that a generated list cell is
/// copied from the needle column so that the row matches.
const MATCH_PERCENTS: [f64; 3] = [0.0, 0.5, 1.0];
/// String lengths for benchmarks defined outside the visible part of this file
/// (presumably fed to `random_string` — TODO confirm against their callers).
const STRING_LENGTHS: [usize; 3] = [3, 12, 100];
/// Number of rows in every generated array / record batch.
const ARRAY_LENGTH: usize = 8192;

Expand Down Expand Up @@ -219,6 +222,144 @@ fn bench_realistic_mixed_strings<A>(
}
}

/// Benchmarks the dynamic evaluation path (no static filter) by including
/// a column reference in the IN list, which prevents static filter creation.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to show an example of how the arguments to this function map to the equivalent SQL being benchmarked.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the advice. I have added equivalent SQL examples to the docstring.

/// Registers one Criterion benchmark that evaluates `a IN (b0, b1, ...)`,
/// where every list entry is a column reference rather than a literal.
///
/// Roughly equivalent SQL (only the `IN` predicate itself is evaluated):
/// ```sql
/// CREATE TABLE t (a T, b0 T, b1 T, ...);
/// SELECT a IN (b0, b1, ...) FROM t;
/// ```
///
/// # Arguments
/// * `c` - Criterion instance the benchmark is registered on.
/// * `name` - benchmark id reported by Criterion.
/// * `values` - the needle column; becomes field `a` of the batch.
/// * `list_cols` - one array per list entry; entry `i` becomes field `b{i}`.
///
/// # Panics
/// Panics if a column lookup, the `in_list` construction, the record batch
/// construction, or the expression evaluation returns an error.
fn do_bench_dynamic(
    c: &mut Criterion,
    name: &str,
    values: ArrayRef,
    list_cols: &[ArrayRef],
) {
    let total_columns = list_cols.len() + 1;
    let mut fields = Vec::with_capacity(total_columns);
    let mut columns: Vec<ArrayRef> = Vec::with_capacity(total_columns);

    // Column `a` is the needle that is searched for in the list.
    fields.push(Field::new("a", values.data_type().clone(), true));
    columns.push(values);

    // Each list array becomes a nullable column `b{i}`; remember the names so
    // the matching column expressions can be resolved once the schema exists.
    let mut list_names = Vec::with_capacity(list_cols.len());
    for (idx, array) in list_cols.iter().enumerate() {
        let col_name = format!("b{idx}");
        fields.push(Field::new(&col_name, array.data_type().clone(), true));
        columns.push(Arc::clone(array));
        list_names.push(col_name);
    }

    let schema = Schema::new(fields);

    // The list is made of column references only (no literals).
    let list_exprs: Vec<Arc<dyn PhysicalExpr>> = list_names
        .iter()
        .map(|col_name| col(col_name, &schema).unwrap())
        .collect();
    let needle = col("a", &schema).unwrap();
    let expr = in_list(needle, list_exprs, &false, &schema).unwrap();
    let batch = RecordBatch::try_new(Arc::new(schema), columns).unwrap();

    // Only the evaluation is timed; expression and batch are built up front.
    c.bench_function(name, |bencher| {
        bencher.iter(|| black_box(expr.evaluate(black_box(&batch)).unwrap()))
    });
}

/// Benchmarks the dynamic IN list path for Int32 arrays with column references.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be nice to see examples in this docstring of what the SQL being benchmarked is, e.g.:

// select 1 in x from t;
// where t:
// create table t ...

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Added equivalent SQL examples to both bench_with_columns_int32 and bench_with_columns_utf8:

  /// Equivalent SQL:
  /// ```sql
  /// CREATE TABLE t (a INT, b0 INT, b1 INT, ...);
  /// SELECT * FROM t WHERE a IN (b0, b1, ...);
  /// ```

/// Benchmarks `a IN (b0, b1, ...)` over Int32 columns for every combination of
/// `DYNAMIC_LIST_LENGTHS`, `MATCH_PERCENTS` and `NULL_PERCENTS`.
///
/// Roughly equivalent SQL (only the `IN` predicate itself is evaluated):
/// ```sql
/// CREATE TABLE t (a INT, b0 INT, b1 INT, ...);
/// SELECT * FROM t WHERE a IN (b0, b1, ...);
/// ```
///
/// Data is generated from a fixed seed, so every run benchmarks the same
/// arrays. Needle values are drawn from `0..1000`; non-matching list cells are
/// drawn from `1000..2000`.
fn bench_dynamic_int32(c: &mut Criterion) {
    let mut rng = StdRng::seed_from_u64(42);

    for list_size in DYNAMIC_LIST_LENGTHS {
        for match_percent in MATCH_PERCENTS {
            for null_percent in NULL_PERCENTS {
                let non_null_prob = 1.0 - null_percent;

                // Needle column `a`: null with probability `null_percent`.
                let values: Int32Array = (0..ARRAY_LENGTH)
                    .map(|_| {
                        if rng.random_bool(non_null_prob) {
                            Some(rng.random_range(0..1000))
                        } else {
                            None
                        }
                    })
                    .collect();

                // List columns `b0..b{list_size}`, built one full column at a time.
                let mut list_cols: Vec<ArrayRef> = Vec::with_capacity(list_size);
                for _ in 0..list_size {
                    let column: Int32Array = (0..ARRAY_LENGTH)
                        .map(|row| {
                            if !rng.random_bool(non_null_prob) {
                                return None;
                            }
                            if !rng.random_bool(match_percent) {
                                // 1000..2000 is disjoint from the needle
                                // range, so this cell cannot equal the needle.
                                return Some(rng.random_range(1000..2000));
                            }
                            if values.is_null(row) {
                                // Nothing to copy from a null needle; use a
                                // fresh value from the needle range instead.
                                Some(rng.random_range(0..1000))
                            } else {
                                // Copy the needle so this row matches.
                                Some(values.value(row))
                            }
                        })
                        .collect();
                    list_cols.push(Arc::new(column) as ArrayRef);
                }

                let bench_id = format!(
                    "in_list_dynamic/Int32/list={}/match={}%/nulls={}%",
                    list_size,
                    (match_percent * 100.0) as u32,
                    (null_percent * 100.0) as u32
                );
                do_bench_dynamic(c, &bench_id, Arc::new(values), &list_cols);
            }
        }
    }
}

/// Benchmarks `a IN (b0, b1, ...)` over Utf8 columns for every combination of
/// `DYNAMIC_LIST_LENGTHS` and `MATCH_PERCENTS`.
///
/// Roughly equivalent SQL (only the `IN` predicate itself is evaluated):
/// ```sql
/// CREATE TABLE t (a VARCHAR, b0 VARCHAR, b1 VARCHAR, ...);
/// SELECT * FROM t WHERE a IN (b0, b1, ...);
/// ```
///
/// Data is generated from a fixed seed. Each needle cell is non-null with
/// probability 0.8 and, when non-null, holds `random_string(&mut rng, 12)`.
/// A list cell either copies the needle cell of the same row (including its
/// null) or holds the constant `"no_match_value_xyz"`.
fn bench_dynamic_utf8(c: &mut Criterion) {
    let mut rng = StdRng::seed_from_u64(99);

    for list_size in DYNAMIC_LIST_LENGTHS {
        for match_percent in MATCH_PERCENTS {
            // Needle column `a`, kept as owned strings so the list columns
            // can borrow from it below.
            let mut needle_cells: Vec<Option<String>> = Vec::with_capacity(ARRAY_LENGTH);
            for _ in 0..ARRAY_LENGTH {
                let cell = if rng.random_bool(0.8) {
                    Some(random_string(&mut rng, 12))
                } else {
                    None
                };
                needle_cells.push(cell);
            }
            let values: StringArray =
                needle_cells.iter().map(|cell| cell.as_deref()).collect();

            // List columns `b0..b{list_size}`, built one full column at a time.
            let mut list_cols: Vec<ArrayRef> = Vec::with_capacity(list_size);
            for _ in 0..list_size {
                let column: StringArray = needle_cells
                    .iter()
                    .map(|needle| {
                        if rng.random_bool(match_percent) {
                            // Copy the needle cell so this row matches.
                            needle.as_deref()
                        } else {
                            Some("no_match_value_xyz")
                        }
                    })
                    .collect();
                list_cols.push(Arc::new(column) as ArrayRef);
            }

            let bench_id = format!(
                "in_list_dynamic/Utf8/list={}/match={}%",
                list_size,
                (match_percent * 100.0) as u32,
            );
            do_bench_dynamic(c, &bench_id, Arc::new(values), &list_cols);
        }
    }
}

/// Entry point: registers in_list benchmarks for string and numeric array types.
fn criterion_benchmark(c: &mut Criterion) {
let mut rng = StdRng::seed_from_u64(120320);
Expand Down Expand Up @@ -266,6 +407,10 @@ fn criterion_benchmark(c: &mut Criterion) {
|rng| rng.random(),
|v| ScalarValue::TimestampNanosecond(Some(v), None),
);

// Dynamic path benchmarks (non-constant list expressions)
bench_dynamic_int32(c);
bench_dynamic_utf8(c);
}

criterion_group! {
Expand Down