Commit 67efa2b

use bloom filters to push down hash table lookups in HashJoinExec

1 parent 4c4b24a
5 files changed: +117 −44 lines

datafusion/core/tests/physical_optimizer/filter_pushdown/mod.rs

Lines changed: 6 additions & 6 deletions
@@ -278,7 +278,7 @@ async fn test_dynamic_filter_pushdown_through_hash_join_with_topk() {
 - SortExec: TopK(fetch=2), expr=[e@4 ASC], preserve_partitioning=[false], filter=[e@4 IS NULL OR e@4 < bb]
 -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, d@0)]
 -     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
--     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= aa AND d@0 <= ab ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ]
+-     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, e, f], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= aa AND d@0 <= ab AND d@0 IN BLOOM_FILTER ] AND DynamicFilter [ e@1 IS NULL OR e@1 < bb ]
 "
 );
 }

@@ -1078,7 +1078,7 @@ async fn test_hashjoin_dynamic_filter_pushdown() {
 @r"
 - HashJoinExec: mode=CollectLeft, join_type=Inner, on=[(a@0, a@0), (b@1, b@1)]
 -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
--   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ]
+-   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND a@0 IN BLOOM_FILTER AND b@1 >= ba AND b@1 <= bb AND b@1 IN BLOOM_FILTER ]
 "
 );
 }

@@ -1309,7 +1309,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_partitioned() {
 -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
 -   CoalesceBatchesExec: target_batch_size=8192
 -     RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
--       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= ab AND a@0 <= ab AND b@1 >= bb AND b@1 <= bb OR a@0 >= aa AND a@0 <= aa AND b@1 >= ba AND b@1 <= ba ]
+-       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= ab AND a@0 <= ab AND a@0 IN BLOOM_FILTER AND b@1 >= bb AND b@1 <= bb AND b@1 IN BLOOM_FILTER OR a@0 >= aa AND a@0 <= aa AND a@0 IN BLOOM_FILTER AND b@1 >= ba AND b@1 <= ba AND b@1 IN BLOOM_FILTER ]
 "
 );

@@ -1503,7 +1503,7 @@ async fn test_hashjoin_dynamic_filter_pushdown_collect_left() {
 -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, c], file_type=test, pushdown_supported=true
 -   CoalesceBatchesExec: target_batch_size=8192
 -     RepartitionExec: partitioning=Hash([a@0, b@1], 12), input_partitions=1
--       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND b@1 >= ba AND b@1 <= bb ]
+-       DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, b, e], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ a@0 >= aa AND a@0 <= ab AND a@0 IN BLOOM_FILTER AND b@1 >= ba AND b@1 <= bb AND b@1 IN BLOOM_FILTER ]
 "
 );

@@ -1671,8 +1671,8 @@ async fn test_nested_hashjoin_dynamic_filter_pushdown() {
 - HashJoinExec: mode=Partitioned, join_type=Inner, on=[(a@0, b@0)]
 -   DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[a, x], file_type=test, pushdown_supported=true
 -   HashJoinExec: mode=Partitioned, join_type=Inner, on=[(c@1, d@0)]
--     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@0 >= aa AND b@0 <= ab ]
--     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= ca AND d@0 <= cb ]
+-     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[b, c, y], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ b@0 >= aa AND b@0 <= ab AND b@0 IN BLOOM_FILTER ]
+-     DataSourceExec: file_groups={1 group: [[test.parquet]]}, projection=[d, z], file_type=test, pushdown_supported=true, predicate=DynamicFilter [ d@0 >= ca AND d@0 <= cb AND d@0 IN BLOOM_FILTER ]
 "
 );
 }
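Note on the `IN BLOOM_FILTER` term that now appears in these snapshots: a bloom filter probe can return false positives but never false negatives, which is what makes it safe to AND into a pushed-down dynamic filter: it only discards probe rows whose join key is provably absent from the build side. A minimal sketch of the row-level semantics, with a hypothetical `BloomLike` type standing in for the real `Sbbf` (illustrative, not DataFusion's API):

use std::collections::HashSet;

// Sketch of `d >= min AND d <= max AND d IN BLOOM_FILTER` evaluated per row.
// Any membership test with no false negatives is safe here; an exact HashSet
// is simply the zero-false-positive extreme of that contract.
struct BloomLike(HashSet<i64>);

impl BloomLike {
    fn might_contain(&self, v: i64) -> bool {
        self.0.contains(&v)
    }
}

fn keep_row(d: i64, min: i64, max: i64, bloom: &BloomLike) -> bool {
    d >= min && d <= max && bloom.might_contain(d)
}

fn main() {
    let bloom = BloomLike([10, 25].into_iter().collect());
    assert!(keep_row(10, 10, 25, &bloom)); // key present on the build side
    assert!(!keep_row(17, 10, 25, &bloom)); // inside min/max, still pruned
}

A real bloom filter occasionally lets an absent key through; the join itself still drops such rows, so results are unchanged and only the pruning becomes less effective.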

datafusion/physical-expr/src/expressions/bloom_filter_expr.rs

Lines changed: 5 additions & 0 deletions
@@ -257,6 +257,11 @@ impl BloomFilterExpr {
         }
     }

+    /// Get a reference to the underlying bloom filter
+    pub fn bloom_filter(&self) -> &Sbbf {
+        &self.bloom_filter
+    }
+
     /// Check a scalar value against the bloom filter
     fn check_scalar(&self, value: &ScalarValue) -> bool {
         if value.is_null() {
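The new `bloom_filter()` accessor exposes the underlying `Sbbf` (split-block bloom filter, the same structure Parquet uses) so the hash join can clone the finished filter out of the expression. For readers unfamiliar with the structure, here is a toy bloom filter showing the general insert/check contract behind it; this is a sketch with k seeded hashes, not the split-block implementation:

use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Toy bloom filter: set k bit positions on insert, require all k on lookup.
// Inserted values therefore always pass (no false negatives); absent values
// pass only if all k of their positions happen to be set (a false positive).
struct ToyBloom {
    bits: Vec<bool>,
    k: u64, // number of hash functions
}

impl ToyBloom {
    fn new(num_bits: usize, k: u64) -> Self {
        Self { bits: vec![false; num_bits], k }
    }

    fn index<T: Hash>(&self, value: &T, seed: u64) -> usize {
        let mut h = DefaultHasher::new();
        seed.hash(&mut h); // derive k "hash functions" from one, via seeds
        value.hash(&mut h);
        (h.finish() % self.bits.len() as u64) as usize
    }

    fn insert<T: Hash>(&mut self, value: &T) {
        for seed in 0..self.k {
            let idx = self.index(value, seed);
            self.bits[idx] = true;
        }
    }

    fn check<T: Hash>(&self, value: &T) -> bool {
        (0..self.k).all(|seed| self.bits[self.index(value, seed)])
    }
}

fn main() {
    let mut f = ToyBloom::new(1024, 7);
    f.insert(&"join_key_1");
    assert!(f.check(&"join_key_1")); // inserted keys always pass
    // absent keys usually fail the check, passing only with probability ~FPP
}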

datafusion/physical-plan/src/joins/hash_join/exec.rs

Lines changed: 43 additions & 20 deletions
@@ -53,6 +53,7 @@ use crate::{
     DisplayAs, DisplayFormatType, Distribution, ExecutionPlan, Partitioning,
     PlanProperties, SendableRecordBatchStream, Statistics,
 };
+use datafusion_physical_expr::bloom_filter::Sbbf;

 use arrow::array::{ArrayRef, BooleanBufferBuilder};
 use arrow::compute::concat_batches;

@@ -72,7 +73,9 @@ use datafusion_functions_aggregate_common::min_max::{MaxAccumulator, MinAccumulator};
 use datafusion_physical_expr::equivalence::{
     join_equivalence_properties, ProjectionMapping,
 };
-use datafusion_physical_expr::expressions::{lit, DynamicFilterPhysicalExpr};
+use datafusion_physical_expr::expressions::{
+    lit, BloomFilterBuilder, DynamicFilterPhysicalExpr,
+};
 use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef};

 use ahash::RandomState;

@@ -104,10 +107,13 @@ pub(super) struct JoinLeftData {
     _reservation: MemoryReservation,
     /// Bounds computed from the build side for dynamic filter pushdown
     pub(super) bounds: Option<Vec<ColumnBounds>>,
+    /// Bloom filters computed from the build side for dynamic filter pushdown
+    pub(super) bloom_filters: Option<Vec<Sbbf>>,
 }

 impl JoinLeftData {
     /// Create a new `JoinLeftData` from its parts
+    #[allow(clippy::too_many_arguments)]
     pub(super) fn new(
         hash_map: Box<dyn JoinHashMapType>,
         batch: RecordBatch,

@@ -116,6 +122,7 @@ impl JoinLeftData {
         probe_threads_counter: AtomicUsize,
         reservation: MemoryReservation,
         bounds: Option<Vec<ColumnBounds>>,
+        bloom_filters: Option<Vec<Sbbf>>,
     ) -> Self {
         Self {
             hash_map,

@@ -125,6 +132,7 @@
             probe_threads_counter,
             _reservation: reservation,
             bounds,
+            bloom_filters,
         }
     }

@@ -1207,14 +1215,14 @@ impl ExecutionPlan for HashJoinExec {
     }
 }

-/// Accumulator for collecting min/max bounds from build-side data during hash join.
+/// Accumulator for collecting min/max bounds and bloom filters from build-side data during hash join.
 ///
 /// This struct encapsulates the logic for progressively computing column bounds
-/// (minimum and maximum values) for a specific join key expression as batches
+/// (minimum and maximum values) and bloom filters for a specific join key expression as batches
 /// are processed during the build phase of a hash join.
 ///
-/// The bounds are used for dynamic filter pushdown optimization, where filters
-/// based on the actual data ranges can be pushed down to the probe side to
+/// The bounds and bloom filters are used for dynamic filter pushdown optimization, where filters
+/// based on the actual data ranges and membership can be pushed down to the probe side to
 /// eliminate unnecessary data early.
 struct CollectLeftAccumulator {
     /// The physical expression to evaluate for each batch

@@ -1223,6 +1231,8 @@ struct CollectLeftAccumulator {
     min: MinAccumulator,
     /// Accumulator for tracking the maximum value across all batches
     max: MaxAccumulator,
+    /// Bloom filter builder for membership testing
+    bloom_filter: BloomFilterBuilder,
 }

 impl CollectLeftAccumulator {

@@ -1249,17 +1259,23 @@
             .data_type(schema)
             // Min/Max can operate on dictionary data but expect to be initialized with the underlying value type
             .map(|dt| dictionary_value_type(&dt))?;
+
+        // Create bloom filter with default parameters
+        // NDV (number of distinct values) = 10000, FPP (false positive probability) = 0.01 (1%)
+        let bloom_filter = BloomFilterBuilder::new(10000, 0.01)?;
+
         Ok(Self {
             expr,
             min: MinAccumulator::try_new(&data_type)?,
             max: MaxAccumulator::try_new(&data_type)?,
+            bloom_filter,
         })
     }

     /// Updates the accumulators with values from a new batch.
     ///
-    /// Evaluates the expression on the batch and updates both min and max
-    /// accumulators with the resulting values.
+    /// Evaluates the expression on the batch and updates min, max, and bloom filter
+    /// with the resulting values.
     ///
     /// # Arguments
     /// * `batch` - The record batch to process
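The hardcoded NDV = 10000 and FPP = 0.01 pin down the filter's size through the standard bloom filter sizing formulas m = -n·ln(p) / (ln 2)^2 bits and k = (m/n)·ln 2 hash functions. A quick sketch of that arithmetic (the general formulas, not this commit's code; Sbbf's block layout differs slightly but targets the same trade-off):

// Standard bloom filter sizing implied by NDV = 10_000 and FPP = 0.01.
fn bloom_size_bits(ndv: f64, fpp: f64) -> f64 {
    -ndv * fpp.ln() / 2f64.ln().powi(2)
}

fn optimal_hash_count(bits: f64, ndv: f64) -> f64 {
    (bits / ndv) * 2f64.ln()
}

fn main() {
    let bits = bloom_size_bits(10_000.0, 0.01);
    println!("~{bits:.0} bits (~{:.1} KiB)", bits / 8.0 / 1024.0);
    // prints ~95851 bits (~11.7 KiB)
    println!("~{:.1} hash functions", optimal_hash_count(bits, 10_000.0));
    // prints ~6.6, rounded up to 7 in practice
}

Build sides with many more than 10000 distinct keys will see the effective false positive rate degrade, which is the usual caveat of fixing NDV up front.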
@@ -1270,20 +1286,24 @@
         let array = self.expr.evaluate(batch)?.into_array(batch.num_rows())?;
         self.min.update_batch(std::slice::from_ref(&array))?;
         self.max.update_batch(std::slice::from_ref(&array))?;
+        // Insert values into bloom filter
+        self.bloom_filter.insert_array(&array)?;
         Ok(())
     }

-    /// Finalizes the accumulation and returns the computed bounds.
+    /// Finalizes the accumulation and returns the computed bounds and bloom filter.
     ///
-    /// Consumes self to extract the final min and max values from the accumulators.
+    /// Consumes self to extract the final min and max values from the accumulators
+    /// and the built bloom filter.
     ///
     /// # Returns
-    /// The `ColumnBounds` containing the minimum and maximum values observed
-    fn evaluate(mut self) -> Result<ColumnBounds> {
-        Ok(ColumnBounds::new(
-            self.min.evaluate()?,
-            self.max.evaluate()?,
-        ))
+    /// A tuple of (`ColumnBounds`, `Sbbf`) containing the minimum/maximum values and bloom filter
+    fn evaluate(mut self) -> Result<(ColumnBounds, Sbbf)> {
+        let bounds = ColumnBounds::new(self.min.evaluate()?, self.max.evaluate()?);
+        let bloom_filter_expr = self.bloom_filter.finish(Arc::clone(&self.expr));
+        // Extract the Sbbf from the BloomFilterExpr
+        let bloom_filter = bloom_filter_expr.bloom_filter().clone();
+        Ok((bounds, bloom_filter))
     }
 }

@@ -1475,16 +1495,18 @@ async fn collect_left_input(
         })
         .collect::<Result<Vec<_>>>()?;

-    // Compute bounds for dynamic filter if enabled
-    let bounds = match bounds_accumulators {
+    // Compute bounds and bloom filters for dynamic filter if enabled
+    let (bounds, bloom_filters) = match bounds_accumulators {
         Some(accumulators) if num_rows > 0 => {
-            let bounds = accumulators
+            let results: Vec<_> = accumulators
                 .into_iter()
                 .map(CollectLeftAccumulator::evaluate)
                 .collect::<Result<Vec<_>>>()?;
-            Some(bounds)
+            // Separate bounds and bloom filters
+            let (bounds, bloom_filters): (Vec<_>, Vec<_>) = results.into_iter().unzip();
+            (Some(bounds), Some(bloom_filters))
         }
-        _ => None,
+        _ => (None, None),
     };

     let data = JoinLeftData::new(

@@ -1495,6 +1517,7 @@
         AtomicUsize::new(probe_threads_count),
         reservation,
         bounds,
+        bloom_filters,
     );

     Ok(data)
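Taken together, the build-side flow is: one accumulator per join key folds every batch into (min, max, bloom), and the final `unzip` splits the per-key pairs into the two parallel vectors handed to `JoinLeftData::new`. A simplified sketch of that accumulate-then-unzip shape, with plain integers and a `HashSet` standing in for the bloom builder (toy types, not the real `MinAccumulator`/`MaxAccumulator`/`BloomFilterBuilder`):

use std::collections::HashSet;

#[derive(Debug)]
struct Bounds { min: i64, max: i64 }

// Stand-in for CollectLeftAccumulator: one per join key expression.
struct KeyAccumulator {
    min: i64,
    max: i64,
    seen: HashSet<i64>, // stand-in for the bloom filter builder
}

impl KeyAccumulator {
    fn new() -> Self {
        Self { min: i64::MAX, max: i64::MIN, seen: HashSet::new() }
    }

    fn update_batch(&mut self, values: &[i64]) {
        for &v in values {
            self.min = self.min.min(v);
            self.max = self.max.max(v);
            self.seen.insert(v); // real code: self.bloom_filter.insert_array(&array)
        }
    }

    fn evaluate(self) -> (Bounds, HashSet<i64>) {
        (Bounds { min: self.min, max: self.max }, self.seen)
    }
}

fn main() {
    let mut accs = vec![KeyAccumulator::new(), KeyAccumulator::new()];
    accs[0].update_batch(&[3, 7, 5]); // join key 0 over one batch
    accs[1].update_batch(&[40, 10]); // join key 1 over one batch
    // Mirrors `results.into_iter().unzip()` in the diff above:
    let (bounds, filters): (Vec<_>, Vec<_>) =
        accs.into_iter().map(KeyAccumulator::evaluate).unzip();
    println!("{bounds:?}, {} filters", filters.len());
}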

datafusion/physical-plan/src/joins/hash_join/shared_bounds.rs

Lines changed: 57 additions & 17 deletions
@@ -27,7 +27,10 @@ use crate::ExecutionPlanProperties;

 use datafusion_common::{Result, ScalarValue};
 use datafusion_expr::Operator;
-use datafusion_physical_expr::expressions::{lit, BinaryExpr, DynamicFilterPhysicalExpr};
+use datafusion_physical_expr::bloom_filter::Sbbf;
+use datafusion_physical_expr::expressions::{
+    lit, BinaryExpr, BloomFilterExpr, DynamicFilterPhysicalExpr,
+};
 use datafusion_physical_expr::{PhysicalExpr, PhysicalExprRef};

 use itertools::Itertools;

@@ -51,21 +54,29 @@ impl ColumnBounds {
 }

 /// Represents the bounds for all join key columns from a single partition.
-/// This contains the min/max values computed from one partition's build-side data.
+/// This contains the min/max values and bloom filters computed from one partition's build-side data.
 #[derive(Debug, Clone)]
 pub(crate) struct PartitionBounds {
     /// Partition identifier for debugging and determinism (not strictly necessary)
     partition: usize,
     /// Min/max bounds for each join key column in this partition.
     /// Index corresponds to the join key expression index.
     column_bounds: Vec<ColumnBounds>,
+    /// Bloom filters for each join key column in this partition.
+    /// Index corresponds to the join key expression index.
+    bloom_filters: Vec<Sbbf>,
 }

 impl PartitionBounds {
-    pub(crate) fn new(partition: usize, column_bounds: Vec<ColumnBounds>) -> Self {
+    pub(crate) fn new(
+        partition: usize,
+        column_bounds: Vec<ColumnBounds>,
+        bloom_filters: Vec<Sbbf>,
+    ) -> Self {
         Self {
             partition,
             column_bounds,
+            bloom_filters,
         }
     }

@@ -76,6 +87,10 @@
     pub(crate) fn get_column_bounds(&self, index: usize) -> Option<&ColumnBounds> {
         self.column_bounds.get(index)
     }
+
+    pub(crate) fn get_bloom_filter(&self, index: usize) -> Option<&Sbbf> {
+        self.bloom_filters.get(index)
+    }
 }

 /// Coordinates dynamic filter bounds collection across multiple partitions
@@ -175,15 +190,15 @@ impl SharedBoundsAccumulator {
         }
     }

-    /// Create a filter expression from individual partition bounds using OR logic.
+    /// Create a filter expression from individual partition bounds and bloom filters using OR logic.
     ///
-    /// This creates a filter where each partition's bounds form a conjunction (AND)
-    /// of column range predicates, and all partitions are combined with OR.
+    /// This creates a filter where each partition's bounds and bloom filters form a conjunction (AND)
+    /// of column range predicates and bloom filter checks, and all partitions are combined with OR.
     ///
     /// For example, with 2 partitions and 2 columns:
-    /// ((col0 >= p0_min0 AND col0 <= p0_max0 AND col1 >= p0_min1 AND col1 <= p0_max1)
+    /// ((col0 >= p0_min0 AND col0 <= p0_max0 AND col0 IN BLOOM_FILTER_0 AND col1 >= p0_min1 AND col1 <= p0_max1 AND col1 IN BLOOM_FILTER_1)
     /// OR
-    /// (col0 >= p1_min0 AND col0 <= p1_max0 AND col1 >= p1_min1 AND col1 <= p1_max1))
+    /// (col0 >= p1_min0 AND col0 <= p1_max0 AND col0 IN BLOOM_FILTER_0 AND col1 >= p1_min1 AND col1 <= p1_max1 AND col1 IN BLOOM_FILTER_1))
     pub(crate) fn create_filter_from_partition_bounds(
         &self,
         bounds: &[PartitionBounds],

@@ -196,7 +211,7 @@
         let mut partition_predicates = Vec::with_capacity(bounds.len());

         for partition_bounds in bounds.iter().sorted_by_key(|b| b.partition) {
-            // Create range predicates for each join key in this partition
+            // Create range predicates and bloom filter checks for each join key in this partition
             let mut column_predicates = Vec::with_capacity(partition_bounds.len());

             for (col_idx, right_expr) in self.on_right.iter().enumerate() {

@@ -215,7 +230,28 @@
                     let range_expr =
                         Arc::new(BinaryExpr::new(min_expr, Operator::And, max_expr))
                             as Arc<dyn PhysicalExpr>;
-                    column_predicates.push(range_expr);
+
+                    // Create bloom filter check: col IN BLOOM_FILTER
+                    if let Some(bloom_filter) = partition_bounds.get_bloom_filter(col_idx)
+                    {
+                        let bloom_expr = Arc::new(BloomFilterExpr::new(
+                            Arc::clone(right_expr),
+                            bloom_filter.clone(),
+                        ))
+                            as Arc<dyn PhysicalExpr>;
+
+                        // Combine range and bloom filter: (range_expr AND bloom_expr)
+                        let combined_expr = Arc::new(BinaryExpr::new(
+                            range_expr,
+                            Operator::And,
+                            bloom_expr,
+                        ))
+                            as Arc<dyn PhysicalExpr>;
+                        column_predicates.push(combined_expr);
+                    } else {
+                        // If no bloom filter, just use range expression
+                        column_predicates.push(range_expr);
+                    }
                 }
             }

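The filter built here is an OR of per-partition conjunctions: each partition ANDs its range and bloom terms per key, and partitions are disjoined so a probe row survives when any partition could contain its key. A minimal sketch of that OR-of-ANDs fold, with closures standing in for the `PhysicalExpr` tree (illustrative only):

// Each partition contributes an AND of per-column predicates; partitions
// are then OR'd, mirroring create_filter_from_partition_bounds.
type Pred = Box<dyn Fn(&[i64]) -> bool>;

fn and_all(preds: Vec<Pred>) -> Pred {
    Box::new(move |row: &[i64]| preds.iter().all(|p| p(row)))
}

fn or_all(preds: Vec<Pred>) -> Pred {
    Box::new(move |row: &[i64]| preds.iter().any(|p| p(row)))
}

fn main() {
    // Partition 0 saw keys in [1, 5]; partition 1 saw keys in [10, 20].
    let p0 = and_all(vec![
        Box::new(|r: &[i64]| r[0] >= 1),
        Box::new(|r: &[i64]| r[0] <= 5),
    ]);
    let p1 = and_all(vec![
        Box::new(|r: &[i64]| r[0] >= 10),
        Box::new(|r: &[i64]| r[0] <= 20),
    ]);
    let filter = or_all(vec![p0, p1]);
    assert!(filter(&[3])); // covered by partition 0
    assert!(filter(&[15])); // covered by partition 1
    assert!(!filter(&[7])); // between the two ranges: pruned
}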
@@ -247,8 +283,8 @@
     /// Report bounds from a completed partition and update dynamic filter if all partitions are done
     ///
     /// This method coordinates the dynamic filter updates across all partitions. It stores the
-    /// bounds from the current partition, increments the completion counter, and when all
-    /// partitions have reported, creates an OR'd filter from individual partition bounds.
+    /// bounds and bloom filters from the current partition, increments the completion counter, and when all
+    /// partitions have reported, creates an OR'd filter from individual partition bounds and bloom filters.
     ///
     /// This method is async and uses a [`tokio::sync::Barrier`] to wait for all partitions
     /// to report their bounds. Once that occurs, the method will resolve for all callers and the

@@ -264,16 +300,18 @@
     /// # Arguments
     /// * `left_side_partition_id` - The identifier for the **left-side** partition reporting its bounds
     /// * `partition_bounds` - The bounds computed by this partition (if any)
+    /// * `bloom_filters` - The bloom filters computed by this partition (if any)
     ///
     /// # Returns
     /// * `Result<()>` - Ok if successful, Err if filter update failed
     pub(crate) async fn report_partition_bounds(
         &self,
         left_side_partition_id: usize,
         partition_bounds: Option<Vec<ColumnBounds>>,
+        bloom_filters: Option<Vec<Sbbf>>,
     ) -> Result<()> {
-        // Store bounds in the accumulator - this runs once per partition
-        if let Some(bounds) = partition_bounds {
+        // Store bounds and bloom filters in the accumulator - this runs once per partition
+        if let (Some(bounds), Some(filters)) = (partition_bounds, bloom_filters) {
             let mut guard = self.inner.lock();

             let should_push = if let Some(last_bound) = guard.bounds.last() {

@@ -286,9 +324,11 @@
         };

         if should_push {
-            guard
-                .bounds
-                .push(PartitionBounds::new(left_side_partition_id, bounds));
+            guard.bounds.push(PartitionBounds::new(
+                left_side_partition_id,
+                bounds,
+                filters,
+            ));
         }
     }

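The report path relies on a barrier so the combined filter is built exactly once, only after every partition has contributed its bounds and filters. A minimal sketch of that coordination pattern with tokio (illustrative and detached from the DataFusion types; assumes a tokio runtime with the `sync` and `macros` features):

use std::sync::{Arc, Mutex};
use tokio::sync::Barrier;

// N partitions push results under a lock, wait on a barrier, and the single
// "leader" waiter performs the merge step, echoing report_partition_bounds.
#[tokio::main]
async fn main() {
    let n = 4;
    let results = Arc::new(Mutex::new(Vec::new()));
    let barrier = Arc::new(Barrier::new(n));

    let handles: Vec<_> = (0..n)
        .map(|partition| {
            let results = Arc::clone(&results);
            let barrier = Arc::clone(&barrier);
            tokio::spawn(async move {
                // Each partition reports once (toy payload).
                results.lock().unwrap().push((partition, partition * 10));
                // wait() resolves for everyone only after all n have arrived.
                if barrier.wait().await.is_leader() {
                    let all = results.lock().unwrap();
                    println!("leader merges {} partition results", all.len());
                }
            })
        })
        .collect();

    for h in handles {
        h.await.unwrap();
    }
}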