Skip to content

Commit af2c460

Browse files
pepijnvealamb
andauthored
Add FilterBuilder::is_optimize_beneficial (#8782)
# Which issue does this PR close? - Closes #8781. # Rationale for this change See problem statement in #8781 # What changes are included in this PR? - The internal function `multiple_arrays` has been moved to `FilterBuilder` and made public - The function has been renamed to `is_optimize_beneficial` to make it intention revealing rather than a description of the implementation # Are these changes tested? No functional changes # Are there any user-facing changes? No breaking changes --------- Co-authored-by: Andrew Lamb <[email protected]>
1 parent 4b33252 commit af2c460

File tree

1 file changed

+30
-15
lines changed

1 file changed

+30
-15
lines changed

arrow-select/src/filter.rs

Lines changed: 30 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray {
131131
/// If multiple arrays (or record batches) need to be filtered using the same predicate array,
132132
/// consider using [FilterBuilder] to create a single [FilterPredicate] and then
133133
/// calling [FilterPredicate::filter_record_batch].
134+
///
134135
/// In contrast to this function, it is then the responsibility of the caller
135136
/// to use [FilterBuilder::optimize] if appropriate.
136137
///
@@ -155,7 +156,7 @@ pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray {
155156
pub fn filter(values: &dyn Array, predicate: &BooleanArray) -> Result<ArrayRef, ArrowError> {
156157
let mut filter_builder = FilterBuilder::new(predicate);
157158

158-
if multiple_arrays(values.data_type()) {
159+
if FilterBuilder::is_optimize_beneficial(values.data_type()) {
159160
// Only optimize if filtering more than one array
160161
// Otherwise, the overhead of optimization can be more than the benefit
161162
filter_builder = filter_builder.optimize();
@@ -166,16 +167,6 @@ pub fn filter(values: &dyn Array, predicate: &BooleanArray) -> Result<ArrayRef,
166167
filter_array(values, &predicate)
167168
}
168169

169-
fn multiple_arrays(data_type: &DataType) -> bool {
170-
match data_type {
171-
DataType::Struct(fields) => {
172-
fields.len() > 1 || fields.len() == 1 && multiple_arrays(fields[0].data_type())
173-
}
174-
DataType::Union(fields, UnionMode::Sparse) => !fields.is_empty(),
175-
_ => false,
176-
}
177-
}
178-
179170
/// Returns a filtered [RecordBatch] where the corresponding elements of
180171
/// `predicate` are true.
181172
///
@@ -193,7 +184,10 @@ pub fn filter_record_batch(
193184
let mut filter_builder = FilterBuilder::new(predicate);
194185
let num_cols = record_batch.num_columns();
195186
if num_cols > 1
196-
|| (num_cols > 0 && multiple_arrays(record_batch.schema_ref().field(0).data_type()))
187+
|| (num_cols > 0
188+
&& FilterBuilder::is_optimize_beneficial(
189+
record_batch.schema_ref().field(0).data_type(),
190+
))
197191
{
198192
// Only optimize if filtering more than one column or if the column contains multiple internal arrays
199193
// Otherwise, the overhead of optimization can be more than the benefit
@@ -230,11 +224,16 @@ impl FilterBuilder {
230224
}
231225
}
232226

233-
/// Compute an optimised representation of the provided `filter` mask that can be
227+
/// Compute an optimized representation of the provided `filter` mask that can be
234228
/// applied to an array more quickly.
235229
///
236-
/// Note: There is limited benefit to calling this to then filter a single array
237-
/// Note: This will likely have a larger memory footprint than the original mask
230+
/// When filtering multiple arrays (e.g. a [`RecordBatch`] or a
231+
/// [`StructArray`] with multiple fields), optimizing the filter can provide
232+
/// significant performance benefits.
233+
///
234+
/// However, optimization takes time and can have a larger memory footprint
235+
/// than the original mask, so it is often faster to filter a single array,
236+
/// without filter optimization.
238237
pub fn optimize(mut self) -> Self {
239238
match self.strategy {
240239
IterationStrategy::SlicesIterator => {
@@ -250,6 +249,22 @@ impl FilterBuilder {
250249
self
251250
}
252251

252+
/// Determines if calling [FilterBuilder::optimize] is beneficial for the
253+
/// given type even when filtering just a single array.
254+
///
255+
/// See [`FilterBuilder::optimize`] for more details.
256+
pub fn is_optimize_beneficial(data_type: &DataType) -> bool {
257+
match data_type {
258+
DataType::Struct(fields) => {
259+
fields.len() > 1
260+
|| fields.len() == 1
261+
&& FilterBuilder::is_optimize_beneficial(fields[0].data_type())
262+
}
263+
DataType::Union(fields, UnionMode::Sparse) => !fields.is_empty(),
264+
_ => false,
265+
}
266+
}
267+
253268
/// Construct the final `FilterPredicate`
254269
pub fn build(self) -> FilterPredicate {
255270
FilterPredicate {

0 commit comments

Comments
 (0)