@@ -131,6 +131,7 @@ pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray {
131131/// If multiple arrays (or record batches) need to be filtered using the same predicate array,
132132/// consider using [FilterBuilder] to create a single [FilterPredicate] and then
133133/// calling [FilterPredicate::filter_record_batch].
134+ ///
134135/// In contrast to this function, it is then the responsibility of the caller
135136/// to use [FilterBuilder::optimize] if appropriate.
136137///
@@ -155,7 +156,7 @@ pub fn prep_null_mask_filter(filter: &BooleanArray) -> BooleanArray {
155156pub fn filter ( values : & dyn Array , predicate : & BooleanArray ) -> Result < ArrayRef , ArrowError > {
156157 let mut filter_builder = FilterBuilder :: new ( predicate) ;
157158
158- if multiple_arrays ( values. data_type ( ) ) {
159+ if FilterBuilder :: is_optimize_beneficial ( values. data_type ( ) ) {
159160 // Only optimize if filtering more than one array
160161 // Otherwise, the overhead of optimization can be more than the benefit
161162 filter_builder = filter_builder. optimize ( ) ;
@@ -166,16 +167,6 @@ pub fn filter(values: &dyn Array, predicate: &BooleanArray) -> Result<ArrayRef,
166167 filter_array ( values, & predicate)
167168}
168169
169- fn multiple_arrays ( data_type : & DataType ) -> bool {
170- match data_type {
171- DataType :: Struct ( fields) => {
172- fields. len ( ) > 1 || fields. len ( ) == 1 && multiple_arrays ( fields[ 0 ] . data_type ( ) )
173- }
174- DataType :: Union ( fields, UnionMode :: Sparse ) => !fields. is_empty ( ) ,
175- _ => false ,
176- }
177- }
178-
179170/// Returns a filtered [RecordBatch] where the corresponding elements of
180171/// `predicate` are true.
181172///
@@ -193,7 +184,10 @@ pub fn filter_record_batch(
193184 let mut filter_builder = FilterBuilder :: new ( predicate) ;
194185 let num_cols = record_batch. num_columns ( ) ;
195186 if num_cols > 1
196- || ( num_cols > 0 && multiple_arrays ( record_batch. schema_ref ( ) . field ( 0 ) . data_type ( ) ) )
187+ || ( num_cols > 0
188+ && FilterBuilder :: is_optimize_beneficial (
189+ record_batch. schema_ref ( ) . field ( 0 ) . data_type ( ) ,
190+ ) )
197191 {
198192 // Only optimize if filtering more than one column or if the column contains multiple internal arrays
199193 // Otherwise, the overhead of optimization can be more than the benefit
@@ -230,11 +224,16 @@ impl FilterBuilder {
230224 }
231225 }
232226
233- /// Compute an optimised representation of the provided `filter` mask that can be
227+ /// Compute an optimized representation of the provided `filter` mask that can be
234228 /// applied to an array more quickly.
235229 ///
236- /// Note: There is limited benefit to calling this to then filter a single array
237- /// Note: This will likely have a larger memory footprint than the original mask
230+ /// When filtering multiple arrays (e.g. a [`RecordBatch`] or a
231+ /// [`StructArray`] with multiple fields), optimizing the filter can provide
232+ /// significant performance benefits.
233+ ///
234+ /// However, optimization takes time and can have a larger memory footprint
235+ /// than the original mask, so it is often faster to filter a single array,
236+ /// without filter optimization.
238237 pub fn optimize ( mut self ) -> Self {
239238 match self . strategy {
240239 IterationStrategy :: SlicesIterator => {
@@ -250,6 +249,22 @@ impl FilterBuilder {
250249 self
251250 }
252251
252+ /// Determines if calling [FilterBuilder::optimize] is beneficial for the
253+ /// given type even when filtering just a single array.
254+ ///
255+ /// See [`FilterBuilder::optimize`] for more details.
256+ pub fn is_optimize_beneficial ( data_type : & DataType ) -> bool {
257+ match data_type {
258+ DataType :: Struct ( fields) => {
259+ fields. len ( ) > 1
260+ || fields. len ( ) == 1
261+ && FilterBuilder :: is_optimize_beneficial ( fields[ 0 ] . data_type ( ) )
262+ }
263+ DataType :: Union ( fields, UnionMode :: Sparse ) => !fields. is_empty ( ) ,
264+ _ => false ,
265+ }
266+ }
267+
253268 /// Construct the final `FilterPredicate`
254269 pub fn build ( self ) -> FilterPredicate {
255270 FilterPredicate {
0 commit comments