|
16 | 16 | // under the License.
|
17 | 17 |
|
18 | 18 | use std::collections::HashSet;
|
19 |
| -use std::sync::Arc; |
20 |
| - |
21 |
| -use crate::stats::Precision; |
22 |
| -use arrow::array::UInt64Array; |
23 |
| -use arrow::datatypes::FieldRef; |
24 | 19 | use arrow::{
|
25 | 20 | array::{ArrayRef, BooleanArray},
|
26 |
| - datatypes::{Schema, SchemaRef}, |
27 | 21 | };
|
28 | 22 |
|
29 | 23 | use crate::Column;
|
30 |
| -use crate::{ScalarValue, Statistics}; |
| 24 | +use crate::ScalarValue; |
31 | 25 |
|
32 | 26 | /// A source of runtime statistical information to [`PruningPredicate`]s.
|
33 | 27 | ///
|
@@ -131,324 +125,3 @@ pub trait PruningStatistics {
|
131 | 125 | ) -> Option<BooleanArray>;
|
132 | 126 | }
|
133 | 127 |
|
134 |
| -/// Prune files based on their partition values. |
135 |
| -/// This is used both at planning time and execution time to prune |
136 |
| -/// files based on their partition values. |
137 |
| -/// This feeds into [`CompositePruningStatistics`] to allow pruning |
138 |
| -/// with filters that depend both on partition columns and data columns |
139 |
| -/// (e.g. `WHERE partition_col = data_col`). |
140 |
| -pub struct PartitionPruningStatistics { |
141 |
| - /// Values for each column for each container. |
142 |
| - /// The outer vectors represent the columns while the inner |
143 |
| - /// vectors represent the containers. |
144 |
| - /// The order must match the order of the partition columns in |
145 |
| - /// [`PartitionPruningStatistics::partition_schema`]. |
146 |
| - partition_values: Vec<Vec<ScalarValue>>, |
147 |
| - /// The number of containers. |
148 |
| - /// Stored since the partition values are column-major and if |
149 |
| - /// there are no columns we wouldn't know the number of containers. |
150 |
| - num_containers: usize, |
151 |
| - /// The schema of the partition columns. |
152 |
| - /// This must **not** be the schema of the entire file or table: |
153 |
| - /// it must only be the schema of the partition columns, |
154 |
| - /// in the same order as the values in [`PartitionPruningStatistics::partition_values`]. |
155 |
| - partition_schema: SchemaRef, |
156 |
| -} |
157 |
| - |
158 |
| -impl PartitionPruningStatistics { |
159 |
| - /// Create a new instance of [`PartitionPruningStatistics`]. |
160 |
| - /// |
161 |
| - /// Args: |
162 |
| - /// * `partition_values`: A vector of vectors of [`ScalarValue`]s. |
163 |
| - /// The outer vector represents the containers while the inner |
164 |
| - /// vector represents the partition values for each column. |
165 |
| - /// Note that this is the **opposite** of the order of the |
166 |
| - /// partition columns in `PartitionPruningStatistics::partition_schema`. |
167 |
| - /// * `partition_schema`: The schema of the partition columns. |
168 |
| - /// This must **not** be the schema of the entire file or table: |
169 |
| - /// instead it must only be the schema of the partition columns, |
170 |
| - /// in the same order as the values in `partition_values`. |
171 |
| - pub fn new( |
172 |
| - partition_values: Vec<Vec<ScalarValue>>, |
173 |
| - partition_fields: Vec<FieldRef>, |
174 |
| - ) -> Self { |
175 |
| - let num_containers = partition_values.len(); |
176 |
| - let partition_schema = Arc::new(Schema::new(partition_fields)); |
177 |
| - let mut partition_valeus_by_column = |
178 |
| - vec![vec![]; partition_schema.fields().len()]; |
179 |
| - for partition_value in partition_values.iter() { |
180 |
| - for (i, value) in partition_value.iter().enumerate() { |
181 |
| - partition_valeus_by_column[i].push(value.clone()); |
182 |
| - } |
183 |
| - } |
184 |
| - Self { |
185 |
| - partition_values: partition_valeus_by_column, |
186 |
| - num_containers, |
187 |
| - partition_schema, |
188 |
| - } |
189 |
| - } |
190 |
| -} |
191 |
| - |
192 |
| -impl PruningStatistics for PartitionPruningStatistics { |
193 |
| - fn min_values(&self, column: &Column) -> Option<ArrayRef> { |
194 |
| - let index = self.partition_schema.index_of(column.name()).ok()?; |
195 |
| - let partition_values = self.partition_values.get(index)?; |
196 |
| - let mut values = Vec::with_capacity(self.partition_values.len()); |
197 |
| - for partition_value in partition_values { |
198 |
| - match partition_value { |
199 |
| - ScalarValue::Null => values.push(ScalarValue::Null), |
200 |
| - _ => values.push(partition_value.clone()), |
201 |
| - } |
202 |
| - } |
203 |
| - match ScalarValue::iter_to_array(values) { |
204 |
| - Ok(array) => Some(array), |
205 |
| - Err(_) => { |
206 |
| - log::warn!( |
207 |
| - "Failed to convert min values to array for column {}", |
208 |
| - column.name() |
209 |
| - ); |
210 |
| - None |
211 |
| - } |
212 |
| - } |
213 |
| - } |
214 |
| - |
215 |
| - fn max_values(&self, column: &Column) -> Option<ArrayRef> { |
216 |
| - self.min_values(column) |
217 |
| - } |
218 |
| - |
219 |
| - fn num_containers(&self) -> usize { |
220 |
| - self.num_containers |
221 |
| - } |
222 |
| - |
223 |
| - fn null_counts(&self, _column: &Column) -> Option<ArrayRef> { |
224 |
| - None |
225 |
| - } |
226 |
| - |
227 |
| - fn row_counts(&self, _column: &Column) -> Option<ArrayRef> { |
228 |
| - None |
229 |
| - } |
230 |
| - |
231 |
| - fn contained( |
232 |
| - &self, |
233 |
| - column: &Column, |
234 |
| - values: &HashSet<ScalarValue>, |
235 |
| - ) -> Option<BooleanArray> { |
236 |
| - let index = self.partition_schema.index_of(column.name()).ok()?; |
237 |
| - let partition_values = self.partition_values.get(index)?; |
238 |
| - let mut contained = Vec::with_capacity(self.partition_values.len()); |
239 |
| - for partition_value in partition_values { |
240 |
| - let contained_value = if values.contains(partition_value) { |
241 |
| - Some(true) |
242 |
| - } else { |
243 |
| - Some(false) |
244 |
| - }; |
245 |
| - contained.push(contained_value); |
246 |
| - } |
247 |
| - let array = BooleanArray::from(contained); |
248 |
| - Some(array) |
249 |
| - } |
250 |
| -} |
251 |
| - |
252 |
| -/// Prune a set of containers represented by their statistics. |
253 |
| -/// Each [`Statistics`] represents a container (e.g. a file or a partition of files). |
254 |
| -pub struct PrunableStatistics { |
255 |
| - /// Statistics for each container. |
256 |
| - statistics: Vec<Arc<Statistics>>, |
257 |
| - /// The schema of the file these statistics are for. |
258 |
| - schema: SchemaRef, |
259 |
| -} |
260 |
| - |
261 |
| -impl PrunableStatistics { |
262 |
| - /// Create a new instance of [`PrunableStatistics`]. |
263 |
| - /// Each [`Statistics`] represents a container (e.g. a file or a partition of files). |
264 |
| - /// The `schema` is the schema of the data in the containers and should apply to all files. |
265 |
| - pub fn new(statistics: Vec<Arc<Statistics>>, schema: SchemaRef) -> Self { |
266 |
| - Self { statistics, schema } |
267 |
| - } |
268 |
| -} |
269 |
| - |
270 |
| -impl PruningStatistics for PrunableStatistics { |
271 |
| - fn min_values(&self, column: &Column) -> Option<ArrayRef> { |
272 |
| - let index = self.schema.index_of(column.name()).ok()?; |
273 |
| - let mut values = Vec::with_capacity(self.statistics.len()); |
274 |
| - for stats in &self.statistics { |
275 |
| - let stat = stats.column_statistics.get(index)?; |
276 |
| - match &stat.min_value { |
277 |
| - Precision::Exact(min) => { |
278 |
| - values.push(min.clone()); |
279 |
| - } |
280 |
| - _ => values.push(ScalarValue::Null), |
281 |
| - } |
282 |
| - } |
283 |
| - match ScalarValue::iter_to_array(values) { |
284 |
| - Ok(array) => Some(array), |
285 |
| - Err(_) => { |
286 |
| - log::warn!( |
287 |
| - "Failed to convert min values to array for column {}", |
288 |
| - column.name() |
289 |
| - ); |
290 |
| - None |
291 |
| - } |
292 |
| - } |
293 |
| - } |
294 |
| - |
295 |
| - fn max_values(&self, column: &Column) -> Option<ArrayRef> { |
296 |
| - let index = self.schema.index_of(column.name()).ok()?; |
297 |
| - let mut values = Vec::with_capacity(self.statistics.len()); |
298 |
| - for stats in &self.statistics { |
299 |
| - let stat = stats.column_statistics.get(index)?; |
300 |
| - match &stat.max_value { |
301 |
| - Precision::Exact(max) => { |
302 |
| - values.push(max.clone()); |
303 |
| - } |
304 |
| - _ => values.push(ScalarValue::Null), |
305 |
| - } |
306 |
| - } |
307 |
| - match ScalarValue::iter_to_array(values) { |
308 |
| - Ok(array) => Some(array), |
309 |
| - Err(_) => { |
310 |
| - log::warn!( |
311 |
| - "Failed to convert max values to array for column {}", |
312 |
| - column.name() |
313 |
| - ); |
314 |
| - None |
315 |
| - } |
316 |
| - } |
317 |
| - } |
318 |
| - |
319 |
| - fn num_containers(&self) -> usize { |
320 |
| - self.statistics.len() |
321 |
| - } |
322 |
| - |
323 |
| - fn null_counts(&self, column: &Column) -> Option<ArrayRef> { |
324 |
| - let index = self.schema.index_of(column.name()).ok()?; |
325 |
| - let mut values = Vec::with_capacity(self.statistics.len()); |
326 |
| - let mut has_null_count = false; |
327 |
| - for stats in &self.statistics { |
328 |
| - let stat = stats.column_statistics.get(index)?; |
329 |
| - match &stat.null_count { |
330 |
| - Precision::Exact(null_count) => match u64::try_from(*null_count) { |
331 |
| - Ok(null_count) => { |
332 |
| - has_null_count = true; |
333 |
| - values.push(Some(null_count)); |
334 |
| - } |
335 |
| - Err(_) => { |
336 |
| - values.push(None); |
337 |
| - } |
338 |
| - }, |
339 |
| - _ => values.push(None), |
340 |
| - } |
341 |
| - } |
342 |
| - if has_null_count { |
343 |
| - Some(Arc::new(UInt64Array::from(values))) |
344 |
| - } else { |
345 |
| - None |
346 |
| - } |
347 |
| - } |
348 |
| - |
349 |
| - fn row_counts(&self, _column: &Column) -> Option<ArrayRef> { |
350 |
| - let mut values = Vec::with_capacity(self.statistics.len()); |
351 |
| - let mut has_row_count = false; |
352 |
| - for stats in &self.statistics { |
353 |
| - match &stats.num_rows { |
354 |
| - Precision::Exact(row_count) => match u64::try_from(*row_count) { |
355 |
| - Ok(row_count) => { |
356 |
| - has_row_count = true; |
357 |
| - values.push(Some(row_count)); |
358 |
| - } |
359 |
| - Err(_) => { |
360 |
| - values.push(None); |
361 |
| - } |
362 |
| - }, |
363 |
| - _ => values.push(None), |
364 |
| - } |
365 |
| - } |
366 |
| - if has_row_count { |
367 |
| - Some(Arc::new(UInt64Array::from(values))) |
368 |
| - } else { |
369 |
| - None |
370 |
| - } |
371 |
| - } |
372 |
| - |
373 |
| - fn contained( |
374 |
| - &self, |
375 |
| - _column: &Column, |
376 |
| - _values: &HashSet<ScalarValue>, |
377 |
| - ) -> Option<BooleanArray> { |
378 |
| - None |
379 |
| - } |
380 |
| -} |
381 |
| - |
382 |
| -/// Combine multiple [`PruningStatistics`] into a single |
383 |
| -/// [`CompositePruningStatistics`]. |
384 |
| -/// This can be used to combine statistics from different sources, |
385 |
| -/// for example partition values and file statistics. |
386 |
| -/// This allows pruning with filters that depend on multiple sources of statistics, |
387 |
| -/// such as `WHERE partition_col = data_col`. |
388 |
| -pub struct CompositePruningStatistics { |
389 |
| - pub statistics: Vec<Box<dyn PruningStatistics>>, |
390 |
| -} |
391 |
| - |
392 |
| -impl CompositePruningStatistics { |
393 |
| - /// Create a new instance of [`CompositePruningStatistics`] from |
394 |
| - /// a vector of [`PruningStatistics`]. |
395 |
| - pub fn new(statistics: Vec<Box<dyn PruningStatistics>>) -> Self { |
396 |
| - assert!(!statistics.is_empty()); |
397 |
| - Self { statistics } |
398 |
| - } |
399 |
| -} |
400 |
| - |
401 |
| -impl PruningStatistics for CompositePruningStatistics { |
402 |
| - fn min_values(&self, column: &Column) -> Option<ArrayRef> { |
403 |
| - for stats in &self.statistics { |
404 |
| - if let Some(array) = stats.min_values(column) { |
405 |
| - return Some(array); |
406 |
| - } |
407 |
| - } |
408 |
| - None |
409 |
| - } |
410 |
| - |
411 |
| - fn max_values(&self, column: &Column) -> Option<ArrayRef> { |
412 |
| - for stats in &self.statistics { |
413 |
| - if let Some(array) = stats.max_values(column) { |
414 |
| - return Some(array); |
415 |
| - } |
416 |
| - } |
417 |
| - None |
418 |
| - } |
419 |
| - |
420 |
| - fn num_containers(&self) -> usize { |
421 |
| - self.statistics[0].num_containers() |
422 |
| - } |
423 |
| - |
424 |
| - fn null_counts(&self, column: &Column) -> Option<ArrayRef> { |
425 |
| - for stats in &self.statistics { |
426 |
| - if let Some(array) = stats.null_counts(column) { |
427 |
| - return Some(array); |
428 |
| - } |
429 |
| - } |
430 |
| - None |
431 |
| - } |
432 |
| - |
433 |
| - fn row_counts(&self, column: &Column) -> Option<ArrayRef> { |
434 |
| - for stats in &self.statistics { |
435 |
| - if let Some(array) = stats.row_counts(column) { |
436 |
| - return Some(array); |
437 |
| - } |
438 |
| - } |
439 |
| - None |
440 |
| - } |
441 |
| - |
442 |
| - fn contained( |
443 |
| - &self, |
444 |
| - column: &Column, |
445 |
| - values: &HashSet<ScalarValue>, |
446 |
| - ) -> Option<BooleanArray> { |
447 |
| - for stats in &self.statistics { |
448 |
| - if let Some(array) = stats.contained(column, values) { |
449 |
| - return Some(array); |
450 |
| - } |
451 |
| - } |
452 |
| - None |
453 |
| - } |
454 |
| -} |
0 commit comments