
Commit f4aa096

remove new code
1 parent bf9bf7d commit f4aa096

1 file changed: +1 −328 lines

datafusion/common/src/pruning.rs

+1 −328
@@ -16,18 +16,12 @@
 // under the License.

 use std::collections::HashSet;
-use std::sync::Arc;
-
-use crate::stats::Precision;
-use arrow::array::UInt64Array;
-use arrow::datatypes::FieldRef;
 use arrow::{
     array::{ArrayRef, BooleanArray},
-    datatypes::{Schema, SchemaRef},
 };

 use crate::Column;
-use crate::{ScalarValue, Statistics};
+use crate::ScalarValue;

 /// A source of runtime statistical information to [`PruningPredicate`]s.
 ///
@@ -131,324 +125,3 @@ pub trait PruningStatistics {
     ) -> Option<BooleanArray>;
 }

-/// Prune files based on their partition values.
-/// This is used both at planning time and execution time to prune
-/// files based on their partition values.
-/// This feeds into [`CompositePruningStatistics`] to allow pruning
-/// with filters that depend both on partition columns and data columns
-/// (e.g. `WHERE partition_col = data_col`).
-pub struct PartitionPruningStatistics {
-    /// Values for each column for each container.
-    /// The outer vectors represent the columns while the inner
-    /// vectors represent the containers.
-    /// The order must match the order of the partition columns in
-    /// [`PartitionPruningStatistics::partition_schema`].
-    partition_values: Vec<Vec<ScalarValue>>,
-    /// The number of containers.
-    /// Stored since the partition values are column-major and if
-    /// there are no columns we wouldn't know the number of containers.
-    num_containers: usize,
-    /// The schema of the partition columns.
-    /// This must **not** be the schema of the entire file or table:
-    /// it must only be the schema of the partition columns,
-    /// in the same order as the values in [`PartitionPruningStatistics::partition_values`].
-    partition_schema: SchemaRef,
-}
-
-impl PartitionPruningStatistics {
-    /// Create a new instance of [`PartitionPruningStatistics`].
-    ///
-    /// Args:
-    /// * `partition_values`: A vector of vectors of [`ScalarValue`]s.
-    ///   The outer vector represents the containers while the inner
-    ///   vector represents the partition values for each column.
-    ///   Note that this is the **opposite** of the order of the
-    ///   partition columns in `PartitionPruningStatistics::partition_schema`.
-    /// * `partition_schema`: The schema of the partition columns.
-    ///   This must **not** be the schema of the entire file or table:
-    ///   instead it must only be the schema of the partition columns,
-    ///   in the same order as the values in `partition_values`.
-    pub fn new(
-        partition_values: Vec<Vec<ScalarValue>>,
-        partition_fields: Vec<FieldRef>,
-    ) -> Self {
-        let num_containers = partition_values.len();
-        let partition_schema = Arc::new(Schema::new(partition_fields));
-        let mut partition_valeus_by_column =
-            vec![vec![]; partition_schema.fields().len()];
-        for partition_value in partition_values.iter() {
-            for (i, value) in partition_value.iter().enumerate() {
-                partition_valeus_by_column[i].push(value.clone());
-            }
-        }
-        Self {
-            partition_values: partition_valeus_by_column,
-            num_containers,
-            partition_schema,
-        }
-    }
-}
-
-impl PruningStatistics for PartitionPruningStatistics {
-    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-        let index = self.partition_schema.index_of(column.name()).ok()?;
-        let partition_values = self.partition_values.get(index)?;
-        let mut values = Vec::with_capacity(self.partition_values.len());
-        for partition_value in partition_values {
-            match partition_value {
-                ScalarValue::Null => values.push(ScalarValue::Null),
-                _ => values.push(partition_value.clone()),
-            }
-        }
-        match ScalarValue::iter_to_array(values) {
-            Ok(array) => Some(array),
-            Err(_) => {
-                log::warn!(
-                    "Failed to convert min values to array for column {}",
-                    column.name()
-                );
-                None
-            }
-        }
-    }
-
-    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-        self.min_values(column)
-    }
-
-    fn num_containers(&self) -> usize {
-        self.num_containers
-    }
-
-    fn null_counts(&self, _column: &Column) -> Option<ArrayRef> {
-        None
-    }
-
-    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
-        None
-    }
-
-    fn contained(
-        &self,
-        column: &Column,
-        values: &HashSet<ScalarValue>,
-    ) -> Option<BooleanArray> {
-        let index = self.partition_schema.index_of(column.name()).ok()?;
-        let partition_values = self.partition_values.get(index)?;
-        let mut contained = Vec::with_capacity(self.partition_values.len());
-        for partition_value in partition_values {
-            let contained_value = if values.contains(partition_value) {
-                Some(true)
-            } else {
-                Some(false)
-            };
-            contained.push(contained_value);
-        }
-        let array = BooleanArray::from(contained);
-        Some(array)
-    }
-}
-
-/// Prune a set of containers represented by their statistics.
-/// Each [`Statistics`] represents a container (e.g. a file or a partition of files).
-pub struct PrunableStatistics {
-    /// Statistics for each container.
-    statistics: Vec<Arc<Statistics>>,
-    /// The schema of the file these statistics are for.
-    schema: SchemaRef,
-}
-
-impl PrunableStatistics {
-    /// Create a new instance of [`PrunableStatistics`].
-    /// Each [`Statistics`] represents a container (e.g. a file or a partition of files).
-    /// The `schema` is the schema of the data in the containers and should apply to all files.
-    pub fn new(statistics: Vec<Arc<Statistics>>, schema: SchemaRef) -> Self {
-        Self { statistics, schema }
-    }
-}
-
-impl PruningStatistics for PrunableStatistics {
-    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-        let index = self.schema.index_of(column.name()).ok()?;
-        let mut values = Vec::with_capacity(self.statistics.len());
-        for stats in &self.statistics {
-            let stat = stats.column_statistics.get(index)?;
-            match &stat.min_value {
-                Precision::Exact(min) => {
-                    values.push(min.clone());
-                }
-                _ => values.push(ScalarValue::Null),
-            }
-        }
-        match ScalarValue::iter_to_array(values) {
-            Ok(array) => Some(array),
-            Err(_) => {
-                log::warn!(
-                    "Failed to convert min values to array for column {}",
-                    column.name()
-                );
-                None
-            }
-        }
-    }
-
-    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-        let index = self.schema.index_of(column.name()).ok()?;
-        let mut values = Vec::with_capacity(self.statistics.len());
-        for stats in &self.statistics {
-            let stat = stats.column_statistics.get(index)?;
-            match &stat.max_value {
-                Precision::Exact(max) => {
-                    values.push(max.clone());
-                }
-                _ => values.push(ScalarValue::Null),
-            }
-        }
-        match ScalarValue::iter_to_array(values) {
-            Ok(array) => Some(array),
-            Err(_) => {
-                log::warn!(
-                    "Failed to convert max values to array for column {}",
-                    column.name()
-                );
-                None
-            }
-        }
-    }
-
-    fn num_containers(&self) -> usize {
-        self.statistics.len()
-    }
-
-    fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
-        let index = self.schema.index_of(column.name()).ok()?;
-        let mut values = Vec::with_capacity(self.statistics.len());
-        let mut has_null_count = false;
-        for stats in &self.statistics {
-            let stat = stats.column_statistics.get(index)?;
-            match &stat.null_count {
-                Precision::Exact(null_count) => match u64::try_from(*null_count) {
-                    Ok(null_count) => {
-                        has_null_count = true;
-                        values.push(Some(null_count));
-                    }
-                    Err(_) => {
-                        values.push(None);
-                    }
-                },
-                _ => values.push(None),
-            }
-        }
-        if has_null_count {
-            Some(Arc::new(UInt64Array::from(values)))
-        } else {
-            None
-        }
-    }
-
-    fn row_counts(&self, _column: &Column) -> Option<ArrayRef> {
-        let mut values = Vec::with_capacity(self.statistics.len());
-        let mut has_row_count = false;
-        for stats in &self.statistics {
-            match &stats.num_rows {
-                Precision::Exact(row_count) => match u64::try_from(*row_count) {
-                    Ok(row_count) => {
-                        has_row_count = true;
-                        values.push(Some(row_count));
-                    }
-                    Err(_) => {
-                        values.push(None);
-                    }
-                },
-                _ => values.push(None),
-            }
-        }
-        if has_row_count {
-            Some(Arc::new(UInt64Array::from(values)))
-        } else {
-            None
-        }
-    }
-
-    fn contained(
-        &self,
-        _column: &Column,
-        _values: &HashSet<ScalarValue>,
-    ) -> Option<BooleanArray> {
-        None
-    }
-}
-
-/// Combine multiple [`PruningStatistics`] into a single
-/// [`CompositePruningStatistics`].
-/// This can be used to combine statistics from different sources,
-/// for example partition values and file statistics.
-/// This allows pruning with filters that depend on multiple sources of statistics,
-/// such as `WHERE partition_col = data_col`.
-pub struct CompositePruningStatistics {
-    pub statistics: Vec<Box<dyn PruningStatistics>>,
-}
-
-impl CompositePruningStatistics {
-    /// Create a new instance of [`CompositePruningStatistics`] from
-    /// a vector of [`PruningStatistics`].
-    pub fn new(statistics: Vec<Box<dyn PruningStatistics>>) -> Self {
-        assert!(!statistics.is_empty());
-        Self { statistics }
-    }
-}
-
-impl PruningStatistics for CompositePruningStatistics {
-    fn min_values(&self, column: &Column) -> Option<ArrayRef> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.min_values(column) {
-                return Some(array);
-            }
-        }
-        None
-    }
-
-    fn max_values(&self, column: &Column) -> Option<ArrayRef> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.max_values(column) {
-                return Some(array);
-            }
-        }
-        None
-    }
-
-    fn num_containers(&self) -> usize {
-        self.statistics[0].num_containers()
-    }
-
-    fn null_counts(&self, column: &Column) -> Option<ArrayRef> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.null_counts(column) {
-                return Some(array);
-            }
-        }
-        None
-    }
-
-    fn row_counts(&self, column: &Column) -> Option<ArrayRef> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.row_counts(column) {
-                return Some(array);
-            }
-        }
-        None
-    }
-
-    fn contained(
-        &self,
-        column: &Column,
-        values: &HashSet<ScalarValue>,
-    ) -> Option<BooleanArray> {
-        for stats in &self.statistics {
-            if let Some(array) = stats.contained(column, values) {
-                return Some(array);
-            }
-        }
-        None
-    }
-}
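
The doc comments in the removed code describe how the three types were meant to compose: partition-value statistics and file-level statistics both feed into CompositePruningStatistics so that a filter such as `WHERE partition_col = data_col` can prune containers using either source. The following is a minimal sketch of that composition, using only the constructors shown in the diff above; the module path `datafusion_common::pruning` and the use of `Statistics::new_unknown` are assumptions for illustration, not part of this commit.

// Hypothetical sketch of composing the types removed by this commit.
// The import paths and `Statistics::new_unknown` are assumptions about
// the surrounding crate; only the constructors match the diff above.
use std::sync::Arc;

use arrow::datatypes::{DataType, Field, Schema};
use datafusion_common::pruning::{
    CompositePruningStatistics, PartitionPruningStatistics, PrunableStatistics,
    PruningStatistics,
};
use datafusion_common::{ScalarValue, Statistics};

fn main() {
    // One partition column ("date") and two containers (files).
    // `PartitionPruningStatistics::new` takes container-major values and
    // transposes them into the column-major layout it stores internally.
    let partition_fields = vec![Arc::new(Field::new("date", DataType::Utf8, false))];
    let partition_values = vec![
        vec![ScalarValue::from("2024-01-01")], // container 0
        vec![ScalarValue::from("2024-01-02")], // container 1
    ];
    let partition_stats =
        PartitionPruningStatistics::new(partition_values, partition_fields);

    // Per-container statistics over the data columns (left unknown here).
    let file_schema = Arc::new(Schema::new(vec![Field::new(
        "value",
        DataType::Int64,
        true,
    )]));
    let file_stats = vec![
        Arc::new(Statistics::new_unknown(&file_schema)),
        Arc::new(Statistics::new_unknown(&file_schema)),
    ];
    let data_stats = PrunableStatistics::new(file_stats, file_schema);

    // The composite consults each source in order and returns the first
    // answer it gets for a column, so a predicate over both partition and
    // data columns can draw on both sources.
    let sources: Vec<Box<dyn PruningStatistics>> =
        vec![Box::new(partition_stats), Box::new(data_stats)];
    let composite = CompositePruningStatistics::new(sources);
    assert_eq!(composite.num_containers(), 2);
}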

0 commit comments
