-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Evaluate projections at the file source level #18309
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,9 +29,14 @@ use crate::schema_adapter::SchemaAdapterFactory; | |
| use arrow::datatypes::SchemaRef; | ||
| use datafusion_common::config::ConfigOptions; | ||
| use datafusion_common::{not_impl_err, Result, Statistics}; | ||
| use datafusion_physical_expr::expressions::Column; | ||
| use datafusion_physical_expr::projection::ProjectionExprs; | ||
| use datafusion_physical_expr::{LexOrdering, PhysicalExpr}; | ||
| use datafusion_physical_plan::filter_pushdown::{FilterPushdownPropagation, PushedDown}; | ||
| use datafusion_physical_plan::metrics::ExecutionPlanMetricsSet; | ||
| use datafusion_physical_plan::projection::{ | ||
| all_alias_free_columns, new_projections_for_columns, | ||
| }; | ||
| use datafusion_physical_plan::DisplayFormatType; | ||
|
|
||
| use object_store::ObjectStore; | ||
|
|
@@ -129,6 +134,47 @@ pub trait FileSource: Send + Sync { | |
| )) | ||
| } | ||
|
|
||
| /// Tries to absorb a projection into this file source by remapping column indices. | ||
| /// | ||
| /// Returns [`ProjectionPushdownResult::None`] when the projection cannot be pushed down: | ||
| /// either some expression is a [`Column`] whose index is at or beyond the number of | ||
| /// fields in `file_schema`, or `all_alias_free_columns` reports `false` for the projection. | ||
| /// | ||
| /// Otherwise returns [`ProjectionPushdownResult::Partial`] with `new_file_source` and | ||
| /// `remaining_projections` both `None`, and `new_projection_indices` set to the indices | ||
| /// computed by `new_projections_for_columns` against `current_projection` (or against | ||
| /// every column of `file_schema` when `current_projection` is `None`). | ||
| fn try_pushdown_projections( | ||
| &self, | ||
| projection_exprs: &ProjectionExprs, | ||
| file_schema: &SchemaRef, | ||
| current_projection: Option<&[usize]>, | ||
| ) -> Result<ProjectionPushdownResult> { | ||
| let projection_slice: Vec<_> = projection_exprs.iter().cloned().collect(); | ||
|
|
||
| // Check whether the projection references any partition column, i.e. a `Column` whose index lies beyond the file schema. | ||
| let partitioned_columns_in_proj = projection_slice.iter().any(|proj_expr| { | ||
| proj_expr | ||
| .expr | ||
| .as_any() | ||
| .downcast_ref::<Column>() | ||
| .map(|expr| expr.index() >= file_schema.fields().len()) | ||
| .unwrap_or(false) | ||
| }); | ||
|
|
||
| // If there are any non-column or aliased expressions, the projection must not be removed. | ||
| let no_aliases = all_alias_free_columns(&projection_slice); | ||
|
|
||
| if !no_aliases || partitioned_columns_in_proj { | ||
| return Ok(ProjectionPushdownResult::None); | ||
| } | ||
|
|
||
| let all_projections: Vec<usize> = (0..file_schema.fields().len()).collect(); | ||
| let source_projection = current_projection.unwrap_or(&all_projections); | ||
|
|
||
| let new_projection_indices = | ||
| new_projections_for_columns(&projection_slice, source_projection); | ||
|
|
||
| // Return a partial result carrying only the new projection indices. | ||
| // `new_file_source: None` means the file source itself is unchanged; | ||
| // the new projection is instead applied in `FileScanConfig`. | ||
| Ok(ProjectionPushdownResult::Partial { | ||
| new_file_source: None, | ||
| remaining_projections: None, | ||
| new_projection_indices: Some(new_projection_indices), | ||
| }) | ||
| } | ||
|
|
||
| /// Set optional schema adapter factory. | ||
| /// | ||
| /// [`SchemaAdapterFactory`] allows user to specify how fields from the | ||
|
|
@@ -155,3 +201,12 @@ pub trait FileSource: Send + Sync { | |
| None | ||
| } | ||
| } | ||
|
|
||
| pub enum ProjectionPushdownResult { | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. docstrings please |
||
| None, | ||
| Partial { | ||
| new_file_source: Option<Arc<dyn FileSource>>, | ||
| remaining_projections: Option<ProjectionExprs>, | ||
| new_projection_indices: Option<Vec<usize>>, | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm something seems off here to me. In my mind this should be more like: pub struct ProjectionPushdown {
new_file_source: Arc<dyn FileSource>,
remaining_projections: Option<ProjectionExprs>,
}
pub type ProjectionPushdownResult = Option<ProjectionPushdown>; I don't see how it could make sense to have a remaining projection if the source wasn't updated. File sources like Parquet will absorb the entire projection. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about we just return |
||
| }, | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -22,6 +22,7 @@ use std::fmt; | |
| use std::fmt::{Debug, Formatter}; | ||
| use std::sync::Arc; | ||
|
|
||
| use datafusion_physical_expr::projection::ProjectionExprs; | ||
| use datafusion_physical_plan::execution_plan::{ | ||
| Boundedness, EmissionType, SchedulingType, | ||
| }; | ||
|
|
@@ -175,7 +176,7 @@ pub trait DataSource: Send + Sync + Debug { | |
| fn try_swapping_with_projection( | ||
| &self, | ||
| _projection: &[ProjectionExpr], | ||
| ) -> Result<Option<Arc<dyn DataSource>>>; | ||
| ) -> Result<ProjectionPushdownResult>; | ||
| /// Try to push down filters into this DataSource. | ||
| /// See [`ExecutionPlan::handle_child_pushdown_result`] for more details. | ||
| /// | ||
|
|
@@ -191,6 +192,9 @@ pub trait DataSource: Send + Sync + Debug { | |
| } | ||
| } | ||
|
|
||
| pub type ProjectionPushdownResult = | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we please document this type (like what the two fields mean)? I actually think it would be even nicer if this was a real `enum`. Perhaps like: /// Result of evaluating projection pushdown ....
enum ProjectionPushdownResult {
...
}There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense to me. Can I get your thoughts on naming here? I made another There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe if we go with https://github.com/apache/datafusion/pull/18309/files#r2467051055 we can have just 1 enum / structure? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hm, I'm not sure if that is possible. One stores an There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can make them generic There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess what I'm trying to say is that it would be nice to give these types a proper name/alias. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That's what we do for filter pushdown, seems okay |
||
| Option<(Arc<dyn DataSource>, Option<ProjectionExprs>)>; | ||
|
|
||
| /// [`ExecutionPlan`] that reads one or more files | ||
| /// | ||
| /// `DataSourceExec` implements common functionality such as applying | ||
|
|
@@ -321,8 +325,16 @@ impl ExecutionPlan for DataSourceExec { | |
| .data_source | ||
| .try_swapping_with_projection(projection.expr())? | ||
| { | ||
| Some(new_data_source) => { | ||
| Ok(Some(Arc::new(DataSourceExec::new(new_data_source)))) | ||
| Some((new_data_source, remaining_projections)) => { | ||
| let new_exec = Arc::new(DataSourceExec::new(new_data_source)); | ||
| if let Some(remaining_projections) = remaining_projections { | ||
| let new_projection_exec = | ||
| ProjectionExec::try_new(remaining_projections, new_exec)?; | ||
|
|
||
| return Ok(Some(Arc::new(new_projection_exec))); | ||
| } | ||
|
|
||
| Ok(Some(new_exec)) | ||
| } | ||
| None => Ok(None), | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
A docstring would be great