Skip to content

Commit 279440b

Browse files
authored
Do not resort inputs to UnionExec if they are already sorted (#4946)
* Do not resort inputs to Union if they are already sorted
* Remove debugging
1 parent 84ba3c2 commit 279440b

File tree

2 files changed

+103
-1
lines changed

2 files changed

+103
-1
lines changed

datafusion/core/src/physical_optimizer/sort_enforcement.rs

Lines changed: 79 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -486,15 +486,19 @@ fn check_alignment(
486486
#[cfg(test)]
487487
mod tests {
488488
use super::*;
489+
use crate::datasource::listing::PartitionedFile;
490+
use crate::datasource::object_store::ObjectStoreUrl;
489491
use crate::physical_plan::displayable;
492+
use crate::physical_plan::file_format::{FileScanConfig, ParquetExec};
490493
use crate::physical_plan::filter::FilterExec;
491494
use crate::physical_plan::memory::MemoryExec;
492495
use crate::physical_plan::sorts::sort_preserving_merge::SortPreservingMergeExec;
496+
use crate::physical_plan::union::UnionExec;
493497
use crate::physical_plan::windows::create_window_expr;
494498
use crate::prelude::SessionContext;
495499
use arrow::compute::SortOptions;
496500
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
497-
use datafusion_common::Result;
501+
use datafusion_common::{Result, Statistics};
498502
use datafusion_expr::{AggregateFunction, WindowFrame, WindowFunction};
499503
use datafusion_physical_expr::expressions::{col, NotExpr};
500504
use datafusion_physical_expr::PhysicalSortExpr;
@@ -813,6 +817,33 @@ mod tests {
813817
Ok(())
814818
}
815819

820+
#[tokio::test]
821+
async fn test_union_inputs_sorted() -> Result<()> {
822+
let schema = create_test_schema()?;
823+
824+
let source1 = parquet_exec(&schema);
825+
let sort_exprs = vec![sort_expr("nullable_col", &schema)];
826+
let sort = sort_exec(sort_exprs.clone(), source1);
827+
828+
let source2 = parquet_exec_sorted(&schema, sort_exprs.clone());
829+
830+
let union = union_exec(vec![source2, sort]);
831+
let physical_plan = sort_preserving_merge_exec(sort_exprs, union);
832+
833+
// one input to the union is already sorted, one is not.
834+
let expected_input = vec![
835+
"SortPreservingMergeExec: [nullable_col@0 ASC]",
836+
" UnionExec",
837+
" ParquetExec: limit=None, partitions={1 group: [[x]]}, output_ordering=[nullable_col@0 ASC], projection=[nullable_col, non_nullable_col]",
838+
" SortExec: [nullable_col@0 ASC]",
839+
" ParquetExec: limit=None, partitions={1 group: [[x]]}, projection=[nullable_col, non_nullable_col]",
840+
];
841+
// should not add a sort at the output of the union, input plan should not be changed
842+
let expected_optimized = expected_input.clone();
843+
assert_optimized!(expected_input, expected_optimized, physical_plan);
844+
Ok(())
845+
}
846+
816847
/// make PhysicalSortExpr with default options
817848
fn sort_expr(name: &str, schema: &Schema) -> PhysicalSortExpr {
818849
sort_expr_options(name, schema, SortOptions::default())
@@ -856,4 +887,51 @@ mod tests {
856887
) -> Arc<dyn ExecutionPlan> {
857888
Arc::new(FilterExec::try_new(predicate, input).unwrap())
858889
}
890+
891+
/// Create a non sorted parquet exec
892+
fn parquet_exec(schema: &SchemaRef) -> Arc<ParquetExec> {
893+
Arc::new(ParquetExec::new(
894+
FileScanConfig {
895+
object_store_url: ObjectStoreUrl::parse("test:///").unwrap(),
896+
file_schema: schema.clone(),
897+
file_groups: vec![vec![PartitionedFile::new("x".to_string(), 100)]],
898+
statistics: Statistics::default(),
899+
projection: None,
900+
limit: None,
901+
table_partition_cols: vec![],
902+
output_ordering: None,
903+
infinite_source: false,
904+
},
905+
None,
906+
None,
907+
))
908+
}
909+
910+
// Created a sorted parquet exec
911+
fn parquet_exec_sorted(
912+
schema: &SchemaRef,
913+
sort_exprs: impl IntoIterator<Item = PhysicalSortExpr>,
914+
) -> Arc<ParquetExec> {
915+
let sort_exprs = sort_exprs.into_iter().collect();
916+
917+
Arc::new(ParquetExec::new(
918+
FileScanConfig {
919+
object_store_url: ObjectStoreUrl::parse("test:///").unwrap(),
920+
file_schema: schema.clone(),
921+
file_groups: vec![vec![PartitionedFile::new("x".to_string(), 100)]],
922+
statistics: Statistics::default(),
923+
projection: None,
924+
limit: None,
925+
table_partition_cols: vec![],
926+
output_ordering: Some(sort_exprs),
927+
infinite_source: false,
928+
},
929+
None,
930+
None,
931+
))
932+
}
933+
934+
fn union_exec(input: Vec<Arc<dyn ExecutionPlan>>) -> Arc<dyn ExecutionPlan> {
935+
Arc::new(UnionExec::new(input))
936+
}
859937
}

datafusion/core/src/physical_plan/union.rs

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -247,6 +247,30 @@ impl ExecutionPlan for UnionExec {
247247
}
248248
}
249249

250+
fn maintains_input_order(&self) -> bool {
251+
let first_input_ordering = self.inputs[0].output_ordering();
252+
// If the Union is not partition aware and all the input
253+
// ordering spec strictly equal with the first_input_ordering,
254+
// then the `UnionExec` maintains the input order
255+
//
256+
// It might be too strict here in the case that the input
257+
// ordering are compatible but not exactly the same. See
258+
// comments in output_ordering
259+
!self.partition_aware
260+
&& first_input_ordering.is_some()
261+
&& self
262+
.inputs
263+
.iter()
264+
.map(|plan| plan.output_ordering())
265+
.all(|ordering| {
266+
ordering.is_some()
267+
&& sort_expr_list_eq_strict_order(
268+
ordering.unwrap(),
269+
first_input_ordering.unwrap(),
270+
)
271+
})
272+
}
273+
250274
fn with_new_children(
251275
self: Arc<Self>,
252276
children: Vec<Arc<dyn ExecutionPlan>>,

0 commit comments

Comments
 (0)