Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion datafusion/core/src/physical_planner.rs
Original file line number Diff line number Diff line change
Expand Up @@ -985,7 +985,7 @@ impl DefaultPhysicalPlanner {
struct_type_columns.clone(),
schema,
options.clone(),
))
)?)
}

// 2 Children
Expand Down
98 changes: 82 additions & 16 deletions datafusion/physical-plan/src/unnest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,8 @@ use crate::{
};

use arrow::array::{
new_null_array, Array, ArrayRef, AsArray, FixedSizeListArray, Int64Array,
LargeListArray, ListArray, PrimitiveArray, Scalar, StructArray,
new_null_array, Array, ArrayRef, AsArray, BooleanBufferBuilder, FixedSizeListArray,
Int64Array, LargeListArray, ListArray, PrimitiveArray, Scalar, StructArray,
};
use arrow::compute::kernels::length::length;
use arrow::compute::kernels::zip::zip;
Expand All @@ -43,16 +43,19 @@ use arrow::record_batch::RecordBatch;
use arrow_ord::cmp::lt;
use async_trait::async_trait;
use datafusion_common::{
exec_datafusion_err, exec_err, internal_err, HashMap, HashSet, Result, UnnestOptions,
exec_datafusion_err, exec_err, internal_err, Constraints, HashMap, HashSet, Result,
UnnestOptions,
};
use datafusion_execution::TaskContext;
use datafusion_physical_expr::EquivalenceProperties;
use datafusion_physical_expr::equivalence::ProjectionMapping;
use datafusion_physical_expr::expressions::Column;
use datafusion_physical_expr::PhysicalExpr;
use futures::{Stream, StreamExt};
use log::trace;

/// Unnest the given columns (either with type struct or list)
/// For list unnesting, each rows is vertically transformed into multiple rows
/// For struct unnesting, each columns is horizontally transformed into multiple columns,
/// For list unnesting, each row is vertically transformed into multiple rows
/// For struct unnesting, each column is horizontally transformed into multiple columns,
Comment on lines -54 to +58
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Grammar fix

/// Thus the original RecordBatch with dimension (n x m) may have new dimension (n' x m')
///
/// See [`UnnestOptions`] for more details and an example.
Expand Down Expand Up @@ -82,31 +85,94 @@ impl UnnestExec {
struct_column_indices: Vec<usize>,
schema: SchemaRef,
options: UnnestOptions,
) -> Self {
let cache = Self::compute_properties(&input, Arc::clone(&schema));
) -> Result<Self> {
let cache = Self::compute_properties(
&input,
&list_column_indices,
&struct_column_indices,
Arc::clone(&schema),
)?;

UnnestExec {
Ok(UnnestExec {
input,
schema,
list_column_indices,
struct_column_indices,
options,
metrics: Default::default(),
cache,
}
})
}

/// Creates the cache object that stores plan properties such as the schema, equivalence properties, ordering, and partitioning.
fn compute_properties(
input: &Arc<dyn ExecutionPlan>,
list_column_indices: &[ListUnnest],
struct_column_indices: &[usize],
schema: SchemaRef,
) -> PlanProperties {
PlanProperties::new(
EquivalenceProperties::new(schema),
input.output_partitioning().to_owned(),
) -> Result<PlanProperties> {
// Find out which indices are not unnested, such that they can be copied over from the input plan
let input_schema = input.schema();
let mut unnested_indices = BooleanBufferBuilder::new(input_schema.fields().len());
unnested_indices.append_n(input_schema.fields().len(), false);
for list_unnest in list_column_indices {
unnested_indices.set_bit(list_unnest.index_in_input_schema, true);
}
for struct_unnest in struct_column_indices {
unnested_indices.set_bit(*struct_unnest, true)
}
let unnested_indices = unnested_indices.finish();
let non_unnested_indices: Vec<usize> = (0..input_schema.fields().len())
.filter(|idx| !unnested_indices.value(*idx))
.collect();

// Manually build projection mapping from non-unnested input columns to their positions in the output
let input_schema = input.schema();
let projection_mapping: ProjectionMapping = non_unnested_indices
.iter()
.map(|&input_idx| {
// Find what index the input column has in the output schema
let input_field = input_schema.field(input_idx);
let output_idx = schema
.fields()
.iter()
.position(|output_field| output_field.name() == input_field.name())
.ok_or_else(|| {
exec_datafusion_err!(
"Non-unnested column '{}' must exist in output schema",
input_field.name()
)
})?;

let input_col = Arc::new(Column::new(input_field.name(), input_idx))
as Arc<dyn PhysicalExpr>;
let target_col = Arc::new(Column::new(input_field.name(), output_idx))
as Arc<dyn PhysicalExpr>;
// Use From<Vec<(Arc<dyn PhysicalExpr>, usize)>> for ProjectionTargets
let targets = vec![(target_col, output_idx)].into();
Ok((input_col, targets))
})
.collect::<Result<ProjectionMapping>>()?;

// Create the unnest's equivalence properties by copying the input plan's equivalence
// properties for the unaffected columns — except the constraints, which are removed
// entirely because the unnest operation invalidates any global uniqueness or primary-key constraints.
let input_eq_properties = input.equivalence_properties();
let eq_properties = input_eq_properties
.project(&projection_mapping, Arc::clone(&schema))
.with_constraints(Constraints::default());

// Output partitioning must use the projection mapping
let output_partitioning = input
.output_partitioning()
.project(&projection_mapping, &eq_properties);

Ok(PlanProperties::new(
eq_properties,
output_partitioning,
input.pipeline_behavior(),
input.boundedness(),
)
))
}

/// Input execution plan
Expand Down Expand Up @@ -173,7 +239,7 @@ impl ExecutionPlan for UnnestExec {
self.struct_column_indices.clone(),
Arc::clone(&self.schema),
self.options.clone(),
)))
)?))
}

fn required_input_distribution(&self) -> Vec<Distribution> {
Expand Down
2 changes: 1 addition & 1 deletion datafusion/proto/src/physical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1757,7 +1757,7 @@ impl protobuf::PhysicalPlanNode {
unnest.struct_type_columns.iter().map(|c| *c as _).collect(),
Arc::new(convert_required!(unnest.schema)?),
into_required!(unnest.options)?,
)))
)?))
}

fn generate_series_name_to_str(name: protobuf::GenerateSeriesName) -> &'static str {
Expand Down
2 changes: 1 addition & 1 deletion datafusion/proto/tests/cases/roundtrip_physical_plan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1714,7 +1714,7 @@ fn roundtrip_unnest() -> Result<()> {
vec![2, 4],
output_schema,
options,
);
)?;
roundtrip_test(Arc::new(unnest))
}

Expand Down
Loading