Skip to content

Commit a56ae74

Browse files
authored
Simplify and encapsulate window function state management (#6621)
* Simplify and encapsulate window function state management * Fix docs * Improve documentation * Improve comment and readability
1 parent d602de2 commit a56ae74

File tree

7 files changed

+47
-96
lines changed

7 files changed

+47
-96
lines changed

datafusion/physical-expr/src/window/built_in.rs

Lines changed: 3 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -24,14 +24,12 @@ use std::sync::Arc;
2424
use super::window_frame_state::WindowFrameContext;
2525
use super::BuiltInWindowFunctionExpr;
2626
use super::WindowExpr;
27-
use crate::window::window_expr::{
28-
BuiltinWindowState, NthValueKind, NthValueState, WindowFn,
29-
};
27+
use crate::window::window_expr::WindowFn;
3028
use crate::window::{
3129
PartitionBatches, PartitionWindowAggStates, WindowAggState, WindowState,
3230
};
3331
use crate::{expressions::PhysicalSortExpr, reverse_order_bys, PhysicalExpr};
34-
use arrow::array::{new_empty_array, Array, ArrayRef};
32+
use arrow::array::{new_empty_array, ArrayRef};
3533
use arrow::compute::SortOptions;
3634
use arrow::datatypes::Field;
3735
use arrow::record_batch::RecordBatch;
@@ -211,13 +209,7 @@ impl WindowExpr for BuiltInWindowExpr {
211209

212210
state.update(&out_col, partition_batch_state)?;
213211
if self.window_frame.start_bound.is_unbounded() {
214-
let mut evaluator_state = evaluator.state()?;
215-
if let BuiltinWindowState::NthValue(nth_value_state) =
216-
&mut evaluator_state
217-
{
218-
memoize_nth_value(state, nth_value_state)?;
219-
evaluator.set_state(&evaluator_state)?;
220-
}
212+
evaluator.memoize(state)?;
221213
}
222214
}
223215
Ok(())
@@ -244,35 +236,3 @@ impl WindowExpr for BuiltInWindowExpr {
244236
|| !self.window_frame.end_bound.is_unbounded())
245237
}
246238
}
247-
248-
// When the window frame has a fixed beginning (e.g UNBOUNDED PRECEDING), for
249-
// FIRST_VALUE, LAST_VALUE and NTH_VALUE functions: we can memoize result.
250-
// Once result is calculated it will always stay same. Hence, we do not
251-
// need to keep past data as we process the entire dataset. This feature
252-
// enables us to prune rows from table.
253-
fn memoize_nth_value(
254-
state: &mut WindowAggState,
255-
nth_value_state: &mut NthValueState,
256-
) -> Result<()> {
257-
let out = &state.out_col;
258-
let size = out.len();
259-
let (is_prunable, new_prunable) = match nth_value_state.kind {
260-
NthValueKind::First => {
261-
let n_range = state.window_frame_range.end - state.window_frame_range.start;
262-
(n_range > 0 && size > 0, true)
263-
}
264-
NthValueKind::Last => (true, false),
265-
NthValueKind::Nth(n) => {
266-
let n_range = state.window_frame_range.end - state.window_frame_range.start;
267-
(n_range >= (n as usize) && size >= (n as usize), true)
268-
}
269-
};
270-
if is_prunable {
271-
if nth_value_state.finalized_result.is_none() && new_prunable {
272-
let result = ScalarValue::try_from_array(out, size - 1)?;
273-
nth_value_state.finalized_result = Some(result);
274-
}
275-
state.window_frame_range.start = state.window_frame_range.end.saturating_sub(1);
276-
}
277-
Ok(())
278-
}

datafusion/physical-expr/src/window/lead_lag.rs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
//! at runtime during query execution
2020
2121
use crate::window::partition_evaluator::PartitionEvaluator;
22-
use crate::window::window_expr::{BuiltinWindowState, LeadLagState};
22+
use crate::window::window_expr::LeadLagState;
2323
use crate::window::{BuiltInWindowFunctionExpr, WindowAggState};
2424
use crate::PhysicalExpr;
2525
use arrow::array::ArrayRef;
@@ -182,11 +182,6 @@ fn shift_with_default_value(
182182
}
183183

184184
impl PartitionEvaluator for WindowShiftEvaluator {
185-
fn state(&self) -> Result<BuiltinWindowState> {
186-
// If we do not use state we just return Default
187-
Ok(BuiltinWindowState::LeadLag(self.state.clone()))
188-
}
189-
190185
fn update_state(
191186
&mut self,
192187
_state: &WindowAggState,

datafusion/physical-expr/src/window/nth_value.rs

Lines changed: 30 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
//! that can be evaluated at runtime during query execution
2020
2121
use crate::window::partition_evaluator::PartitionEvaluator;
22-
use crate::window::window_expr::{BuiltinWindowState, NthValueKind, NthValueState};
22+
use crate::window::window_expr::{NthValueKind, NthValueState};
2323
use crate::window::{BuiltInWindowFunctionExpr, WindowAggState};
2424
use crate::PhysicalExpr;
2525
use arrow::array::{Array, ArrayRef};
@@ -152,11 +152,6 @@ pub(crate) struct NthValueEvaluator {
152152
}
153153

154154
impl PartitionEvaluator for NthValueEvaluator {
155-
fn state(&self) -> Result<BuiltinWindowState> {
156-
// If we do not use state we just return Default
157-
Ok(BuiltinWindowState::NthValue(self.state.clone()))
158-
}
159-
160155
fn update_state(
161156
&mut self,
162157
state: &WindowAggState,
@@ -169,9 +164,35 @@ impl PartitionEvaluator for NthValueEvaluator {
169164
Ok(())
170165
}
171166

172-
fn set_state(&mut self, state: &BuiltinWindowState) -> Result<()> {
173-
if let BuiltinWindowState::NthValue(nth_value_state) = state {
174-
self.state = nth_value_state.clone()
167+
/// When the window frame has a fixed beginning (e.g. UNBOUNDED
168+
/// PRECEDING), for some functions such as FIRST_VALUE, LAST_VALUE and
169+
/// NTH_VALUE we can memoize the result. Once the result is calculated it
170+
/// will always stay the same. Hence, we do not need to keep past data
171+
/// as we process the entire dataset. This feature enables us to
172+
/// prune rows from the table. This implementation records the finalized result where applicable and advances the start of the frame range.
173+
fn memoize(&mut self, state: &mut WindowAggState) -> Result<()> {
174+
let out = &state.out_col;
175+
let size = out.len();
176+
let (is_prunable, is_last) = match self.state.kind {
177+
NthValueKind::First => {
178+
let n_range =
179+
state.window_frame_range.end - state.window_frame_range.start;
180+
(n_range > 0 && size > 0, false)
181+
}
182+
NthValueKind::Last => (true, true),
183+
NthValueKind::Nth(n) => {
184+
let n_range =
185+
state.window_frame_range.end - state.window_frame_range.start;
186+
(n_range >= (n as usize) && size >= (n as usize), false)
187+
}
188+
};
189+
if is_prunable {
190+
if self.state.finalized_result.is_none() && !is_last {
191+
let result = ScalarValue::try_from_array(out, size - 1)?;
192+
self.state.finalized_result = Some(result);
193+
}
194+
state.window_frame_range.start =
195+
state.window_frame_range.end.saturating_sub(1);
175196
}
176197
Ok(())
177198
}

datafusion/physical-expr/src/window/partition_evaluator.rs

Lines changed: 10 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@
1717

1818
//! Partition evaluation module
1919
20-
use crate::window::window_expr::BuiltinWindowState;
2120
use crate::window::WindowAggState;
2221
use arrow::array::ArrayRef;
2322
use datafusion_common::Result;
@@ -78,8 +77,7 @@ use std::ops::Range;
7877
///
7978
/// In this case, [`Self::evaluate_stateful`] is called to calculate
8079
/// the results of the window function incrementally for each new
81-
/// batch, saving and restoring any state needed to do so as
82-
/// [`BuiltinWindowState`].
80+
/// batch.
8381
///
8482
/// For example, when computing `ROW_NUMBER` incrementally,
8583
/// [`Self::evaluate_stateful`] will be called multiple times with
@@ -91,14 +89,6 @@ use std::ops::Range;
9189
/// [`BuiltInWindowFunctionExpr`]: crate::window::BuiltInWindowFunctionExpr
9290
/// [`BuiltInWindowFunctionExpr::create_evaluator`]: crate::window::BuiltInWindowFunctionExpr::create_evaluator
9391
pub trait PartitionEvaluator: Debug + Send {
94-
/// Returns the internal state of the window function
95-
///
96-
/// Only used for stateful evaluation
97-
fn state(&self) -> Result<BuiltinWindowState> {
98-
// If we do not use state we just return Default
99-
Ok(BuiltinWindowState::Default)
100-
}
101-
10292
/// Updates the internal state for window function
10393
///
10494
/// Only used for stateful evaluation
@@ -118,13 +108,16 @@ pub trait PartitionEvaluator: Debug + Send {
118108
Ok(())
119109
}
120110

121-
/// Sets the internal state for window function
111+
/// When the window frame has a fixed beginning (e.g. UNBOUNDED
112+
/// PRECEDING), some functions such as FIRST_VALUE, LAST_VALUE and
113+
/// NTH_VALUE do not need the (unbounded) input once they have
114+
/// seen a certain amount of input.
122115
///
123-
/// Only used for stateful evaluation
124-
fn set_state(&mut self, _state: &BuiltinWindowState) -> Result<()> {
125-
Err(DataFusionError::NotImplemented(
126-
"set_state is not implemented for this window function".to_string(),
127-
))
116+
/// `memoize` is called after each input batch is processed, and
117+
/// such functions can save whatever they need and modify
118+
/// [`WindowAggState`] appropriately to allow rows to be pruned. The default implementation does nothing.
119+
fn memoize(&mut self, _state: &mut WindowAggState) -> Result<()> {
120+
Ok(())
128121
}
129122

130123
/// Gets the range where the window function result is calculated.

datafusion/physical-expr/src/window/rank.rs

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
//! at runtime during query execution
2020
2121
use crate::window::partition_evaluator::PartitionEvaluator;
22-
use crate::window::window_expr::{BuiltinWindowState, RankState};
22+
use crate::window::window_expr::RankState;
2323
use crate::window::{BuiltInWindowFunctionExpr, WindowAggState};
2424
use crate::PhysicalExpr;
2525
use arrow::array::ArrayRef;
@@ -129,10 +129,6 @@ impl PartitionEvaluator for RankEvaluator {
129129
Ok(Range { start, end })
130130
}
131131

132-
fn state(&self) -> Result<BuiltinWindowState> {
133-
Ok(BuiltinWindowState::Rank(self.state.clone()))
134-
}
135-
136132
fn update_state(
137133
&mut self,
138134
state: &WindowAggState,

datafusion/physical-expr/src/window/row_number.rs

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
//! Defines physical expression for `row_number` that can be evaluated at runtime during query execution
1919
2020
use crate::window::partition_evaluator::PartitionEvaluator;
21-
use crate::window::window_expr::{BuiltinWindowState, NumRowsState};
21+
use crate::window::window_expr::NumRowsState;
2222
use crate::window::BuiltInWindowFunctionExpr;
2323
use crate::PhysicalExpr;
2424
use arrow::array::{ArrayRef, UInt64Array};
@@ -76,11 +76,6 @@ pub(crate) struct NumRowsEvaluator {
7676
}
7777

7878
impl PartitionEvaluator for NumRowsEvaluator {
79-
fn state(&self) -> Result<BuiltinWindowState> {
80-
// If we do not use state we just return Default
81-
Ok(BuiltinWindowState::NumRows(self.state.clone()))
82-
}
83-
8479
fn get_range(&self, idx: usize, _n_rows: usize) -> Result<Range<usize>> {
8580
let start = idx;
8681
let end = idx + 1;

datafusion/physical-expr/src/window/window_expr.rs

Lines changed: 1 addition & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -327,16 +327,7 @@ pub struct LeadLagState {
327327
pub idx: usize,
328328
}
329329

330-
#[derive(Debug, Clone, Default)]
331-
pub enum BuiltinWindowState {
332-
Rank(RankState),
333-
NumRows(NumRowsState),
334-
NthValue(NthValueState),
335-
LeadLag(LeadLagState),
336-
#[default]
337-
Default,
338-
}
339-
330+
/// Holds the state of evaluating a window function
340331
#[derive(Debug)]
341332
pub struct WindowAggState {
342333
/// The range that we calculate the window function

0 commit comments

Comments
 (0)