15
15
// specific language governing permissions and limitations
16
16
// under the License.
17
17
18
- //! partition evaluation module
18
+ //! Partition evaluation module
19
19
20
20
use crate :: window:: window_expr:: BuiltinWindowState ;
21
21
use crate :: window:: WindowAggState ;
@@ -25,24 +25,97 @@ use datafusion_common::{DataFusionError, ScalarValue};
25
25
use std:: fmt:: Debug ;
26
26
use std:: ops:: Range ;
27
27
28
- /// Partition evaluator
28
+ /// Partition evaluator for Window Functions
29
+ ///
30
+ /// # Background
31
+ ///
32
+ /// An implementation of this trait is created and used for each
33
+ /// partition defined by an `OVER` clause and is instantiated by
34
+ /// [`BuiltInWindowFunctionExpr::create_evaluator`]
35
+ ///
36
+ /// For example, evaluating `window_func(val) OVER (PARTITION BY col)`
37
+ /// on the following data:
38
+ ///
39
+ /// ```text
40
+ /// col | val
41
+ /// --- + ----
42
+ /// A | 10
43
+ /// A | 10
44
+ /// C | 20
45
+ /// D | 30
46
+ /// D | 30
47
+ /// ```
48
+ ///
49
+ /// Will instantiate three `PartitionEvaluator`s, one each for the
50
+ /// partitions defined by `col=A`, `col=C`, and `col=D`.
51
+ ///
52
+ /// ```text
53
+ /// col | val
54
+ /// --- + ----
55
+ /// A | 10 <--- partition 1
56
+ /// A | 10
57
+ ///
58
+ /// col | val
59
+ /// --- + ----
60
+ /// C | 20 <--- partition 2
61
+ ///
62
+ /// col | val
63
+ /// --- + ----
64
+ /// D | 30 <--- partition 3
65
+ /// D | 30
66
+ /// ```
67
+ ///
68
+ /// Different methods on this trait will be called depending on the
69
+ /// capabilities described by [`BuiltInWindowFunctionExpr`]:
70
+ ///
71
+ /// # Stateless `PartitionEvaluator`
72
+ ///
73
+ /// In this case, [`Self::evaluate`], [`Self::evaluate_with_rank`] or
74
+ /// [`Self::evaluate_inside_range`] is called with values for the
75
+ /// entire partition.
76
+ ///
77
+ /// # Stateful `PartitionEvaluator`
78
+ ///
79
+ /// In this case, [`Self::evaluate_stateful`] is called to calculate
80
+ /// the results of the window function incrementally for each new
81
+ /// batch, saving and restoring any state needed to do so as
82
+ /// [`BuiltinWindowState`].
83
+ ///
84
+ /// For example, when computing `ROW_NUMBER` incrementally,
85
+ /// [`Self::evaluate_stateful`] will be called multiple times with
86
+ /// different batches. For all batches after the first, the output
87
+ /// `row_number` must continue from the last `row_number` produced for the
88
+ /// previous batch. The previous row number is saved and restored as
89
+ /// the state.
90
+ ///
91
+ /// [`BuiltInWindowFunctionExpr`]: crate::window::BuiltInWindowFunctionExpr
92
+ /// [`BuiltInWindowFunctionExpr::create_evaluator`]: crate::window::BuiltInWindowFunctionExpr::create_evaluator
29
93
pub trait PartitionEvaluator : Debug + Send {
30
- /// Whether the evaluator should be evaluated with rank
94
+ /// Can this evaluator be evaluated with (only) rank
95
+ ///
96
+ /// If `include_rank` is true, then [`Self::evaluate_with_rank`]
97
+ /// will be called for each partition, which includes the
98
+ /// `rank`.
31
99
fn include_rank ( & self ) -> bool {
32
100
false
33
101
}
34
102
35
- /// Returns state of the Built-in Window Function
103
+ /// Returns the internal state of the window function
104
+ ///
105
+ /// Only used for stateful evaluation
36
106
fn state ( & self ) -> Result < BuiltinWindowState > {
37
107
// If we do not use state we just return Default
38
108
Ok ( BuiltinWindowState :: Default )
39
109
}
40
110
41
- /// Updates the internal state for Built-in window function
42
- // state is useful to update internal state for Built-in window function.
43
- // idx is the index of last row for which result is calculated.
44
- // range_columns is the result of order by column values. It is used to calculate rank boundaries
45
- // sort_partition_points is the boundaries of each rank in the range_column. It is used to update rank.
111
+ /// Updates the internal state for window function
112
+ ///
113
+ /// Only used for stateful evaluation
114
+ ///
115
+ /// `state`: is useful to update internal state for window function.
116
+ /// `idx`: is the index of last row for which result is calculated.
117
+ /// `range_columns`: is the result of order by column values. It is used to calculate rank boundaries
118
+ /// `sort_partition_points`: are the boundaries of each rank in `range_columns`. They are used to update rank.
46
119
fn update_state (
47
120
& mut self ,
48
121
_state : & WindowAggState ,
@@ -54,36 +127,72 @@ pub trait PartitionEvaluator: Debug + Send {
54
127
Ok ( ( ) )
55
128
}
56
129
130
+ /// Sets the internal state for window function
131
+ ///
132
+ /// Only used for stateful evaluation
57
133
fn set_state ( & mut self , _state : & BuiltinWindowState ) -> Result < ( ) > {
58
134
Err ( DataFusionError :: NotImplemented (
59
135
"set_state is not implemented for this window function" . to_string ( ) ,
60
136
) )
61
137
}
62
138
63
- /// Gets the range where Built-in window function result is calculated.
64
- // idx is the index of last row for which result is calculated.
65
- // n_rows is the number of rows of the input record batch (Used during bound check)
139
+ /// Gets the range where the window function result is calculated.
140
+ ///
141
+ /// `idx`: is the index of last row for which result is calculated.
142
+ /// `n_rows`: is the number of rows of the input record batch (Used during bounds check)
66
143
fn get_range ( & self , _idx : usize , _n_rows : usize ) -> Result < Range < usize > > {
67
144
Err ( DataFusionError :: NotImplemented (
68
145
"get_range is not implemented for this window function" . to_string ( ) ,
69
146
) )
70
147
}
71
148
72
- /// Evaluate the partition evaluator against the partition
149
+ /// Called for window functions that *do not use* values from
150
+ /// the window frame, such as `ROW_NUMBER`, `RANK`, `DENSE_RANK`,
151
+ /// `PERCENT_RANK`, `CUME_DIST`, `LEAD`, `LAG`.
73
152
fn evaluate ( & self , _values : & [ ArrayRef ] , _num_rows : usize ) -> Result < ArrayRef > {
74
153
Err ( DataFusionError :: NotImplemented (
75
154
"evaluate is not implemented by default" . into ( ) ,
76
155
) )
77
156
}
78
157
79
- /// Evaluate window function result inside given range
158
+ /// Evaluate window function result inside given range.
159
+ ///
160
+ /// Only used for stateful evaluation
80
161
fn evaluate_stateful ( & mut self , _values : & [ ArrayRef ] ) -> Result < ScalarValue > {
81
162
Err ( DataFusionError :: NotImplemented (
82
163
"evaluate_stateful is not implemented by default" . into ( ) ,
83
164
) )
84
165
}
85
166
86
- /// evaluate the partition evaluator against the partition but with rank
167
+ /// [`PartitionEvaluator::evaluate_with_rank`] is called for window
168
+ /// functions that only need the rank of a row within its window
169
+ /// frame.
170
+ ///
171
+ /// Evaluate the partition evaluator against the partition using
172
+ /// the row ranks. For example, `RANK(col)` produces
173
+ ///
174
+ /// ```text
175
+ /// col | rank
176
+ /// --- + ----
177
+ /// A | 1
178
+ /// A | 1
179
+ /// C | 3
180
+ /// D | 4
181
+ /// D | 4
182
+ /// ```
183
+ ///
184
+ /// For this case, `num_rows` would be `5` and the
185
+ /// `ranks_in_partition` would be called with
186
+ ///
187
+ /// ```text
188
+ /// [
189
+ /// (0,1),
190
+ /// (2,2),
191
+ /// (3,4),
192
+ /// ]
193
+ /// ```
194
+ ///
195
+ /// See [`Self::include_rank`] for more details
87
196
fn evaluate_with_rank (
88
197
& self ,
89
198
_num_rows : usize ,
@@ -94,7 +203,11 @@ pub trait PartitionEvaluator: Debug + Send {
94
203
) )
95
204
}
96
205
97
- /// evaluate window function result inside given range
206
+ /// Called for window functions that use values from window frame,
207
+ /// such as `FIRST_VALUE`, `LAST_VALUE`, `NTH_VALUE` and produce a
208
+ /// single value for every row in the partition.
209
+ ///
210
+ /// Returns a [`ScalarValue`] that is the value of the window function within the given range
98
211
fn evaluate_inside_range (
99
212
& self ,
100
213
_values : & [ ArrayRef ] ,
0 commit comments