From 440ab7b2815e2e3dd5e9e2a1bd26f7cbed2b7a41 Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Sat, 27 Sep 2025 11:12:19 +0900 Subject: [PATCH 1/3] docs: refine `AggregateUDFImpl::is_ordered_set_aggregate` documentation --- datafusion/expr/src/udaf.rs | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index fa71a76c09c7..1800434ec5c1 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -742,18 +742,23 @@ pub trait AggregateUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// If this function is ordered-set aggregate function, return true /// otherwise, return false /// - /// Ordered-set aggregate functions require an explicit `ORDER BY` clause - /// because the calculation performed by these functions is dependent on the - /// specific sequence of the input rows, unlike other aggregate functions - /// like `SUM`, `AVG`, or `COUNT`. + /// Ordered-set aggregate functions allow an `ORDER BY` clause because the + /// calculation performed by these functions is dependent on the specific + /// sequence of the input rows, unlike other aggregate functions like `SUM` + /// `AVG`, or `COUNT`. If explit order is specified then a default order + /// of ascending is assumed. /// - /// An example of an ordered-set aggregate function is `percentile_cont` - /// which computes a specific percentile value from a sorted list of values, and - /// is only meaningful when the input data is ordered. + /// Note that setting this to `true` does not guarantee input sort order to + /// the aggregate function; it instead gives the function full control over + /// the sorting process and they are expected to handle order of input values + /// themselves. + /// + /// An example of an ordered-set aggregate function is `approx_percentile_cont` + /// which computes an approximate percentile value from a sorted list of values. /// /// In SQL syntax, ordered-set aggregate functions are used with the /// `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering of the input - /// data. + /// data, or can be simply omitted to assume ascending. fn is_ordered_set_aggregate(&self) -> bool { false } From cb8facdcf1b506664eec3c06e917ff32ba3216ab Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Sat, 27 Sep 2025 11:23:01 +0900 Subject: [PATCH 2/3] I am very good at the English language --- datafusion/expr/src/udaf.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 1800434ec5c1..8404b09f0c46 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -745,7 +745,7 @@ pub trait AggregateUDFImpl: Debug + DynEq + DynHash + Send + Sync { /// Ordered-set aggregate functions allow an `ORDER BY` clause because the /// calculation performed by these functions is dependent on the specific /// sequence of the input rows, unlike other aggregate functions like `SUM` - /// `AVG`, or `COUNT`. If explit order is specified then a default order + /// `AVG`, or `COUNT`. If explicit order is specified then a default order /// of ascending is assumed. /// /// Note that setting this to `true` does not guarantee input sort order to From 601f84166a88b89fb1534fcc28652a044f06b0b6 Mon Sep 17 00:00:00 2001 From: Jefffrey Date: Sun, 19 Oct 2025 09:50:07 +0900 Subject: [PATCH 3/3] update docstring --- datafusion/expr/src/udaf.rs | 58 +++++++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index ccab3fc153f9..b593f8411d24 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -746,26 +746,52 @@ pub trait AggregateUDFImpl: Debug + DynEq + DynHash + Send + Sync { true } - /// If this function is ordered-set aggregate function, return true - /// otherwise, return false + /// If this function is an ordered-set aggregate function, return `true`. + /// Otherwise, return `false` (default). /// - /// Ordered-set aggregate functions allow an `ORDER BY` clause because the - /// calculation performed by these functions is dependent on the specific - /// sequence of the input rows, unlike other aggregate functions like `SUM` - /// `AVG`, or `COUNT`. If explicit order is specified then a default order - /// of ascending is assumed. + /// Ordered-set aggregate functions allow specifying a sort order that affects + /// how the function calculates its result, unlike other aggregate functions + /// like `SUM` or `COUNT`. For example, `percentile_cont` is an ordered-set + /// aggregate function that calculates the exact percentile value from a list + /// of values; the output of calculating the `0.75` percentile depends on if + /// you're calculating on an ascending or descending list of values. /// - /// Note that setting this to `true` does not guarantee input sort order to - /// the aggregate function; it instead gives the function full control over - /// the sorting process and they are expected to handle order of input values - /// themselves. + /// Setting this to return `true` affects only SQL parsing & planning; it allows + /// use of the `WITHIN GROUP` clause to specify this order, for example: + /// + /// ```sql + /// -- Ascending + /// SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c1 ASC) FROM table; + /// -- Default ordering is ascending if not explicitly specified + /// SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c1) FROM table; + /// -- Descending + /// SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c1 DESC) FROM table; + /// ``` + /// + /// This calculates the `0.75` percentile of the column `c1` from `table`, + /// according to the specific ordering. The column specified in the `WITHIN GROUP` + /// ordering clause is taken as the column to calculate values on; specifying + /// the `WITHIN GROUP` clause is optional so these queries are equivalent: /// - /// An example of an ordered-set aggregate function is `approx_percentile_cont` - /// which computes an approximate percentile value from a sorted list of values. + /// ```sql + /// -- If no WITHIN GROUP is specified then default ordering is implementation + /// -- dependent; in this case ascending for percentile_cont + /// SELECT percentile_cont(c1, 0.75) FROM table; + /// SELECT percentile_cont(0.75) WITHIN GROUP (ORDER BY c1 ASC) FROM table; + /// ``` + /// + /// Aggregate UDFs can define their default ordering if the function is called + /// without the `WITHIN GROUP` clause, though a default of ascending is the + /// standard practice. + /// + /// Note that setting this to `true` does not guarantee input sort order to + /// the aggregate function; it expects the function to handle ordering the + /// input values themselves (e.g. `percentile_cont` must buffer and sort + /// the values internally). That is, DataFusion does not introduce any kind + /// of sort into the plan for these functions. /// - /// In SQL syntax, ordered-set aggregate functions are used with the - /// `WITHIN GROUP (ORDER BY ...)` clause to specify the ordering of the input - /// data, or can be simply omitted to assume ascending. + /// Setting this to `false` disallows calling this function with the `WITHIN GROUP` + /// clause. fn is_ordered_set_aggregate(&self) -> bool { false }