Skip to content

Commit 5bbe9c8

Browse files
committed
[SPARK-48842][DOCS] Document non-determinism of max_by and min_by
### What changes were proposed in this pull request? Document the non-determinism of max_by and min_by. ### Why are the changes needed? I have been confused by this non-determinism twice; it looked like a correctness bug to me, so I think we need to document it. ### Does this PR introduce _any_ user-facing change? Doc change only. ### How was this patch tested? CI. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #47266 from zhengruifeng/py_doc_max_by. Authored-by: Ruifeng Zheng <[email protected]> Signed-off-by: Ruifeng Zheng <[email protected]>
1 parent 0e940e2 commit 5bbe9c8

File tree

5 files changed

+38
-0
lines changed

5 files changed

+38
-0
lines changed

R/pkg/R/functions.R

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1558,6 +1558,9 @@ setMethod("max",
15581558
#' @details
15591559
#' \code{max_by}: Returns the value associated with the maximum value of ord.
15601560
#'
1561+
#' Note: The function is non-deterministic so the output order can be different
1562+
#' for those associated with the same values of `x`.
1563+
#'
15611564
#' @rdname column_aggregate_functions
15621565
#' @aliases max_by max_by,Column-method
15631566
#' @note max_by since 3.3.0
@@ -1633,6 +1636,9 @@ setMethod("min",
16331636
#' @details
16341637
#' \code{min_by}: Returns the value associated with the minimum value of ord.
16351638
#'
1639+
#' Note: The function is non-deterministic so the output order can be different
1640+
#' for those associated with the same values of `x`.
1641+
#'
16361642
#' @rdname column_aggregate_functions
16371643
#' @aliases min_by min_by,Column-method
16381644
#' @note min_by since 3.3.0

connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -884,6 +884,10 @@ object functions {
884884
/**
885885
* Aggregate function: returns the value associated with the maximum value of ord.
886886
*
887+
* @note
888+
* The function is non-deterministic so the output order can be different for those associated
889+
* with the same values of `e`.
890+
*
887891
* @group agg_funcs
888892
* @since 3.4.0
889893
*/
@@ -932,6 +936,10 @@ object functions {
932936
/**
933937
* Aggregate function: returns the value associated with the minimum value of ord.
934938
*
939+
* @note
940+
* The function is non-deterministic so the output order can be different for those associated
941+
* with the same values of `e`.
942+
*
935943
* @group agg_funcs
936944
* @since 3.4.0
937945
*/

python/pyspark/sql/functions/builtin.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1271,6 +1271,11 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
12711271
.. versionchanged:: 3.4.0
12721272
Supports Spark Connect.
12731273

1274+
Notes
1275+
-----
1276+
The function is non-deterministic so the output order can be different for those
1277+
associated with the same values of `col`.
1278+
12741279
Parameters
12751280
----------
12761281
col : :class:`~pyspark.sql.Column` or str
@@ -1352,6 +1357,11 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
13521357
.. versionchanged:: 3.4.0
13531358
Supports Spark Connect.
13541359

1360+
Notes
1361+
-----
1362+
The function is non-deterministic so the output order can be different for those
1363+
associated with the same values of `col`.
1364+
13551365
Parameters
13561366
----------
13571367
col : :class:`~pyspark.sql.Column` or str

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,10 @@ abstract class MaxMinBy extends DeclarativeAggregate with BinaryLike[Expression]
9999
> SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y);
100100
b
101101
""",
102+
note = """
103+
The function is non-deterministic so the output order can be different for
104+
those associated with the same values of `x`.
105+
""",
102106
group = "agg_funcs",
103107
since = "3.0.0")
104108
case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy {
@@ -122,6 +126,10 @@ case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMin
122126
> SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y);
123127
a
124128
""",
129+
note = """
130+
The function is non-deterministic so the output order can be different for
131+
those associated with the same values of `x`.
132+
""",
125133
group = "agg_funcs",
126134
since = "3.0.0")
127135
case class MinBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy {

sql/core/src/main/scala/org/apache/spark/sql/functions.scala

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -902,6 +902,9 @@ object functions {
902902
/**
903903
* Aggregate function: returns the value associated with the maximum value of ord.
904904
*
905+
* @note The function is non-deterministic so the output order can be different for
906+
* those associated with the same values of `e`.
907+
*
905908
* @group agg_funcs
906909
* @since 3.3.0
907910
*/
@@ -952,6 +955,9 @@ object functions {
952955
/**
953956
* Aggregate function: returns the value associated with the minimum value of ord.
954957
*
958+
* @note The function is non-deterministic so the output order can be different for
959+
* those associated with the same values of `e`.
960+
*
955961
* @group agg_funcs
956962
* @since 3.3.0
957963
*/

0 commit comments

Comments
 (0)