[SPARK-48842][DOCS] Document non-determinism of max_by and min_by

zhengruifeng · zhengruifeng · commit 5bbe9c850aaa · 2024-07-12T12:41:07.000+08:00
### What changes were proposed in this pull request? Document the non-determinism of max_by and min_by. ### Why are the changes needed? I have been confused by this non-determinism twice; it looked like a correctness bug to me, so I think we need to document it. ### Does this PR introduce _any_ user-facing change? Doc change only. ### How was this patch tested? CI. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #47266 from zhengruifeng/py_doc_max_by. Authored-by: Ruifeng Zheng <ruifengz@apache.org> Signed-off-by: Ruifeng Zheng <ruifengz@apache.org>
diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R
@@ -1558,6 +1558,9 @@ setMethod("max",
 #' @details
 #' \code{max_by}: Returns the value associated with the maximum value of ord.
 #'
+#' Note: The function is non-deterministic: when several rows tie for the maximum
+#' value of ord, any of the associated values of `x` may be returned.
+#'
 #' @rdname column_aggregate_functions
 #' @aliases max_by max_by,Column-method
 #' @note max_by since 3.3.0
@@ -1633,6 +1636,9 @@ setMethod("min",
 #' @details
 #' \code{min_by}: Returns the value associated with the minimum value of ord.
 #'
+#' Note: The function is non-deterministic: when several rows tie for the minimum
+#' value of ord, any of the associated values of `x` may be returned.
+#'
 #' @rdname column_aggregate_functions
 #' @aliases min_by min_by,Column-method
 #' @note min_by since 3.3.0
diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala
@@ -884,6 +884,10 @@ object functions {
   /**
    * Aggregate function: returns the value associated with the maximum value of ord.
    *
+   * @note
+   *   The function is non-deterministic: when several rows tie for the maximum value of ord,
+   *   any of the associated values of `e` may be returned.
+   *
    * @group agg_funcs
    * @since 3.4.0
    */
@@ -932,6 +936,10 @@ object functions {
   /**
    * Aggregate function: returns the value associated with the minimum value of ord.
    *
+   * @note
+   *   The function is non-deterministic: when several rows tie for the minimum value of ord,
+   *   any of the associated values of `e` may be returned.
+   *
    * @group agg_funcs
    * @since 3.4.0
    */
diff --git a/python/pyspark/sql/functions/builtin.py b/python/pyspark/sql/functions/builtin.py
@@ -1271,6 +1271,11 @@ def max_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
+    Notes
+    -----
+    The function is non-deterministic: when several rows tie for the maximum value of
+    `ord`, any of the associated values of `col` may be returned.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
@@ -1352,6 +1357,11 @@ def min_by(col: "ColumnOrName", ord: "ColumnOrName") -> Column:
     .. versionchanged:: 3.4.0
         Supports Spark Connect.
 
+    Notes
+    -----
+    The function is non-deterministic: when several rows tie for the minimum value of
+    `ord`, any of the associated values of `col` may be returned.
+
     Parameters
     ----------
     col : :class:`~pyspark.sql.Column` or str
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/MaxByAndMinBy.scala
@@ -99,6 +99,10 @@ abstract class MaxMinBy extends DeclarativeAggregate with BinaryLike[Expression]
       > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y);
        b
   """,
+  note = """
+    The function is non-deterministic: when several rows tie for the maximum value
+    of `y`, any of the associated values of `x` may be returned.
+  """,
   group = "agg_funcs",
   since = "3.0.0")
 case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy {
@@ -122,6 +126,10 @@ case class MaxBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMin
       > SELECT _FUNC_(x, y) FROM VALUES ('a', 10), ('b', 50), ('c', 20) AS tab(x, y);
        a
   """,
+  note = """
+    The function is non-deterministic: when several rows tie for the minimum value
+    of `y`, any of the associated values of `x` may be returned.
+  """,
   group = "agg_funcs",
   since = "3.0.0")
 case class MinBy(valueExpr: Expression, orderingExpr: Expression) extends MaxMinBy {
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -902,6 +902,9 @@ object functions {
   /**
    * Aggregate function: returns the value associated with the maximum value of ord.
    *
+   * @note The function is non-deterministic: when several rows tie for the maximum value
+   * of ord, any of the associated values of `e` may be returned.
+   *
    * @group agg_funcs
    * @since 3.3.0
    */
@@ -952,6 +955,9 @@ object functions {
   /**
    * Aggregate function: returns the value associated with the minimum value of ord.
    *
+   * @note The function is non-deterministic so the output order can be different for
+   * those associated the same values of `e`.
+   *
    * @group agg_funcs
    * @since 3.3.0
    */