
Commit d062001

Merge remote-tracking branch 'upstream/master' into SPARK-35410

2 parents: 01a8c02 + fb93163

39 files changed: 1497 additions & 1144 deletions

.github/workflows/build_and_test.yml

Lines changed: 1 addition & 0 deletions
```diff
@@ -537,6 +537,7 @@ jobs:
         uses: actions/checkout@v2
         with:
           repository: databricks/tpcds-kit
+          ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
           path: ./tpcds-kit
     - name: Build tpcds-kit
       if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
```

core/src/main/scala/org/apache/spark/scheduler/TaskSchedulerImpl.scala

Lines changed: 2 additions & 2 deletions
```diff
@@ -128,7 +128,7 @@ private[spark] class TaskSchedulerImpl(

   @volatile private var hasReceivedTask = false
   @volatile private var hasLaunchedTask = false
-  private val starvationTimer = new Timer(true)
+  private val starvationTimer = new Timer("task-starvation-timer", true)

   // Incrementing task IDs
   val nextTaskId = new AtomicLong(0)
@@ -152,7 +152,7 @@ private[spark] class TaskSchedulerImpl(

   protected val executorIdToHost = new HashMap[String, String]

-  private val abortTimer = new Timer(true)
+  private val abortTimer = new Timer("task-abort-timer", true)
   // Exposed for testing
   val unschedulableTaskSetToExpiryTime = new HashMap[TaskSetManager, Long]
```
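The change above is small but practical: `java.util.Timer` has an overload that names the timer's background thread, so thread dumps show `task-starvation-timer` / `task-abort-timer` instead of a generic `Timer-0`. A minimal standalone sketch of the difference (the demo object and printed output are illustrative, not part of the commit):

```scala
import java.util.{Timer, TimerTask}

object NamedTimerDemo extends App {
  // Unnamed daemon timer: its worker thread appears as "Timer-0" in a thread dump.
  val anonymous = new Timer(true)

  // Named daemon timer, as in this commit: the worker thread carries the label,
  // which makes jstack output and profiler traces easier to read.
  val named = new Timer("task-starvation-timer", true)

  named.schedule(new TimerTask {
    override def run(): Unit =
      println(s"running on thread: ${Thread.currentThread().getName}")
  }, 0L)

  Thread.sleep(100) // let the daemon thread fire before the JVM exits
  anonymous.cancel()
  named.cancel()
}
```

Both timers keep the `true` daemon flag, so the change is purely for observability.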
core/src/test/scala/org/apache/spark/SparkFunSuite.scala

Lines changed: 16 additions & 4 deletions
```diff
@@ -19,21 +19,22 @@ package org.apache.spark

 // scalastyle:off
 import java.io.File
+import java.nio.file.Path
 import java.util.{Locale, TimeZone}

-import org.apache.log4j.spi.LoggingEvent
-
 import scala.annotation.tailrec
+import scala.collection.mutable.ArrayBuffer
+
 import org.apache.commons.io.FileUtils
 import org.apache.log4j.{Appender, AppenderSkeleton, Level, Logger}
+import org.apache.log4j.spi.LoggingEvent
 import org.scalatest.{BeforeAndAfter, BeforeAndAfterAll, BeforeAndAfterEach, Failed, Outcome}
 import org.scalatest.funsuite.AnyFunSuite
+
 import org.apache.spark.internal.Logging
 import org.apache.spark.internal.config.Tests.IS_TESTING
 import org.apache.spark.util.{AccumulatorContext, Utils}

-import scala.collection.mutable.ArrayBuffer
-
 /**
  * Base abstract class for all unit tests in Spark for handling common functionality.
  *
@@ -119,6 +120,17 @@ abstract class SparkFunSuite
     file
   }

+  /**
+   * Get a Path relative to the root project. It is assumed that a spark home is set.
+   */
+  protected final def getWorkspaceFilePath(first: String, more: String*): Path = {
+    if (!(sys.props.contains("spark.test.home") || sys.env.contains("SPARK_HOME"))) {
+      fail("spark.test.home or SPARK_HOME is not set.")
+    }
+    val sparkHome = sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME"))
+    java.nio.file.Paths.get(sparkHome, first +: more: _*)
+  }
+
   /**
    * Note: this method doesn't support `BeforeAndAfter`. You must use `BeforeAndAfterEach` to
    * set up and tear down resources.
```
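The new `getWorkspaceFilePath` helper lets any suite resolve files against the Spark checkout root instead of the module-local working directory, failing fast when neither `spark.test.home` nor `SPARK_HOME` is set. A hypothetical usage sketch (the suite name and target file are illustrative, not from the commit):

```scala
import org.apache.spark.SparkFunSuite

class WorkspacePathDemoSuite extends SparkFunSuite {
  test("resolve a path against the project root") {
    // Resolves to <spark-home>/docs/monitoring.md no matter which module's
    // test JVM is running; fails with a clear message if no spark home is set.
    val path = getWorkspaceFilePath("docs", "monitoring.md")
    assert(path.endsWith("monitoring.md"))
  }
}
```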

docs/monitoring.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -19,6 +19,9 @@ license: |
   limitations under the License.
 ---

+* This will become a table of contents (this text will be scraped).
+{:toc}
+
 There are several ways to monitor Spark applications: web UIs, metrics, and external instrumentation.

 # Web Interfaces
```

docs/sql-migration-guide.md

Lines changed: 2 additions & 0 deletions
```diff
@@ -87,6 +87,8 @@ license: |

 - In Spark 3.2, Spark supports `DayTimeIntervalType` and `YearMonthIntervalType` as inputs and outputs of `TRANSFORM` clause in Hive `SERDE` mode, the behavior is different between Hive `SERDE` mode and `ROW FORMAT DELIMITED` mode when these two types are used as inputs. In Hive `SERDE` mode, `DayTimeIntervalType` column is converted to `HiveIntervalDayTime`, its string format is `[-]?d h:m:s.n`, but in `ROW FORMAT DELIMITED` mode the format is `INTERVAL '[-]?d h:m:s.n' DAY TO TIME`. In Hive `SERDE` mode, `YearMonthIntervalType` column is converted to `HiveIntervalYearMonth`, its string format is `[-]?y-m`, but in `ROW FORMAT DELIMITED` mode the format is `INTERVAL '[-]?y-m' YEAR TO MONTH`.

+- In Spark 3.2, `hash(0) == hash(-0)` for floating point types. Previously, different values were generated.
+
 ## Upgrading from Spark SQL 3.0 to 3.1

 - In Spark 3.1, statistical aggregation function includes `std`, `stddev`, `stddev_samp`, `variance`, `var_samp`, `skewness`, `kurtosis`, `covar_samp`, `corr` will return `NULL` instead of `Double.NaN` when `DivideByZero` occurs during expression evaluation, for example, when `stddev_samp` applied on a single element set. In Spark version 3.0 and earlier, it will return `Double.NaN` in such case. To restore the behavior before Spark 3.1, you can set `spark.sql.legacy.statisticalAggregate` to `true`.
```

docs/submitting-applications.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -181,7 +181,7 @@ The master URL passed to Spark can be in one of the following formats:
 The cluster location will be found based on the <code>HADOOP_CONF_DIR</code> or <code>YARN_CONF_DIR</code> variable.
 </td></tr>
 <tr><td> <code>k8s://HOST:PORT</code> </td><td> Connect to a <a href="running-on-kubernetes.html">Kubernetes</a> cluster in
-<code>cluster</code> mode. Client mode is currently unsupported and will be supported in future releases.
+<code>client</code> or <code>cluster</code> mode depending on the value of <code>--deploy-mode</code>.
 The <code>HOST</code> and <code>PORT</code> refer to the <a href="https://kubernetes.io/docs/reference/generated/kube-apiserver/">Kubernetes API Server</a>.
 It connects using TLS by default. In order to force it to use an unsecured connection, you can use
 <code>k8s://http://HOST:PORT</code>.
```

project/build.properties

Lines changed: 1 addition & 1 deletion
```diff
@@ -14,4 +14,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-sbt.version=1.5.1
+sbt.version=1.5.2
```

python/pyspark/ml/tuning.py

Lines changed: 2 additions & 1 deletion
```diff
@@ -602,7 +602,7 @@ class CrossValidator(Estimator, _CrossValidatorParams, HasParallelism, HasCollec
 >>> from pyspark.ml.classification import LogisticRegression
 >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
 >>> from pyspark.ml.linalg import Vectors
->>> from pyspark.ml.tuning import CrossValidatorModel
+>>> from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel
 >>> import tempfile
 >>> dataset = spark.createDataFrame(
 ...     [(Vectors.dense([0.0]), 0.0),
@@ -1141,6 +1141,7 @@ class TrainValidationSplit(Estimator, _TrainValidationSplitParams, HasParallelis
 >>> from pyspark.ml.classification import LogisticRegression
 >>> from pyspark.ml.evaluation import BinaryClassificationEvaluator
 >>> from pyspark.ml.linalg import Vectors
+>>> from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
 >>> from pyspark.ml.tuning import TrainValidationSplitModel
 >>> import tempfile
 >>> dataset = spark.createDataFrame(
```

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/hash.scala

Lines changed: 20 additions & 4 deletions
```diff
@@ -369,11 +369,25 @@ abstract class HashExpression[E] extends Expression {
   protected def genHashBoolean(input: String, result: String): String =
     genHashInt(s"$input ? 1 : 0", result)

-  protected def genHashFloat(input: String, result: String): String =
-    genHashInt(s"Float.floatToIntBits($input)", result)
+  protected def genHashFloat(input: String, result: String): String = {
+    s"""
+       |if($input == -0.0f) {
+       |  ${genHashInt("0", result)}
+       |} else {
+       |  ${genHashInt(s"Float.floatToIntBits($input)", result)}
+       |}
+     """.stripMargin
+  }

-  protected def genHashDouble(input: String, result: String): String =
-    genHashLong(s"Double.doubleToLongBits($input)", result)
+  protected def genHashDouble(input: String, result: String): String = {
+    s"""
+       |if($input == -0.0d) {
+       |  ${genHashLong("0L", result)}
+       |} else {
+       |  ${genHashLong(s"Double.doubleToLongBits($input)", result)}
+       |}
+     """.stripMargin
+  }

   protected def genHashDecimal(
       ctx: CodegenContext,
@@ -523,7 +537,9 @@ abstract class InterpretedHashFunction {
     case s: Short => hashInt(s, seed)
     case i: Int => hashInt(i, seed)
     case l: Long => hashLong(l, seed)
+    case f: Float if (f == -0.0f) => hashInt(0, seed)
     case f: Float => hashInt(java.lang.Float.floatToIntBits(f), seed)
+    case d: Double if (d == -0.0d) => hashLong(0L, seed)
     case d: Double => hashLong(java.lang.Double.doubleToLongBits(d), seed)
     case d: Decimal =>
       val precision = dataType.asInstanceOf[DecimalType].precision
```
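The root cause of the `hash(0) != hash(-0)` inconsistency fixed here: IEEE 754 defines `-0.0 == 0.0`, but the two values have different bit patterns, so hashing the raw bits from `floatToIntBits`/`doubleToLongBits` sent equal values to different hashes. The guards above normalize negative zero to the bits of positive zero first. A small sketch of the underlying behavior (the demo object is illustrative):

```scala
object NegativeZeroBitsDemo extends App {
  // Equal under IEEE 754 comparison semantics...
  assert(-0.0f == 0.0f)
  assert(-0.0d == 0.0d)

  // ...but distinct at the bit level, which is what the old code hashed.
  println(java.lang.Float.floatToIntBits(0.0f).toHexString)     // 0
  println(java.lang.Float.floatToIntBits(-0.0f).toHexString)    // 80000000
  println(java.lang.Double.doubleToLongBits(0.0d).toHexString)  // 0
  println(java.lang.Double.doubleToLongBits(-0.0d).toHexString) // 8000000000000000
}
```

This is the same behavior change that the `sql-migration-guide.md` entry above calls out for Spark 3.2.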

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala

Lines changed: 10 additions & 14 deletions
```diff
@@ -52,6 +52,12 @@ trait InvokeLike extends Expression with NonSQLExpression {

   protected lazy val needNullCheck: Boolean = propagateNull && arguments.exists(_.nullable)
   protected lazy val evaluatedArgs: Array[Object] = new Array[Object](arguments.length)
+  private lazy val boxingFn: Any => Any =
+    ScalaReflection.typeBoxedJavaMapping
+      .get(dataType)
+      .map(cls => v => cls.cast(v))
+      .getOrElse(identity)
+

   /**
    * Prepares codes for arguments.
@@ -122,12 +128,7 @@ trait InvokeLike extends Expression with NonSQLExpression {
   * @param dataType the data type of the return object
   * @return the return object of a method call
   */
-  def invoke(
-      obj: Any,
-      method: Method,
-      arguments: Seq[Expression],
-      input: InternalRow,
-      dataType: DataType): Any = {
+  def invoke(obj: Any, method: Method, input: InternalRow): Any = {
     var i = 0
     val len = arguments.length
     while (i < len) {
@@ -145,12 +146,7 @@ trait InvokeLike extends Expression with NonSQLExpression {
       case e: java.lang.reflect.InvocationTargetException if e.getCause != null =>
         throw e.getCause
     }
-    val boxedClass = ScalaReflection.typeBoxedJavaMapping.get(dataType)
-    if (boxedClass.isDefined) {
-      boxedClass.get.cast(ret)
-    } else {
-      ret
-    }
+    boxingFn(ret)
   }
 }

@@ -256,7 +252,7 @@ case class StaticInvoke(
   @transient lazy val method = findMethod(cls, functionName, argClasses)

   override def eval(input: InternalRow): Any = {
-    invoke(null, method, arguments, input, dataType)
+    invoke(null, method, input)
   }

   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
@@ -358,7 +354,7 @@ case class Invoke(
     } else {
       obj.getClass.getMethod(functionName, argClasses: _*)
     }
-    invoke(obj, invokeMethod, arguments, input, dataType)
+    invoke(obj, invokeMethod, input)
   }
 }
```
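Two refactors in one here: `invoke` drops the `arguments`/`dataType` parameters it can already read from the enclosing trait, and the per-call `Map` lookup plus `Option` branch is hoisted into a `boxingFn` resolved once per expression instance, which matters on a per-row hot path. A standalone sketch of the caching pattern, with a stand-in mapping and types in place of Catalyst's `ScalaReflection.typeBoxedJavaMapping`:

```scala
object BoxingFnDemo extends App {
  sealed trait DataType
  case object IntType extends DataType
  case object StringType extends DataType

  // Stand-in for ScalaReflection.typeBoxedJavaMapping.
  val typeBoxedJavaMapping: Map[DataType, Class[_]] =
    Map(IntType -> classOf[java.lang.Integer])

  final class Invoker(dataType: DataType) {
    // Resolved once per Invoker instance rather than on every invoke() call,
    // mirroring the lazy val introduced in this commit.
    private lazy val boxingFn: Any => Any =
      typeBoxedJavaMapping
        .get(dataType)
        .map(cls => (v: Any) => cls.cast(v))
        .getOrElse(identity)

    def invoke(ret: Any): Any = boxingFn(ret)
  }

  println(new Invoker(IntType).invoke(42))       // cast through java.lang.Integer
  println(new Invoker(StringType).invoke("str")) // identity: no boxing entry
}
```

Because `dataType` is fixed for a given expression instance, the lookup result can never change, which is what makes caching it in a `lazy val` safe.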