
Commit a34dc4d

a0x8o and EnricoMi committed
[SPARK-39529][INFRA] Refactor and merge all related job selection logic into precondition
### What changes were proposed in this pull request?

This PR borrows the idea from apache/spark#36928 but adds some more changes so that scheduled jobs share the `precondition` job, consolidating all conditional logic there. It also adds a new option to `is-changed.py` so that dependent modules can be checked together; this way, we don't have to change `build_and_test.yml` every time a new module is added. In addition, this PR removes `type` because the `precondition` job now replaces it. Lastly, it enables the PySpark, SparkR, TPC-DS, and Docker integration tests for scheduled jobs when applicable.

Closes #36928

### Why are the changes needed?

To make it easier to read.

### Does this PR introduce _any_ user-facing change?

No, dev-only.

### How was this patch tested?

Tested locally and in my fork (https://github.com/HyukjinKwon/spark/actions).

Closes #36940 from HyukjinKwon/SPARK-39529.

Lead-authored-by: Hyukjin Kwon <gurwls223@apache.org>
Co-authored-by: Enrico Minack <github@enrico.minack.dev>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
1 parent 81ba5ab commit a34dc4d

12 files changed: 218 additions & 84 deletions


core/src/test/scala/org/apache/spark/scheduler/OutputCommitCoordinatorSuite.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -144,7 +144,7 @@ class OutputCommitCoordinatorSuite extends SparkFunSuite with BeforeAndAfter {
     assert(tempDir.list().size === 1)
   }

-  test("If commit fails, if task is retried it should not be locked, and will succeed.") {
+  ignore("If commit fails, if task is retried it should not be locked, and will succeed.") {
     val rdd = sc.parallelize(Seq(1), 1)
     sc.runJob(rdd, OutputCommitFunctions(tempDir.getAbsolutePath).failFirstCommitAttempt _,
       rdd.partitions.indices)
```
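
The only change here swaps ScalaTest's `test` registration for `ignore`, which keeps the case compiled and registered but reported as skipped rather than run. A minimal toy suite (not part of this commit; suite name and assertions are illustrative) showing the difference:

```scala
import org.scalatest.funsuite.AnyFunSuite

// Toy suite: `ignore` has the same shape as `test`, so flipping one
// word disables a case without deleting or commenting out its body.
class ToyCommitSuite extends AnyFunSuite {
  test("this case runs") {
    assert(1 + 1 === 2)
  }

  ignore("this case is reported as ignored, but still type-checked") {
    fail("never executed")
  }
}
```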

mllib/src/main/scala/org/apache/spark/mllib/evaluation/BinaryClassificationMetrics.scala

Lines changed: 3 additions & 0 deletions
```diff
@@ -45,6 +45,9 @@ class BinaryClassificationMetrics @Since("3.0.0") (
     @Since("1.3.0") val scoreAndLabels: RDD[_ <: Product],
     @Since("1.3.0") val numBins: Int = 1000)
   extends Logging {
+
+  @deprecated("The variable `scoreLabelsWeight` should be private and " +
+    "will be removed in 4.0.0.", "3.4.0")
   val scoreLabelsWeight: RDD[(Double, (Double, Double))] = scoreAndLabels.map {
     case (prediction: Double, label: Double, weight: Double) =>
       require(weight >= 0.0, s"instance weight, $weight has to be >= 0.0")
```
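
The annotation added above is Scala's standard `@deprecated`, whose two arguments are the message and the version that introduced the deprecation; call sites keep compiling but emit a warning. A minimal sketch with toy names (not from the commit):

```scala
object ToyMetrics {
  // First argument: message shown in the warning;
  // second: version in which the deprecation was introduced.
  @deprecated("This value will become private.", "3.4.0")
  val scores: Seq[Double] = Seq(0.1, 0.9)
}

object ToyCaller {
  // Compiles, but scalac emits a deprecation warning pointing here.
  val first: Double = ToyMetrics.scores.head
}
```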

pom.xml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1218,7 +1218,7 @@
       <dependency>
         <groupId>mysql</groupId>
         <artifactId>mysql-connector-java</artifactId>
-        <version>8.0.27</version>
+        <version>8.0.29</version>
         <scope>test</scope>
       </dependency>
       <dependency>
```

sql/core/src/main/scala/org/apache/spark/sql/catalyst/util/V2ExpressionBuilder.scala

Lines changed: 13 additions & 12 deletions
```diff
@@ -17,19 +17,15 @@

 package org.apache.spark.sql.catalyst.util

-import org.apache.spark.sql.catalyst.expressions.{Abs, Add, And, BinaryComparison, BinaryOperator, BitwiseAnd, BitwiseNot, BitwiseOr, BitwiseXor, CaseWhen, Cast, Ceil, Coalesce, Contains, Divide, EndsWith, EqualTo, Exp, Expression, Floor, In, InSet, IsNotNull, IsNull, Literal, Log, Lower, Multiply, Not, Or, Overlay, Pow, Predicate, Remainder, Sqrt, StartsWith, StringPredicate, StringTranslate, StringTrim, StringTrimLeft, StringTrimRight, Substring, Subtract, UnaryMinus, Upper, WidthBucket}
+import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.connector.expressions.{Cast => V2Cast, Expression => V2Expression, FieldReference, GeneralScalarExpression, LiteralValue}
 import org.apache.spark.sql.connector.expressions.filter.{AlwaysFalse, AlwaysTrue, And => V2And, Not => V2Not, Or => V2Or, Predicate => V2Predicate}
-import org.apache.spark.sql.execution.datasources.PushableColumn
 import org.apache.spark.sql.types.BooleanType

 /**
  * The builder to generate V2 expressions from catalyst expressions.
  */
-class V2ExpressionBuilder(
-    e: Expression, nestedPredicatePushdownEnabled: Boolean = false, isPredicate: Boolean = false) {
-
-  val pushableColumn = PushableColumn(nestedPredicatePushdownEnabled)
+class V2ExpressionBuilder(e: Expression, isPredicate: Boolean = false) {

   def build(): Option[V2Expression] = generateExpression(e, isPredicate)

@@ -49,12 +45,8 @@ class V2ExpressionBuilder(
     case Literal(true, BooleanType) => Some(new AlwaysTrue())
     case Literal(false, BooleanType) => Some(new AlwaysFalse())
     case Literal(value, dataType) => Some(LiteralValue(value, dataType))
-    case col @ pushableColumn(name) =>
-      val ref = if (nestedPredicatePushdownEnabled) {
-        FieldReference(name)
-      } else {
-        FieldReference.column(name)
-      }
+    case col @ ColumnOrField(nameParts) =>
+      val ref = FieldReference(nameParts)
       if (isPredicate && col.dataType.isInstanceOf[BooleanType]) {
         Some(new V2Predicate("=", Array(ref, LiteralValue(true, BooleanType))))
       } else {
@@ -266,3 +258,12 @@ class V2ExpressionBuilder(
     case _ => None
   }
 }
+
+object ColumnOrField {
+  def unapply(e: Expression): Option[Seq[String]] = e match {
+    case a: Attribute => Some(Seq(a.name))
+    case s: GetStructField =>
+      unapply(s.child).map(_ :+ s.childSchema(s.ordinal).name)
+    case _ => None
+  }
+}
```
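
The new `ColumnOrField` extractor recursively unwraps `GetStructField` nodes and accumulates name parts, so a nested column like `person.address.city` becomes `Seq("person", "address", "city")` for `FieldReference`. A self-contained toy model of the same recursive-extractor pattern (all `Toy*` names are illustrative stand-ins, not Spark's API):

```scala
// Minimal stand-ins for Attribute / GetStructField.
sealed trait ToyExpr
case class ToyAttribute(name: String) extends ToyExpr
case class ToyGetStructField(child: ToyExpr, fieldName: String) extends ToyExpr
case class ToyLiteral(value: Any) extends ToyExpr

object ToyColumnOrField {
  // Mirrors ColumnOrField.unapply: a bare attribute yields its name;
  // a struct-field access recurses into the child and appends the
  // field's name; anything else does not match.
  def unapply(e: ToyExpr): Option[Seq[String]] = e match {
    case a: ToyAttribute => Some(Seq(a.name))
    case s: ToyGetStructField => unapply(s.child).map(_ :+ s.fieldName)
    case _ => None
  }
}

object ToyColumnOrFieldDemo extends App {
  val nested =
    ToyGetStructField(ToyGetStructField(ToyAttribute("person"), "address"), "city")
  nested match {
    case ToyColumnOrField(parts) => println(parts.mkString("."))  // person.address.city
    case _                       => println("not a column or field")
  }
}
```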

sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/AdaptiveSparkPlanExec.scala

Lines changed: 42 additions & 32 deletions
```diff
@@ -118,6 +118,7 @@ case class AdaptiveSparkPlanExec(
     Seq(
       RemoveRedundantProjects,
       ensureRequirements,
+      ValidateSparkPlan,
       ReplaceHashWithSortAgg,
       RemoveRedundantSorts,
       DisableUnnecessaryBucketedScan,
@@ -301,17 +302,20 @@ case class AdaptiveSparkPlanExec(
         // plans are updated, we can clear the query stage list because at this point the two plans
         // are semantically and physically in sync again.
         val logicalPlan = replaceWithQueryStagesInLogicalPlan(currentLogicalPlan, stagesToReplace)
-        val (newPhysicalPlan, newLogicalPlan) = reOptimize(logicalPlan)
-        val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
-        val newCost = costEvaluator.evaluateCost(newPhysicalPlan)
-        if (newCost < origCost ||
+        val afterReOptimize = reOptimize(logicalPlan)
+        if (afterReOptimize.isDefined) {
+          val (newPhysicalPlan, newLogicalPlan) = afterReOptimize.get
+          val origCost = costEvaluator.evaluateCost(currentPhysicalPlan)
+          val newCost = costEvaluator.evaluateCost(newPhysicalPlan)
+          if (newCost < origCost ||
             (newCost == origCost && currentPhysicalPlan != newPhysicalPlan)) {
-          logOnLevel("Plan changed:\n" +
-            sideBySide(currentPhysicalPlan.treeString, newPhysicalPlan.treeString).mkString("\n"))
-          cleanUpTempTags(newPhysicalPlan)
-          currentPhysicalPlan = newPhysicalPlan
-          currentLogicalPlan = newLogicalPlan
-          stagesToReplace = Seq.empty[QueryStageExec]
+            logOnLevel("Plan changed:\n" +
+              sideBySide(currentPhysicalPlan.treeString, newPhysicalPlan.treeString).mkString("\n"))
+            cleanUpTempTags(newPhysicalPlan)
+            currentPhysicalPlan = newPhysicalPlan
+            currentLogicalPlan = newLogicalPlan
+            stagesToReplace = Seq.empty[QueryStageExec]
+          }
         }
         // Now that some stages have finished, we can try creating new stages.
         result = createQueryStages(currentPhysicalPlan)
@@ -641,29 +645,35 @@ case class AdaptiveSparkPlanExec(
   /**
    * Re-optimize and run physical planning on the current logical plan based on the latest stats.
    */
-  private def reOptimize(logicalPlan: LogicalPlan): (SparkPlan, LogicalPlan) = {
-    logicalPlan.invalidateStatsCache()
-    val optimized = optimizer.execute(logicalPlan)
-    val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
-    val newPlan = applyPhysicalRules(
-      sparkPlan,
-      preprocessingRules ++ queryStagePreparationRules,
-      Some((planChangeLogger, "AQE Replanning")))
-
-    // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
-    // add the `BroadcastExchangeExec` node manually in the DPP subquery,
-    // not through `EnsureRequirements` rule. Therefore, when the DPP subquery is complicated
-    // and need to be re-optimized, AQE also need to manually insert the `BroadcastExchangeExec`
-    // node to prevent the loss of the `BroadcastExchangeExec` node in DPP subquery.
-    // Here, we also need to avoid to insert the `BroadcastExchangeExec` node when the newPlan
-    // is already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule.
-    val finalPlan = currentPhysicalPlan match {
-      case b: BroadcastExchangeLike
-        if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan))
-      case _ => newPlan
-    }
+  private def reOptimize(logicalPlan: LogicalPlan): Option[(SparkPlan, LogicalPlan)] = {
+    try {
+      logicalPlan.invalidateStatsCache()
+      val optimized = optimizer.execute(logicalPlan)
+      val sparkPlan = context.session.sessionState.planner.plan(ReturnAnswer(optimized)).next()
+      val newPlan = applyPhysicalRules(
+        sparkPlan,
+        preprocessingRules ++ queryStagePreparationRules,
+        Some((planChangeLogger, "AQE Replanning")))
+
+      // When both enabling AQE and DPP, `PlanAdaptiveDynamicPruningFilters` rule will
+      // add the `BroadcastExchangeExec` node manually in the DPP subquery,
+      // not through `EnsureRequirements` rule. Therefore, when the DPP subquery is complicated
+      // and need to be re-optimized, AQE also need to manually insert the `BroadcastExchangeExec`
+      // node to prevent the loss of the `BroadcastExchangeExec` node in DPP subquery.
+      // Here, we also need to avoid to insert the `BroadcastExchangeExec` node when the newPlan is
+      // already the `BroadcastExchangeExec` plan after apply the `LogicalQueryStageStrategy` rule.
+      val finalPlan = currentPhysicalPlan match {
+        case b: BroadcastExchangeLike
+          if (!newPlan.isInstanceOf[BroadcastExchangeLike]) => b.withNewChildren(Seq(newPlan))
+        case _ => newPlan
+      }

-    (finalPlan, optimized)
+      Some((finalPlan, optimized))
+    } catch {
+      case e: InvalidAQEPlanException[_] =>
+        logOnLevel(s"Re-optimize - ${e.getMessage()}:\n${e.plan}")
+        None
+    }
   }

   /**
```
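
`reOptimize` now returns `Option[(SparkPlan, LogicalPlan)]` and maps a thrown `InvalidAQEPlanException` to `None`, so the caller simply keeps the current plan when replanning produced an invalid one. A toy sketch of that exception-to-`Option` pattern (illustrative names throughout, not Spark's API):

```scala
object ToyReplanner {
  case class ToyPlanException(message: String) extends Exception(message)

  // Fallible re-planning step: a domain-specific failure becomes None
  // instead of propagating, so callers can fall back gracefully.
  def reOptimize(plan: String): Option[String] =
    try {
      if (plan.contains("invalid")) throw ToyPlanException(s"rejected: $plan")
      Some(plan + " (replanned)")  // stand-in for the re-optimized plan
    } catch {
      case e: ToyPlanException =>
        println(s"Re-optimize - ${e.getMessage}")
        None
    }

  def main(args: Array[String]): Unit = {
    val current = "scan->join"
    // Only adopt the new plan when re-optimization succeeded.
    val next = reOptimize(current).getOrElse(current)
    println(next)
  }
}
```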
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/InvalidAQEPlanException.scala

Lines changed: 30 additions & 0 deletions (new file)

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.adaptive

import org.apache.spark.sql.catalyst.plans.QueryPlan

/**
 * Exception thrown when an invalid query plan is detected in AQE replanning,
 * in which case AQE will stop the current replanning process and keep using the latest valid plan.
 *
 * @param message The reason why the plan is considered invalid.
 * @param plan The invalid plan/sub-plan.
 */
case class InvalidAQEPlanException[QueryType <: QueryPlan[_]](message: String, plan: QueryType)
  extends Exception(message)
```
sql/core/src/main/scala/org/apache/spark/sql/execution/adaptive/ValidateSparkPlan.scala

Lines changed: 68 additions & 0 deletions (new file)

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.adaptive

import org.apache.spark.sql.catalyst.optimizer.{BuildLeft, BuildRight}
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BroadcastNestedLoopJoinExec}

/**
 * Detects invalid physical plans generated by AQE replanning and throws `InvalidAQEPlanException`
 * if such plans are detected. This rule should be called after EnsureRequirements where all
 * necessary Exchange nodes are added.
 */
object ValidateSparkPlan extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    validate(plan)
    plan
  }

  /**
   * Validate that the plan satisfies the following condition:
   * - BroadcastQueryStage only appears as the immediate child and the build side of a broadcast
   *   hash join or broadcast nested loop join.
   */
  private def validate(plan: SparkPlan): Unit = plan match {
    case b: BroadcastHashJoinExec =>
      val (buildPlan, probePlan) = b.buildSide match {
        case BuildLeft => (b.left, b.right)
        case BuildRight => (b.right, b.left)
      }
      if (!buildPlan.isInstanceOf[BroadcastQueryStageExec]) {
        validate(buildPlan)
      }
      validate(probePlan)
    case b: BroadcastNestedLoopJoinExec =>
      val (buildPlan, probePlan) = b.buildSide match {
        case BuildLeft => (b.left, b.right)
        case BuildRight => (b.right, b.left)
      }
      if (!buildPlan.isInstanceOf[BroadcastQueryStageExec]) {
        validate(buildPlan)
      }
      validate(probePlan)
    case q: BroadcastQueryStageExec => errorOnInvalidBroadcastQueryStage(q)
    case _ => plan.children.foreach(validate)
  }

  private def errorOnInvalidBroadcastQueryStage(plan: SparkPlan): Unit = {
    throw InvalidAQEPlanException("Invalid broadcast query stage", plan)
  }
}
```
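
The rule accepts a `BroadcastQueryStageExec` only as the immediate build-side child of a broadcast join and raises on any other placement; `reOptimize` then catches the exception and keeps the last valid plan. A toy model of the same walk (the `Toy*` tree is illustrative and fixes the build side to the left, unlike the real rule's `BuildLeft`/`BuildRight` dispatch):

```scala
sealed trait ToyPlan { def children: Seq[ToyPlan] }
case class ToyJoin(build: ToyPlan, probe: ToyPlan) extends ToyPlan {
  def children: Seq[ToyPlan] = Seq(build, probe)
}
case class ToyBroadcastStage(name: String) extends ToyPlan {
  def children: Seq[ToyPlan] = Nil
}
case class ToyScan(name: String) extends ToyPlan {
  def children: Seq[ToyPlan] = Nil
}

object ToyValidate {
  // Mirrors ValidateSparkPlan.validate: a broadcast stage directly on a
  // join's build side is legal (and skipped); one found anywhere else throws.
  def validate(plan: ToyPlan): Unit = plan match {
    case j: ToyJoin =>
      if (!j.build.isInstanceOf[ToyBroadcastStage]) validate(j.build)
      validate(j.probe)
    case b: ToyBroadcastStage =>
      throw new IllegalStateException(s"invalid broadcast stage: ${b.name}")
    case other => other.children.foreach(validate)
  }

  def main(args: Array[String]): Unit = {
    validate(ToyJoin(ToyBroadcastStage("b1"), ToyScan("t")))  // passes
    validate(ToyJoin(ToyScan("t"), ToyBroadcastStage("b2")))  // throws
  }
}
```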

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/DataSourceV2Strategy.scala

Lines changed: 13 additions & 25 deletions
```diff
@@ -491,12 +491,9 @@ class DataSourceV2Strategy(session: SparkSession) extends Strategy with Predicat

 private[sql] object DataSourceV2Strategy {

-  private def translateLeafNodeFilterV2(
-      predicate: Expression,
-      supportNestedPredicatePushdown: Boolean): Option[Predicate] = {
-    val pushablePredicate = PushablePredicate(supportNestedPredicatePushdown)
+  private def translateLeafNodeFilterV2(predicate: Expression): Option[Predicate] = {
     predicate match {
-      case pushablePredicate(expr) => Some(expr)
+      case PushablePredicate(expr) => Some(expr)
       case _ => None
     }
   }
@@ -506,10 +503,8 @@ private[sql] object DataSourceV2Strategy {
    *
    * @return a `Some[Filter]` if the input [[Expression]] is convertible, otherwise a `None`.
    */
-  protected[sql] def translateFilterV2(
-      predicate: Expression,
-      supportNestedPredicatePushdown: Boolean): Option[Predicate] = {
-    translateFilterV2WithMapping(predicate, None, supportNestedPredicatePushdown)
+  protected[sql] def translateFilterV2(predicate: Expression): Option[Predicate] = {
+    translateFilterV2WithMapping(predicate, None)
   }

   /**
@@ -523,8 +518,7 @@ private[sql] object DataSourceV2Strategy {
    */
   protected[sql] def translateFilterV2WithMapping(
       predicate: Expression,
-      translatedFilterToExpr: Option[mutable.HashMap[Predicate, Expression]],
-      nestedPredicatePushdownEnabled: Boolean)
+      translatedFilterToExpr: Option[mutable.HashMap[Predicate, Expression]])
     : Option[Predicate] = {
     predicate match {
       case And(left, right) =>
@@ -538,26 +532,21 @@ private[sql] object DataSourceV2Strategy {
         // Pushing one leg of AND down is only safe to do at the top level.
         // You can see ParquetFilters' createFilter for more details.
         for {
-          leftFilter <- translateFilterV2WithMapping(
-            left, translatedFilterToExpr, nestedPredicatePushdownEnabled)
-          rightFilter <- translateFilterV2WithMapping(
-            right, translatedFilterToExpr, nestedPredicatePushdownEnabled)
+          leftFilter <- translateFilterV2WithMapping(left, translatedFilterToExpr)
+          rightFilter <- translateFilterV2WithMapping(right, translatedFilterToExpr)
         } yield new V2And(leftFilter, rightFilter)

       case Or(left, right) =>
         for {
-          leftFilter <- translateFilterV2WithMapping(
-            left, translatedFilterToExpr, nestedPredicatePushdownEnabled)
-          rightFilter <- translateFilterV2WithMapping(
-            right, translatedFilterToExpr, nestedPredicatePushdownEnabled)
+          leftFilter <- translateFilterV2WithMapping(left, translatedFilterToExpr)
+          rightFilter <- translateFilterV2WithMapping(right, translatedFilterToExpr)
         } yield new V2Or(leftFilter, rightFilter)

       case Not(child) =>
-        translateFilterV2WithMapping(child, translatedFilterToExpr, nestedPredicatePushdownEnabled)
-          .map(new V2Not(_))
+        translateFilterV2WithMapping(child, translatedFilterToExpr).map(new V2Not(_))

       case other =>
-        val filter = translateLeafNodeFilterV2(other, nestedPredicatePushdownEnabled)
+        val filter = translateLeafNodeFilterV2(other)
         if (filter.isDefined && translatedFilterToExpr.isDefined) {
           translatedFilterToExpr.get(filter.get) = predicate
         }
@@ -589,10 +578,9 @@ private[sql] object DataSourceV2Strategy {
 /**
  * Get the expression of DS V2 to represent catalyst predicate that can be pushed down.
  */
-case class PushablePredicate(nestedPredicatePushdownEnabled: Boolean) {
-
+object PushablePredicate {
   def unapply(e: Expression): Option[Predicate] =
-    new V2ExpressionBuilder(e, nestedPredicatePushdownEnabled, true).build().map { v =>
+    new V2ExpressionBuilder(e, true).build().map { v =>
       assert(v.isInstanceOf[Predicate])
       v.asInstanceOf[Predicate]
     }
```
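
With the `nestedPredicatePushdownEnabled` flag gone, `PushablePredicate` changes from a configured `case class` whose *instances* served as extractors to a plain `object` extractor matched by name. A toy contrast of the two shapes (illustrative matchers, not Spark's):

```scala
// Instance-based extractor: configuration is captured in a value, and
// that value is used in the pattern (as the old pushablePredicate was).
case class ConfiguredMatcher(allowNegative: Boolean) {
  def unapply(n: Int): Option[Int] =
    if (n >= 0 || allowNegative) Some(n) else None
}

// Object extractor: no configuration, matched directly by name
// (the shape PushablePredicate now has).
object SimpleMatcher {
  def unapply(n: Int): Option[Int] = if (n >= 0) Some(n) else None
}

object ExtractorDemo extends App {
  val configured = ConfiguredMatcher(allowNegative = true)
  -3 match {
    case configured(v) => println(s"configured matched: $v")  // matches
    case _             => println("configured: no match")
  }
  -3 match {
    case SimpleMatcher(v) => println(s"simple matched: $v")
    case _                => println("simple: no match")      // taken
  }
}
```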

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/v2/PushDownUtils.scala

Lines changed: 1 addition & 1 deletion
```diff
@@ -80,7 +80,7 @@ object PushDownUtils extends PredicateHelper {
     for (filterExpr <- filters) {
       val translated =
         DataSourceV2Strategy.translateFilterV2WithMapping(
-          filterExpr, Some(translatedFilterToExpr), nestedPredicatePushdownEnabled = true)
+          filterExpr, Some(translatedFilterToExpr))
       if (translated.isEmpty) {
         untranslatableExprs += filterExpr
       } else {
```
