[SPARK-23128][SQL] A new approach to do adaptive execution in Spark SQL #20303
Changes from 12 commits
SQLConf.scala

@@ -282,14 +282,19 @@ object SQLConf {

   val SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS =
     buildConf("spark.sql.adaptive.minNumPostShufflePartitions")
       .internal()
-      .doc("The advisory minimal number of post-shuffle partitions provided to " +
-        "ExchangeCoordinator. This setting is used in our test to make sure we " +
-        "have enough parallelism to expose issues that will not be exposed with a " +
-        "single partition. When the value is a non-positive value, this setting will " +
-        "not be provided to ExchangeCoordinator.")
+      .doc("The advisory minimum number of post-shuffle partitions used in adaptive execution.")
       .intConf
-      .createWithDefault(-1)
+      .checkValue(numPartitions => numPartitions > 0, "The minimum shuffle partition number " +
+        "must be a positive integer.")
+      .createWithDefault(1)
+
+  val SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS =
+    buildConf("spark.sql.adaptive.maxNumPostShufflePartitions")
+      .doc("The advisory maximum number of post-shuffle partitions used in adaptive execution.")
+      .intConf
+      .checkValue(numPartitions => numPartitions > 0, "The maximum shuffle partition number " +
+        "must be a positive integer.")
+      .createWithDefault(500)

   val SUBEXPRESSION_ELIMINATION_ENABLED =
     buildConf("spark.sql.subexpressionElimination.enabled")

@@ -1728,8 +1733,9 @@ class SQLConf extends Serializable with Logging {

   def adaptiveExecutionEnabled: Boolean = getConf(ADAPTIVE_EXECUTION_ENABLED)

-  def minNumPostShufflePartitions: Int =
-    getConf(SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS)
+  def minNumPostShufflePartitions: Int = getConf(SHUFFLE_MIN_NUM_POSTSHUFFLE_PARTITIONS)
+
+  def maxNumPostShufflePartitions: Int = getConf(SHUFFLE_MAX_NUM_POSTSHUFFLE_PARTITIONS)

   def minBatchesToRetain: Int = getConf(MIN_BATCHES_TO_RETAIN)
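For reference, the two advisory bounds introduced above are ordinary session configs. A minimal usage sketch (the session setup itself is illustrative; only the three config keys and their defaults come from SQLConf as shown in this diff):

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical session; adjust master/appName as needed.
val spark = SparkSession.builder()
  .master("local[*]")
  .appName("adaptive-shuffle-partitions")
  .config("spark.sql.adaptive.enabled", "true")                     // ADAPTIVE_EXECUTION_ENABLED
  .config("spark.sql.adaptive.minNumPostShufflePartitions", "1")    // advisory lower bound (default 1)
  .config("spark.sql.adaptive.maxNumPostShufflePartitions", "500")  // advisory upper bound (default 500)
  .getOrCreate()

// Spark itself reads these values through the SQLConf getters shown above
// (minNumPostShufflePartitions / maxNumPostShufflePartitions).
println(spark.conf.get("spark.sql.adaptive.maxNumPostShufflePartitions"))  // 500
```

With adaptive execution enabled, the post-shuffle partition count of each stage is then chosen at runtime within this advisory [min, max] range.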
QueryExecution.scala

@@ -30,6 +30,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.util.StringUtils.StringConcat
 import org.apache.spark.sql.catalyst.util.truncatedString
+import org.apache.spark.sql.execution.adaptive.PlanQueryStage
 import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.util.Utils

@@ -93,7 +94,12 @@ class QueryExecution(
    * row format conversions as needed.
    */
   protected def prepareForExecution(plan: SparkPlan): SparkPlan = {
-    preparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) }
+    val rules = if (sparkSession.sessionState.conf.adaptiveExecutionEnabled) {
+      adaptivePreparations
+    } else {
+      preparations
+    }
+    rules.foldLeft(plan) { case (sp, rule) => rule.apply(sp) }
   }

   /** A sequence of rules that will be applied in order to the physical plan before execution. */

@@ -104,6 +110,17 @@ class QueryExecution(
     ReuseExchange(sparkSession.sessionState.conf),
     ReuseSubquery(sparkSession.sessionState.conf))

+  // With adaptive execution, whole stage codegen will be done inside `QueryStageExecutor`.
+  protected def adaptivePreparations: Seq[Rule[SparkPlan]] = Seq(
+    PlanSubqueries(sparkSession),
+    EnsureRequirements(sparkSession.sessionState.conf),
+    ReuseExchange(sparkSession.sessionState.conf),
+    ReuseSubquery(sparkSession.sessionState.conf),
+    // PlanQueryStage needs to be the last rule because it divides the plan into multiple
+    // sub-trees by inserting leaf QueryStage nodes. Transforming the plan after applying this
+    // rule will only transform nodes within a single sub-tree.
+    PlanQueryStage(sparkSession))
+
   def simpleString: String = withRedaction {
     val concat = new StringConcat()
     concat.append("== Physical Plan ==\n")
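Since PlanQueryStage is the last adaptive preparation rule, the root of the executed plan becomes the AdaptiveSparkPlan node introduced later in this PR. A hedged sketch of how one might observe that from user code, assuming the classes exactly as they appear in this diff (the DataFrame and session setup are illustrative):

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col
// Root node added by PlanQueryStage in this PR (package taken from the import above).
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlan

val spark = SparkSession.builder()
  .master("local[*]")
  .config("spark.sql.adaptive.enabled", "true")
  .getOrCreate()

val df = spark.range(0, 10000).groupBy(col("id") % 7).count()

// With adaptive execution enabled, prepareForExecution folds adaptivePreparations over the
// physical plan, so PlanQueryStage runs last and wraps everything in AdaptiveSparkPlan.
df.queryExecution.executedPlan match {
  case plan: AdaptiveSparkPlan => println(s"adaptive root: ${plan.nodeName}")
  case other                   => println(s"non-adaptive root: ${other.nodeName}")
}
```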
AdaptiveSparkPlan.scala (new file)

@@ -0,0 +1,126 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.adaptive

import java.util.concurrent.CountDownLatch

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.{LeafExecNode, SparkPlan, SparkPlanInfo, SQLExecution}
import org.apache.spark.sql.execution.ui.SparkListenerSQLAdaptiveExecutionUpdate

/**
 * A root node to trigger query stages and execute the query plan adaptively. It incrementally
 * updates the query plan when a query stage is materialized and provides accurate runtime
 * statistics.
 */
case class AdaptiveSparkPlan(initialPlan: ResultQueryStage, session: SparkSession)
  extends LeafExecNode {

  override def output: Seq[Attribute] = initialPlan.output

  @volatile private var currentQueryStage: QueryStage = initialPlan
  @volatile private var error: Throwable = null
  private val readyLock = new CountDownLatch(1)

  private def replaceStage(oldStage: QueryStage, newStage: QueryStage): QueryStage = {
    if (oldStage.id == newStage.id) {
      newStage
    } else {
      val newPlanForOldStage = oldStage.plan.transform {
        case q: QueryStage => replaceStage(q, newStage)
      }
      oldStage.withNewPlan(newPlanForOldStage)
    }
  }

  private def createCallback(executionId: Option[Long]): QueryStageTriggerCallback = {
    new QueryStageTriggerCallback {
      override def onStageUpdated(stage: QueryStage): Unit = {
        updateCurrentQueryStage(stage, executionId)
        if (stage.isInstanceOf[ResultQueryStage]) readyLock.countDown()
      }

      override def onStagePlanningFailed(stage: QueryStage, e: Throwable): Unit = {
        error = new RuntimeException(
          s"""
             |Fail to plan stage ${stage.id}:
             |${stage.plan.treeString}
           """.stripMargin, e)
        readyLock.countDown()
      }

      override def onStageMaterializingFailed(stage: QueryStage, e: Throwable): Unit = {
        error = new RuntimeException(
          s"""
             |Fail to materialize stage ${stage.id}:
             |${stage.plan.treeString}
           """.stripMargin, e)
        readyLock.countDown()
      }

      override def onError(e: Throwable): Unit = {
        error = e
        readyLock.countDown()
      }
    }
  }

  private def updateCurrentQueryStage(newStage: QueryStage, executionId: Option[Long]): Unit = {
    currentQueryStage = replaceStage(currentQueryStage, newStage)
    executionId.foreach { id =>
      session.sparkContext.listenerBus.post(SparkListenerSQLAdaptiveExecutionUpdate(
        id,
        SQLExecution.getQueryExecution(id).toString,
        SparkPlanInfo.fromSparkPlan(currentQueryStage)))
    }
  }

  def resultStage: ResultQueryStage = {
    if (readyLock.getCount > 0) {
Review comment (on `readyLock.getCount`): Is finalPlan called/used concurrently from multiple threads? If yes, the use of readyLock.getCount may cause a problem. For example, two threads could both find getCount > 0, and each would create a QueryStageCreator before readyLock.countDown is called.

Author (Contributor) reply: It should only be called in a single thread. @cloud-fan may confirm it.
      val sc = session.sparkContext
      val executionId = Option(sc.getLocalProperty(SQLExecution.EXECUTION_ID_KEY)).map(_.toLong)
      val trigger = new QueryStageTrigger(session, createCallback(executionId))
      trigger.start()
      trigger.trigger(initialPlan)
      readyLock.await()
      trigger.stop()
    }

    if (error != null) throw error
    currentQueryStage.asInstanceOf[ResultQueryStage]
  }

  override def executeCollect(): Array[InternalRow] = resultStage.executeCollect()
  override def executeTake(n: Int): Array[InternalRow] = resultStage.executeTake(n)
  override def executeToIterator(): Iterator[InternalRow] = resultStage.executeToIterator()
  override def doExecute(): RDD[InternalRow] = resultStage.execute()

  override def generateTreeString(
      depth: Int,
      lastChildren: Seq[Boolean],
      append: String => Unit,
      verbose: Boolean,
      prefix: String = "",
      addSuffix: Boolean = false,
      maxFields: Int): Unit = {
    currentQueryStage.generateTreeString(
      depth, lastChildren, append, verbose, "", false, maxFields)
  }
}
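resultStage above follows a common pattern: start asynchronous work, block the calling thread on a CountDownLatch, and let the callback either install the final result or record an error before releasing the latch. A self-contained sketch of just that pattern; Worker, Callback, and LatchDemo are made-up names for illustration, not Spark classes:

```scala
import java.util.concurrent.CountDownLatch

// Generic callback, analogous in shape to QueryStageTriggerCallback above.
trait Callback {
  def onDone(result: String): Unit
  def onError(e: Throwable): Unit
}

// Toy asynchronous worker standing in for QueryStageTrigger: it reports through the callback.
class Worker(callback: Callback) {
  def start(): Unit = new Thread(() => {
    try callback.onDone("materialized")
    catch { case e: Throwable => callback.onError(e) }
  }).start()
}

object LatchDemo extends App {
  @volatile var result: String = null
  @volatile var error: Throwable = null
  val readyLatch = new CountDownLatch(1)

  // The callback records the outcome and releases whoever is blocked on the latch.
  new Worker(new Callback {
    def onDone(r: String): Unit = { result = r; readyLatch.countDown() }
    def onError(e: Throwable): Unit = { error = e; readyLatch.countDown() }
  }).start()

  readyLatch.await()              // same role as readyLock.await() in resultStage
  if (error != null) throw error  // surface asynchronous failures on the calling thread
  println(s"final result: $result")
}
```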
PlanQueryStage.scala (new file)

@@ -0,0 +1,57 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.execution.adaptive

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, Exchange, ReusedExchangeExec, ShuffleExchangeExec}

/**
 * Divides the Spark plan into multiple QueryStages. For each Exchange in the plan, it wraps it
 * with a [[QueryStage]]. At the end it adds an [[AdaptiveSparkPlan]] at the top, which will drive
 * the execution of query stages.
 */
case class PlanQueryStage(session: SparkSession) extends Rule[SparkPlan] {

  def apply(plan: SparkPlan): SparkPlan = {
    var id = 0
    val exchangeToQueryStage = new java.util.IdentityHashMap[Exchange, QueryStage]
    val planWithStages = plan.transformUp {
      case e: ShuffleExchangeExec =>
        val queryStage = ShuffleQueryStage(id, e)
        id += 1
        exchangeToQueryStage.put(e, queryStage)
        queryStage
      case e: BroadcastExchangeExec =>
        val queryStage = BroadcastQueryStage(id, e)
        id += 1
        exchangeToQueryStage.put(e, queryStage)
        queryStage
      // The `ReusedExchangeExec` was added by the rule `ReuseExchange`, which transforms up the
      // query plan. This rule also transforms up the query plan, so when we hit a
      // `ReusedExchangeExec` here, the exchange being reused must have been visited already and
      // there should be an entry for it in `exchangeToQueryStage`.
      case e: ReusedExchangeExec =>
        val existingQueryStage = exchangeToQueryStage.get(e.child)
        assert(existingQueryStage != null, "The exchange being reused should have been visited before.")
        ReusedQueryStage(existingQueryStage, e.output)
    }
    AdaptiveSparkPlan(ResultQueryStage(id, planWithStages), session)
  }
}
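The exchange-to-stage map above is keyed by reference identity on purpose: Spark plan nodes are case classes, so two structurally identical exchanges in different parts of the tree would compare equal and collide in a regular HashMap, while java.util.IdentityHashMap keeps them apart. A standalone sketch of the difference; the Node class is illustrative, not a Spark type:

```scala
import java.util.{HashMap => JHashMap, IdentityHashMap}

// Illustrative stand-in for a plan node: case classes use structural (value) equality.
case class Node(name: String)

object IdentityMapDemo extends App {
  val a = Node("exchange")
  val b = Node("exchange") // equal to `a` by value, but a different object in the tree

  val byValue = new JHashMap[Node, Int]()
  byValue.put(a, 1)
  byValue.put(b, 2)           // overwrites the entry for `a`, because a == b
  println(byValue.size())     // 1

  val byIdentity = new IdentityHashMap[Node, Int]()
  byIdentity.put(a, 1)
  byIdentity.put(b, 2)        // kept separate: keys are compared by reference (eq)
  println(byIdentity.size())  // 2
}
```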
Review comment (on the `checkValue` lambda): super nit: we can simply write `_ > 0`.
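The suggestion is just Scala's placeholder syntax for a one-argument lambda; the explicit form used in the diff and the shorthand are the same function. A small REPL-style check (the val names are illustrative):

```scala
// Explicit lambda, as written in the diff, and the `_ > 0` shorthand suggested above.
val explicitCheck: Int => Boolean = numPartitions => numPartitions > 0
val shorthandCheck: Int => Boolean = _ > 0

assert(explicitCheck(500) && shorthandCheck(500))   // positive values pass
assert(!explicitCheck(-1) && !shorthandCheck(-1))   // non-positive values fail
// In SQLConf the call would then read: .checkValue(_ > 0, "... must be a positive integer.")
```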