[SPARK-28346][SQL] clone the query plan between analyzer, optimizer and planner #25111
Changes from 3 commits
```diff
@@ -28,7 +28,7 @@ import org.apache.spark.sql.catalyst.analysis.UnsupportedOperationChecker
 import org.apache.spark.sql.catalyst.plans.QueryPlan
 import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, ReturnAnswer}
 import org.apache.spark.sql.catalyst.rules.Rule
-import org.apache.spark.sql.catalyst.util.StringUtils.{PlanStringConcat, StringConcat}
+import org.apache.spark.sql.catalyst.util.StringUtils.PlanStringConcat
 import org.apache.spark.sql.catalyst.util.truncatedString
 import org.apache.spark.sql.execution.adaptive.InsertAdaptiveSparkPlan
 import org.apache.spark.sql.execution.exchange.{EnsureRequirements, ReuseExchange}
```
```diff
@@ -60,36 +60,31 @@ class QueryExecution(

   lazy val analyzed: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.ANALYSIS) {
     SparkSession.setActiveSession(sparkSession)
+    // We can't clone `logical` here, which will reset the `_analyzed` flag.
     sparkSession.sessionState.analyzer.executeAndCheck(logical, tracker)
   }

   lazy val withCachedData: LogicalPlan = {
     assertAnalyzed()
     assertSupported()
-    sparkSession.sharedState.cacheManager.useCachedData(analyzed)
+    sparkSession.sharedState.cacheManager.useCachedData(analyzed.clone())
   }

   lazy val optimizedPlan: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.OPTIMIZATION) {
-    sparkSession.sessionState.optimizer.executeAndTrack(withCachedData, tracker)
+    sparkSession.sessionState.optimizer.executeAndTrack(withCachedData.clone(), tracker)
   }
```

Review thread on the `optimizedPlan` change:

**Member:** If we decide to clone the plan after each stage, will any test fail if we do not clone it?

**Contributor (Author):** Test added.
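To make the motivation concrete, here is a minimal, self-contained sketch (toy classes, not Spark's actual `TreeNode` API) of the failure mode these `clone()` calls guard against: later phases mutate per-node metadata in place, and without a defensive copy that mutation leaks into the snapshot an earlier phase already returned.

```scala
// Toy tree node with mutable per-node metadata, loosely modeled on
// TreeNode's tags. All names here are illustrative.
final class Node(val name: String, val children: Seq[Node]) {
  val tags = scala.collection.mutable.Map.empty[String, String]

  // Deep copy: fresh node instances, with metadata copied so the
  // two trees can evolve independently afterwards.
  override def clone(): Node = {
    val copied = new Node(name, children.map(_.clone()))
    copied.tags ++= tags
    copied
  }
}

val analyzed = new Node("Filter", Seq(new Node("Range", Nil)))

// Phase isolation: hand the next phase a clone and mutate the clone freely.
val forOptimizer = analyzed.clone()
forOptimizer.tags("phase") = "optimized"

assert(analyzed.tags.isEmpty)      // the analyzed snapshot stays pristine
assert(forOptimizer.tags.nonEmpty) // only the optimizer's copy changed
```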
```diff
   lazy val sparkPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) {
     SparkSession.setActiveSession(sparkSession)
-    // Runtime re-optimization requires a unique instance of every node in the logical plan.
-    val logicalPlan = if (sparkSession.sessionState.conf.adaptiveExecutionEnabled) {
-      optimizedPlan.clone()
-    } else {
-      optimizedPlan
-    }
     // TODO: We use next(), i.e. take the first plan returned by the planner, here for now,
     //       but we will implement to choose the best plan.
-    planner.plan(ReturnAnswer(logicalPlan)).next()
+    planner.plan(ReturnAnswer(optimizedPlan.clone())).next()
   }

   // executedPlan should not be used to initialize any SparkPlan. It should be
   // only used for execution.
   lazy val executedPlan: SparkPlan = tracker.measurePhase(QueryPlanningTracker.PLANNING) {
-    prepareForExecution(sparkPlan)
+    prepareForExecution(sparkPlan.clone())
   }
```
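For context, this is roughly how `prepareForExecution` consumes that clone (a paraphrase of the rule-sequencing logic in this file, not new behavior): it folds the physical preparation rules over the plan, so starting from `sparkPlan.clone()` leaves the `sparkPlan` snapshot untouched for debugging and explain output.

```scala
// Paraphrased shape of prepareForExecution (see this file for the real
// list of rules): each preparation rule may rewrite the physical tree,
// so the fold starts from the clone rather than the shared sparkPlan.
protected def prepareForExecution(plan: SparkPlan): SparkPlan = {
  preparations.foldLeft(plan) { case (sp, rule) => rule.apply(sp) }
}
```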
```diff
@@ -20,7 +20,7 @@ package org.apache.spark.sql.execution.command
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.sql.catalyst.expressions.Attribute
-import org.apache.spark.sql.catalyst.plans.logical.IgnoreCachedData
+import org.apache.spark.sql.catalyst.plans.logical.{IgnoreCachedData, LogicalPlan}
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.StaticSQLConf.CATALOG_IMPLEMENTATION
 import org.apache.spark.sql.types.{StringType, StructField, StructType}
```
```diff
@@ -168,4 +168,6 @@ case object ResetCommand extends RunnableCommand with IgnoreCachedData {
     sparkSession.sessionState.conf.clear()
     Seq.empty[Row]
   }
+
+  override def clone(): LogicalPlan = this
 }
```
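One note on the pattern above: `ResetCommand` is a `case object`, a stateless singleton with no children, so there is nothing per-query to isolate and no constructor for a structural copy to call; returning `this` satisfies the clone contract. A hypothetical command following the same pattern (illustrative name, assuming the `RunnableCommand` trait from this package):

```scala
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.RunnableCommand

// Hypothetical no-argument command: as a singleton with no children and no
// mutable per-query fields, sharing the single instance across query plans
// is safe, so clone() can simply return this.
case object NoopCommand extends RunnableCommand {
  override def run(sparkSession: SparkSession): Seq[Row] = Seq.empty[Row]
  override def clone(): LogicalPlan = this
}
```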
```diff
@@ -52,4 +52,8 @@ case class SaveIntoDataSourceCommand(
     val redacted = SQLConf.get.redactOptions(options)
     s"SaveIntoDataSourceCommand ${dataSource}, ${redacted}, ${mode}"
   }
+
+  override def clone(): LogicalPlan = {
+    SaveIntoDataSourceCommand(query.clone(), dataSource, options, mode)
+  }
 }
```
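The explicit override above matters because the command keeps `query` as a constructor field rather than a child node, so a plain structural copy would reuse the same sub-plan instance. A toy illustration of that shallow-copy pitfall (hypothetical classes, not Spark's):

```scala
// Case-class copy duplicates the outer node but shares field references.
case class Inner(var label: String)
case class Outer(inner: Inner)

val original = Outer(Inner("q"))
val shallow  = original.copy()

assert(shallow.inner eq original.inner)   // the same Inner instance is shared
shallow.inner.label = "mutated"
assert(original.inner.label == "mutated") // mutation leaks back to the original

// The override above avoids this by cloning the nested plan explicitly:
// SaveIntoDataSourceCommand(query.clone(), dataSource, options, mode)
```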
```diff
@@ -19,7 +19,8 @@ package org.apache.spark.sql.execution
 import scala.io.Source

 import org.apache.spark.sql.AnalysisException
-import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, OneRowRelation}
+import org.apache.spark.sql.catalyst.plans.logical.{Filter, LogicalPlan, OneRowRelation, SubqueryAlias}
+import org.apache.spark.sql.catalyst.trees.TreeNodeTag
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSQLContext
```
```diff
@@ -138,4 +139,27 @@ class QueryExecutionSuite extends SharedSQLContext {
     val error = intercept[Error](qe.toString)
     assert(error.getMessage.contains("error"))
   }
+
+  test("analyzed plan should not change after it's generated") {
+    val df = spark.range(10).filter('id > 0).as("a")
+    val analyzedPlan = df.queryExecution.analyzed
+    val tag = new TreeNodeTag[String]("test")
+    analyzedPlan.setTagValue(tag, "tag")
+
+    def checkPlan(l: LogicalPlan): Unit = {
+      assert(l.isInstanceOf[SubqueryAlias])
+      val sub = l.asInstanceOf[SubqueryAlias]
+      assert(sub.child.isInstanceOf[Filter])
+      assert(sub.getTagValue(tag).isDefined)
+      assert(sub.child.getTagValue(tag).isEmpty)
+    }
+    checkPlan(analyzedPlan)
+
+    val df2 = df.filter('id > 0)
+    // trigger optimization
+    df2.queryExecution.optimizedPlan
+
+    // The previous analyzed plan should not get changed.
+    checkPlan(analyzedPlan)
+  }
 }
```
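For readers new to the API this test leans on: `TreeNodeTag` is a typed key for per-node metadata, set and read on individual plan nodes rather than whole trees. A minimal usage sketch (names illustrative, assuming a live `spark` session):

```scala
import org.apache.spark.sql.catalyst.trees.TreeNodeTag

// A typed tag key; the string is just a human-readable label for the key.
val marker = new TreeNodeTag[String]("marker")

val plan = spark.range(1).queryExecution.analyzed
plan.setTagValue(marker, "hello")

assert(plan.getTagValue(marker).contains("hello"))       // set on this node only
assert(plan.children.forall(_.getTagValue(marker).isEmpty)) // children untouched
```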
Follow-up review thread:

**Reviewer:** Maybe not necessary, but should we clone `logical` too before sending it to the analyzer?

**Author:** Yeah, I think we should.
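A hedged sketch of what that follow-up could look like (not part of this diff, and subject to the caveat in the code comment above: `clone()` resets the `_analyzed` flag, so a `logical` plan that was already analyzed would be re-analyzed):

```scala
// Possible follow-up (illustrative only): also isolate the caller's plan by
// analyzing a clone of `logical` rather than `logical` itself.
lazy val analyzed: LogicalPlan = tracker.measurePhase(QueryPlanningTracker.ANALYSIS) {
  SparkSession.setActiveSession(sparkSession)
  sparkSession.sessionState.analyzer.executeAndCheck(logical.clone(), tracker)
}
```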