-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-29800][SQL] Rewrite non-correlated EXISTS subquery to use ScalarSubquery to optimize perf #26437
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
2d762b4
1c577bc
5fa971b
1401349
7b943aa
95e446d
20cda42
8e3ce4f
c290411
866ddc7
3de0ecc
32f85c3
e47a757
4c86605
626e41f
ce76e0c
4a4ca9b
88f804d
7668bd6
a6b8485
34046be
4c6c04d
ac6a4d2
59162c6
89a1721
fb98b54
67b4281
821ed40
e319fee
2c387f2
2aff8eb
2b7b417
88fcdbf
9f084ee
8c6060a
9a9d9d1
173942d
26258b0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -106,12 +106,20 @@ object RewritePredicateSubquery extends Rule[LogicalPlan] with PredicateHelper { | |
|
|
||
| // Filter the plan by applying left semi and left anti joins. | ||
| withSubquery.foldLeft(newFilter) { | ||
| case (p, Exists(sub, conditions, _)) => | ||
| val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p) | ||
| buildJoin(outerPlan, sub, LeftSemi, joinCond) | ||
| case (p, Not(Exists(sub, conditions, _))) => | ||
| val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p) | ||
| buildJoin(outerPlan, sub, LeftAnti, joinCond) | ||
| case (p, exists @ Exists(sub, conditions, _)) => | ||
| if (SubqueryExpression.hasCorrelatedSubquery(exists)) { | ||
| val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p) | ||
| buildJoin(outerPlan, sub, LeftSemi, joinCond) | ||
| } else { | ||
| Filter(exists, newFilter) | ||
| } | ||
| case (p, Not(exists @ Exists(sub, conditions, _))) => | ||
| if (SubqueryExpression.hasCorrelatedSubquery(exists)) { | ||
| val (joinCond, outerPlan) = rewriteExistentialExpr(conditions, p) | ||
| buildJoin(outerPlan, sub, LeftAnti, joinCond) | ||
| } else { | ||
| Filter(Not(exists), newFilter) | ||
| } | ||
|
||
| case (p, InSubquery(values, ListQuery(sub, conditions, _, _))) => | ||
| // Deduplicate conflicting attributes if any. | ||
| val newSub = dedupSubqueryOnSelfJoin(p, sub, Some(values)) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,7 @@ import scala.collection.mutable.ArrayBuffer | |
| import org.apache.spark.broadcast.Broadcast | ||
| import org.apache.spark.sql.SparkSession | ||
| import org.apache.spark.sql.catalyst.{expressions, InternalRow} | ||
| import org.apache.spark.sql.catalyst.expressions.{AttributeSeq, CreateNamedStruct, Expression, ExprId, InSet, ListQuery, Literal, PlanExpression} | ||
| import org.apache.spark.sql.catalyst.expressions.{CreateNamedStruct, ExistsSubquery, Expression, ExprId, InSet, ListQuery, Literal, PlanExpression} | ||
| import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} | ||
| import org.apache.spark.sql.catalyst.rules.Rule | ||
| import org.apache.spark.sql.internal.SQLConf | ||
|
|
@@ -171,6 +171,63 @@ case class InSubqueryExec( | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * The physical node of an EXISTS subquery. This supports using EXISTS in a join's ON condition: | ||
|
||
| * for some join types the EXISTS condition cannot be pushed down, so it is planned here. | ||
| */ | ||
| case class ExistsExec(child: Expression, | ||
|
||
| subQuery: String, | ||
| plan: BaseSubqueryExec, | ||
| exprId: ExprId, | ||
| private var resultBroadcast: Broadcast[Boolean] = null) | ||
|
||
| extends ExecSubqueryExpression { | ||
|
|
||
| @transient private var result: Boolean = _ | ||
|
||
|
|
||
| override def dataType: DataType = BooleanType | ||
| override def children: Seq[Expression] = child :: Nil | ||
| override def nullable: Boolean = child.nullable | ||
| override def toString: String = s"EXISTS ${plan.name}" | ||
|
||
| override def withNewPlan(plan: BaseSubqueryExec): ExistsExec = copy(plan = plan) | ||
|
|
||
| override def semanticEquals(other: Expression): Boolean = other match { | ||
| case in: ExistsExec => child.semanticEquals(in.child) && plan.sameResult(in.plan) | ||
| case _ => false | ||
| } | ||
|
|
||
|
|
||
| def updateResult(): Unit = { | ||
| result = !plan.execute().isEmpty() | ||
|
||
| resultBroadcast = plan.sqlContext.sparkContext.broadcast[Boolean](result) | ||
| } | ||
|
|
||
| def values(): Option[Boolean] = Option(resultBroadcast).map(_.value) | ||
|
|
||
| private def prepareResult(): Unit = { | ||
| require(resultBroadcast != null, s"$this has not finished") | ||
| result = resultBroadcast.value | ||
| } | ||
|
|
||
| override def eval(input: InternalRow): Any = { | ||
| prepareResult() | ||
| result | ||
|
||
| } | ||
|
|
||
| override lazy val canonicalized: ExistsExec = { | ||
| copy( | ||
| child = child.canonicalized, | ||
| subQuery = subQuery, | ||
| plan = plan.canonicalized.asInstanceOf[BaseSubqueryExec], | ||
| exprId = ExprId(0), | ||
| resultBroadcast = null) | ||
| } | ||
|
|
||
| override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { | ||
|
||
| prepareResult() | ||
| ExistsSubquery(child, subQuery, result).doGenCode(ctx, ev) | ||
|
||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Plans subqueries that are present in the given [[SparkPlan]]. | ||
| */ | ||
|
|
@@ -194,6 +251,19 @@ case class PlanSubqueries(sparkSession: SparkSession) extends Rule[SparkPlan] { | |
| } | ||
| val executedPlan = new QueryExecution(sparkSession, query).executedPlan | ||
| InSubqueryExec(expr, SubqueryExec(s"subquery#${exprId.id}", executedPlan), exprId) | ||
| case expressions.Exists(sub, children, exprId) => | ||
cloud-fan marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
| val expr = if (children.length == 1) { | ||
| children.head | ||
| } else { | ||
| CreateNamedStruct( | ||
| children.zipWithIndex.flatMap { case (v, index) => | ||
| Seq(Literal(s"col_$index"), v) | ||
| } | ||
| ) | ||
| } | ||
| val executedPlan = new QueryExecution(sparkSession, sub).executedPlan | ||
| ExistsExec(expr, sub.treeString, | ||
|
||
| SubqueryExec(s"subquery#${exprId.id}", executedPlan), exprId) | ||
| } | ||
| } | ||
| } | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
can we change the beginning instead?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
or we change `hasInOrExistsSubquery` to `hasInOrCorrelatedExistsSubquery`
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done