-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-12032] [SQL] Re-order inner joins to do join with conditions first #10073
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 3 commits
18af774
1a32fea
3a62c32
babe395
70508f0
21a81b5
5469caf
556a382
eb31c37
ddffd8f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,14 +18,12 @@ | |
| package org.apache.spark.sql.catalyst.optimizer | ||
|
|
||
| import scala.collection.immutable.HashSet | ||
|
|
||
| import org.apache.spark.sql.catalyst.analysis.{CleanupAliases, EliminateSubQueries} | ||
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.expressions.aggregate._ | ||
| import org.apache.spark.sql.catalyst.plans.Inner | ||
| import org.apache.spark.sql.catalyst.plans.FullOuter | ||
| import org.apache.spark.sql.catalyst.plans.LeftOuter | ||
| import org.apache.spark.sql.catalyst.plans.RightOuter | ||
| import org.apache.spark.sql.catalyst.plans.LeftSemi | ||
| import org.apache.spark.sql.catalyst.planning.FilterAndInnerJoins | ||
| import org.apache.spark.sql.catalyst.plans.{FullOuter, Inner, LeftOuter, LeftSemi, RightOuter} | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.rules._ | ||
| import org.apache.spark.sql.types._ | ||
|
|
@@ -44,6 +42,7 @@ object DefaultOptimizer extends Optimizer { | |
| // Operator push down | ||
| SetOperationPushDown, | ||
| SamplePushDown, | ||
| ReorderJoin, | ||
| PushPredicateThroughJoin, | ||
| PushPredicateThroughProject, | ||
| PushPredicateThroughGenerate, | ||
|
|
@@ -711,6 +710,49 @@ object PushPredicateThroughAggregate extends Rule[LogicalPlan] with PredicateHel | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Reorder the joins so that the bottom ones have at least one condition. | ||
| */ | ||
| object ReorderJoin extends Rule[LogicalPlan] with PredicateHelper { | ||
|
|
||
| /** | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Remove this comment if it is the same as the object comment, or augment it with more detail. Can you document what the input arguments are? What is `input`? The least common ancestor of the joins? Similarly for `conditions`.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Updated |
||
| * Reorder the joins so that the bottom ones have at least one condition. | ||
| */ | ||
| def reorder( | ||
| input: LogicalPlan, | ||
| joins: Seq[LogicalPlan], | ||
| conditions: Seq[Expression]): LogicalPlan = { | ||
| // filter out the conditions that could be pushed down to `input` | ||
| val otherConditions = conditions.filterNot { cond => | ||
| cond.references.subsetOf(input.outputSet) | ||
| } | ||
| if (joins.isEmpty) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would have the pattern return |
||
| input | ||
| } else if (otherConditions.isEmpty) { | ||
| // no condition for these joins, so put them in original order | ||
| (Seq(input) ++ joins).reduceLeft(Join(_, _, Inner, None)) | ||
| } else { | ||
| // find the first join that has at least one condition | ||
| val conditionalJoin = joins.find { plan => | ||
| val refs = input.outputSet ++ plan.outputSet | ||
| otherConditions.exists(cond => cond.references.subsetOf(refs)) | ||
| } | ||
| assert(conditionalJoin.isDefined) | ||
| val picked = conditionalJoin.get | ||
| val joined = Join(input, picked, Inner, None) | ||
| reorder(joined, joins.filterNot(_ eq picked), otherConditions) | ||
| } | ||
| } | ||
|
|
||
| def apply(plan: LogicalPlan): LogicalPlan = plan transform { | ||
| // TODO: support outer join | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i would consider omitting this |
||
| case FilterAndInnerJoins(input, joins, filterConditions) if joins.size > 1 => | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ExtractFiltersAndInnerJoins? |
||
| assert(filterConditions.nonEmpty) | ||
| val joined = reorder(input, joins, filterConditions) | ||
| Filter(filterConditions.reduceLeft(And), joined) | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Pushes down [[Filter]] operators where the `condition` can be | ||
| * evaluated using only the attributes of the left or right side of a join. Other | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -21,7 +21,6 @@ import org.apache.spark.Logging | |
| import org.apache.spark.sql.catalyst.expressions._ | ||
| import org.apache.spark.sql.catalyst.plans._ | ||
| import org.apache.spark.sql.catalyst.plans.logical._ | ||
| import org.apache.spark.sql.catalyst.trees.TreeNodeRef | ||
|
|
||
| /** | ||
| * A pattern that matches any number of project or filter operations on top of another relational | ||
|
|
@@ -132,6 +131,38 @@ object ExtractEquiJoinKeys extends Logging with PredicateHelper { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * A pattern that collects the filter and inner joins. | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it much more work to extract all the filters? For example, if there is a filter after the inner join of input and plan 1. We'd ideally use this for predicate propagation as well. For example: select * from t1 join t2 on t1.key = t2.key and t1.key = 5. If we collected all the filters, this could be used to infer t2.key = 5 and push that down to t2.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||
| * | ||
| * Filter | ||
| * | | ||
| * inner Join | ||
| * / \ ----> (input, Seq(plan1, plan2), filters) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Update comment to match the return order of the function (input, joins, filters) |
||
| * inner join plan2 | ||
| * / \ | ||
| * input plan1 | ||
| */ | ||
| object FilterAndInnerJoins extends PredicateHelper { | ||
| def unapply(plan: LogicalPlan): Option[(LogicalPlan, Seq[LogicalPlan], Seq[Expression])] = | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I know I suggested this interface, but now I'm questioning why we are differentiating the |
||
| plan match { | ||
| case f @ Filter(filterCondition, j @ Join(left, right, Inner, None)) => | ||
|
|
||
| // flatten all inner joins that are next to each other and have no condition | ||
| def flattenJoin(plan: LogicalPlan): (LogicalPlan, Seq[LogicalPlan]) = plan match { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| case Join(left, right, Inner, None) => | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why can't these have conditions? Seems this would just go into the returned filters
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1 |
||
| val (input, joins) = flattenJoin(left) | ||
| (input, joins ++ Seq(right)) | ||
| case _ => (plan, Seq()) | ||
| } | ||
|
|
||
| val allConditions = splitConjunctivePredicates(filterCondition) | ||
| val (input, joins) = flattenJoin(j) | ||
| Some((input, joins, allConditions)) | ||
|
|
||
| case _ => None | ||
| } | ||
| } | ||
|
|
||
| /** | ||
| * A pattern that collects all adjacent unions and returns their children as a Seq. | ||
| */ | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,6 +38,7 @@ class FilterPushdownSuite extends PlanTest { | |
| CombineFilters, | ||
| PushPredicateThroughProject, | ||
| BooleanSimplification, | ||
| ReorderJoin, | ||
| PushPredicateThroughJoin, | ||
| PushPredicateThroughGenerate, | ||
| PushPredicateThroughAggregate, | ||
|
|
@@ -548,6 +549,25 @@ class FilterPushdownSuite extends PlanTest { | |
| comparePlans(optimized, analysis.EliminateSubQueries(correctAnswer)) | ||
| } | ||
|
|
||
| test("joins: reorder inner joins") { | ||
|
||
| val x = testRelation.subquery('x) | ||
| val y = testRelation1.subquery('y) | ||
| val z = testRelation.subquery('z) | ||
|
|
||
| val originalQuery = { | ||
| x.join(y).join(z) | ||
| .where(("x.b".attr === "z.b".attr) && ("y.d".attr === "z.a".attr)) | ||
| } | ||
|
|
||
| val optimized = Optimize.execute(originalQuery.analyze) | ||
| val correctAnswer = | ||
| x.join(z, condition = Some("x.b".attr === "z.b".attr)) | ||
| .join(y, condition = Some("y.d".attr === "z.a".attr)) | ||
| .analyze | ||
|
|
||
| comparePlans(optimized, analysis.EliminateSubQueries(correctAnswer)) | ||
| } | ||
|
|
||
| val testRelationWithArrayType = LocalRelation('a.int, 'b.int, 'c_arr.array(IntegerType)) | ||
|
|
||
| test("generate: predicate referenced no generated column") { | ||
|
|
@@ -750,4 +770,5 @@ class FilterPushdownSuite extends PlanTest { | |
|
|
||
| comparePlans(optimized, correctAnswer) | ||
| } | ||
|
|
||
|
||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add to this comment what makes this rule stable? It's not obvious from reading the code.