Skip to content

Commit b72b852

Browse files
gengliangwangcloud-fan
authored andcommitted
[SPARK-21222] Move elimination of Distinct clause from analyzer to optimizer
## What changes were proposed in this pull request? Move elimination of Distinct clause from analyzer to optimizer Distinct clause is useless after MAX/MIN clause. For example, "Select MAX(distinct a) FROM src from" is equivalent of "Select MAX(a) FROM src from" However, this optimization is implemented in analyzer. It should be in optimizer. ## How was this patch tested? Unit test gatorsmile cloud-fan Please review http://spark.apache.org/contributing.html before opening a pull request. Author: Wang Gengliang <ltnwgl@gmail.com> Closes #18429 from gengliangwang/distinct_opt.
1 parent e68aed7 commit b72b852

4 files changed

Lines changed: 73 additions & 5 deletions

File tree

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/Analyzer.scala

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1197,11 +1197,6 @@ class Analyzer(
11971197
case u @ UnresolvedFunction(funcId, children, isDistinct) =>
11981198
withPosition(u) {
11991199
catalog.lookupFunction(funcId, children) match {
1200-
// DISTINCT is not meaningful for a Max or a Min.
1201-
case max: Max if isDistinct =>
1202-
AggregateExpression(max, Complete, isDistinct = false)
1203-
case min: Min if isDistinct =>
1204-
AggregateExpression(min, Complete, isDistinct = false)
12051200
// AggregateWindowFunctions are AggregateFunctions that can only be evaluated within
12061201
// the context of a Window clause. They do not need to be wrapped in an
12071202
// AggregateExpression.

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/dsl/package.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,7 +159,9 @@ package object dsl {
159159
def first(e: Expression): Expression = new First(e).toAggregateExpression()
160160
def last(e: Expression): Expression = new Last(e).toAggregateExpression()
161161
def min(e: Expression): Expression = Min(e).toAggregateExpression()
162+
def minDistinct(e: Expression): Expression = Min(e).toAggregateExpression(isDistinct = true)
162163
def max(e: Expression): Expression = Max(e).toAggregateExpression()
164+
def maxDistinct(e: Expression): Expression = Max(e).toAggregateExpression(isDistinct = true)
163165
def upper(e: Expression): Expression = Upper(e)
164166
def lower(e: Expression): Expression = Lower(e)
165167
def sqrt(e: Expression): Expression = Sqrt(e)

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf)
4040
protected val fixedPoint = FixedPoint(conf.optimizerMaxIterations)
4141

4242
def batches: Seq[Batch] = {
43+
Batch("Eliminate Distinct", Once, EliminateDistinct) ::
4344
// Technically some of the rules in Finish Analysis are not optimizer rules and belong more
4445
// in the analyzer, because they are needed for correctness (e.g. ComputeCurrentTime).
4546
// However, because we also use the analyzer to canonicalized queries (for view definition),
@@ -151,6 +152,20 @@ abstract class Optimizer(sessionCatalog: SessionCatalog, conf: SQLConf)
151152
def extendedOperatorOptimizationRules: Seq[Rule[LogicalPlan]] = Nil
152153
}
153154

155+
/**
156+
* Remove useless DISTINCT for MAX and MIN.
157+
* This rule should be applied before RewriteDistinctAggregates.
158+
*/
159+
object EliminateDistinct extends Rule[LogicalPlan] {
160+
override def apply(plan: LogicalPlan): LogicalPlan = plan transformExpressions {
161+
case ae: AggregateExpression if ae.isDistinct =>
162+
ae.aggregateFunction match {
163+
case _: Max | _: Min => ae.copy(isDistinct = false)
164+
case _ => ae
165+
}
166+
}
167+
}
168+
154169
/**
155170
* An optimizer used in test code.
156171
*
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.spark.sql.catalyst.optimizer
18+
19+
import org.apache.spark.sql.catalyst.dsl.expressions._
20+
import org.apache.spark.sql.catalyst.dsl.plans._
21+
import org.apache.spark.sql.catalyst.plans.PlanTest
22+
import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Expand, LocalRelation, LogicalPlan}
23+
import org.apache.spark.sql.catalyst.rules.RuleExecutor
24+
25+
class EliminateDistinctSuite extends PlanTest {
26+
27+
object Optimize extends RuleExecutor[LogicalPlan] {
28+
val batches =
29+
Batch("Operator Optimizations", Once,
30+
EliminateDistinct) :: Nil
31+
}
32+
33+
val testRelation = LocalRelation('a.int)
34+
35+
test("Eliminate Distinct in Max") {
36+
val query = testRelation
37+
.select(maxDistinct('a).as('result))
38+
.analyze
39+
val answer = testRelation
40+
.select(max('a).as('result))
41+
.analyze
42+
assert(query != answer)
43+
comparePlans(Optimize.execute(query), answer)
44+
}
45+
46+
test("Eliminate Distinct in Min") {
47+
val query = testRelation
48+
.select(minDistinct('a).as('result))
49+
.analyze
50+
val answer = testRelation
51+
.select(min('a).as('result))
52+
.analyze
53+
assert(query != answer)
54+
comparePlans(Optimize.execute(query), answer)
55+
}
56+
}

0 commit comments

Comments
 (0)