apache · wangyum · Jul 17, 2021 · Jul 19, 2021 · Jul 19, 2021 · Jul 23, 2021
diff --git a/...st/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala b/...st/src/main/scala/org/apache/spark/sql/catalyst/optimizer/RemoveRedundantAggregates.scala
@@ -20,7 +20,7 @@ package org.apache.spark.sql.catalyst.optimizer
 import org.apache.spark.sql.catalyst.analysis.PullOutNondeterministic
 import org.apache.spark.sql.catalyst.expressions.{AliasHelper, AttributeSet}
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
-import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan}
+import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, LogicalPlan, Project}
 import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.catalyst.trees.TreePattern.AGGREGATE
 
@@ -47,6 +47,10 @@ object RemoveRedundantAggregates extends Rule[LogicalPlan] with AliasHelper {
       } else {
         newAggregate
       }
+
+     case agg @ Aggregate(groupingExps, _, child) if agg.groupOnly && child.deterministic &&
 // Two stacked aggregates both group by 'a; only the upper one emits `rand(0) as 'c`.
 // The optimized plan is expected to match a single aggregate over `relation`.
 test("Remove redundant aggregate with non-deterministic upper") {
   val lowerAgg = relation.groupBy('a)('a)
   val stacked = lowerAgg.groupBy('a)('a, rand(0) as 'c).analyze
   val singleAgg = relation.groupBy('a)('a, rand(0) as 'c).analyze
   comparePlans(Optimize.execute(stacked), singleAgg)
 }
 // The lower aggregate outputs `('a + rand(0)) as 'c` and the upper aggregate groups by
 // that 'c, so the optimized plan is expected to match the analyzed input plan.
 test("Keep non-redundant aggregate - upper references non-deterministic non-grouping") {
   val lowerAgg = relation.groupBy('a)('a, ('a + rand(0)) as 'c)
   val stacked = lowerAgg.groupBy('a, 'c)('a, 'c).analyze
   comparePlans(Optimize.execute(stacked), stacked)
 }
 // NOTE(review): a test with this exact name and body also appears earlier in this chunk.
 // This copy looks like an extraction artifact; test frameworks commonly reject duplicate
 // test names within one suite - confirm before keeping both.
 //
 // Stacks two aggregates that both group by 'a, where only the upper one emits
 // `rand(0) as 'c`, and checks the optimized plan against a single aggregate over `relation`.
 test("Remove redundant aggregate with non-deterministic upper") { 
   // Input plan: lower aggregate keeps only 'a, upper aggregate adds the rand(0) column.
   val query = relation 
     .groupBy('a)('a) 
     .groupBy('a)('a, rand(0) as 'c) 
     .analyze 
   // Expected plan: the same upper aggregate applied directly to `relation`.
   val expected = relation 
     .groupBy('a)('a, rand(0) as 'c) 
     .analyze 
   val optimized = Optimize.execute(query) 
   comparePlans(optimized, expected) 
 } 
 // NOTE(review): a test with this exact name and body also appears earlier in this chunk.
 // This copy looks like an extraction artifact; test frameworks commonly reject duplicate
 // test names within one suite - confirm before keeping both.
 //
 // The lower aggregate emits `('a + rand(0)) as 'c` and the upper aggregate groups by that
 // 'c; the optimized plan is compared against the unchanged input plan.
 test("Keep non-redundant aggregate - upper references non-deterministic non-grouping") { 
   val query = relation 
     .groupBy('a)('a, ('a + rand(0)) as 'c) 
     .groupBy('a, 'c)('a, 'c) 
     .analyze 
   val optimized = Optimize.execute(query) 
   // Comparing against `query` itself asserts that neither aggregate was removed.
   comparePlans(optimized, query) 
 } 
+         child.distinctKeys.exists(_.subsetOf(AttributeSet(groupingExps))) =>
+      Project(agg.aggregateExpressions, child)
   }
 
   private def isLowerRedundant(upper: Aggregate, lower: Aggregate): Boolean = {

diff --git a/...alyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala b/...alyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/DistinctKeyVisitor.scala
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.{Alias, AttributeSet, ExpressionSet, NamedExpression}
+import org.apache.spark.sql.catalyst.plans.LeftExistence
+
+/**
+ * A [[LogicalPlanVisitor]] that traverses a [[LogicalPlan]] tree and propagates its distinct
+ * keys.
+ *
+ * Every element of the returned set is one group of output attributes that the visited
+ * operator reports as a distinct key. An empty result means no distinct key is known for
+ * that operator.
+ */
+object DistinctKeyVisitor extends LogicalPlanVisitor[Set[AttributeSet]] {
+
+  /**
+   * Re-expresses `keys` (distinct keys stated over input expressions) in terms of the
+   * attributes produced by `projectList`.
+   *
+   * @param keys distinct keys of the input, each given as a set of expressions
+   * @param projectList the named expressions emitted by the operator
+   * @return every attribute set whose underlying expressions (alias children, or the
+   *         expression itself when not aliased) are exactly one of `keys`; empty when
+   *         `keys` is empty
+   */
+  private def projectDistinctKeys(
+      keys: Set[ExpressionSet], projectList: Seq[NamedExpression]): Set[AttributeSet] = {
+    if (keys.isEmpty) {
+      // Nothing to propagate. This guard also keeps `.min` below from throwing on an
+      // empty collection, so callers need no emptiness pre-check of their own.
+      Set.empty[AttributeSet]
+    } else {
+      val expressions = keys.flatMap(_.toSet)
+      // Keep only the outputs that expose (possibly through an alias) some key expression.
+      projectList.filter {
+        case a: Alias => expressions.exists(_.semanticEquals(a.child))
+        case ne => expressions.exists(_.semanticEquals(ne))
+      // Only subsets of the smallest key size are enumerated, so a key in `keys` that is
+      // larger than that size is never reproduced by this method.
+      }.toSet.subsets(keys.map(_.size).min).filter { s =>
+        val references = s.map {
+          case a: Alias => a.child
+          case ne => ne
+        }
+        // A candidate qualifies only if its underlying expressions are exactly one key.
+        keys.exists(_.equals(ExpressionSet(references)))
+      }.map(s => AttributeSet(s.map(_.toAttribute))).toSet
+    }
+  }
+
+  /** Operators without a dedicated rule report no distinct keys. */
+  override def default(p: LogicalPlan): Set[AttributeSet] = Set.empty[AttributeSet]
+
+  /** The grouping expressions form the key, mapped onto the aggregate's output attributes. */
+  override def visitAggregate(p: Aggregate): Set[AttributeSet] = {
+    // ExpressionSet collapses repeated grouping expressions, e.g. GROUP BY a, a.
+    val groupingExps = ExpressionSet(p.groupingExpressions)
+    projectDistinctKeys(Set(groupingExps), p.aggregateExpressions)
+  }
+
+  /** The whole output of a [[Distinct]] is reported as a single key. */
+  override def visitDistinct(p: Distinct): Set[AttributeSet] = {
+    Set(p.outputSet)
+  }
+
+  /** A deterministic, non-ALL [[Except]] reports its whole output as a single key. */
+  override def visitExcept(p: Except): Set[AttributeSet] =
+    if (!p.isAll && p.deterministic) Set(p.outputSet) else default(p)
+
+  override def visitExpand(p: Expand): Set[AttributeSet] = default(p)
+
+  /** Forwards the child's keys unchanged. */
+  override def visitFilter(p: Filter): Set[AttributeSet] = p.child.distinctKeys
+
+  override def visitGenerate(p: Generate): Set[AttributeSet] = default(p)
+
+  /** Forwards the child's keys unchanged. */
+  override def visitGlobalLimit(p: GlobalLimit): Set[AttributeSet] = p.child.distinctKeys
+
+  /** A deterministic, non-ALL [[Intersect]] reports its whole output as a single key. */
+  override def visitIntersect(p: Intersect): Set[AttributeSet] = {
+    if (!p.isAll && p.deterministic) Set(p.outputSet) else default(p)
+  }
+
+  /**
+   * Join types matched by `LeftExistence` forward the left child's keys; every other join
+   * type reports none.
+   */
+  override def visitJoin(p: Join): Set[AttributeSet] = {
+    p.joinType match {
+      case LeftExistence(_) => p.left.distinctKeys
+      case _ => default(p)
+    }
+  }
+
+  /** Forwards the child's keys unchanged. */
+  override def visitLocalLimit(p: LocalLimit): Set[AttributeSet] = p.child.distinctKeys
+
+  override def visitPivot(p: Pivot): Set[AttributeSet] = default(p)
+
+  /** Maps the child's keys onto the project list; empty when the child reports no keys. */
+  override def visitProject(p: Project): Set[AttributeSet] =
+    projectDistinctKeys(p.child.distinctKeys.map(ExpressionSet(_)), p.projectList)
+
+  /** Forwards the child's keys unchanged. */
+  override def visitRepartition(p: Repartition): Set[AttributeSet] = p.child.distinctKeys
+
+  /** Forwards the child's keys unchanged. */
+  override def visitRepartitionByExpr(p: RepartitionByExpression): Set[AttributeSet] =
+    p.child.distinctKeys
+
+  override def visitSample(p: Sample): Set[AttributeSet] = default(p)
+
+  override def visitScriptTransform(p: ScriptTransformation): Set[AttributeSet] = default(p)
+
+  override def visitUnion(p: Union): Set[AttributeSet] = default(p)
+
+  /** Forwards the child's keys unchanged. */
+  override def visitWindow(p: Window): Set[AttributeSet] = p.child.distinctKeys
+
+  /** Forwards the child's keys unchanged. */
+  override def visitTail(p: Tail): Set[AttributeSet] = p.child.distinctKeys
+
+  /** Forwards the child's keys unchanged. */
+  override def visitSort(p: Sort): Set[AttributeSet] = p.child.distinctKeys
+
+  /** Forwards the child's keys unchanged. */
+  override def visitRebalancePartitions(p: RebalancePartitions): Set[AttributeSet] =
+    p.child.distinctKeys
+
+  override def visitWithCTE(p: WithCTE): Set[AttributeSet] = default(p)
+}
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlan.scala
@@ -31,6 +31,7 @@ abstract class LogicalPlan
   extends QueryPlan[LogicalPlan]
   with AnalysisHelper
   with LogicalPlanStats
+  with LogicalPlanDistinctKeys
   with QueryPlanConstraints
   with Logging {
 

diff --git a/.../src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala b/.../src/main/scala/org/apache/spark/sql/catalyst/plans/logical/LogicalPlanDistinctKeys.scala
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.plans.logical
+
+import org.apache.spark.sql.catalyst.expressions.AttributeSet
+
+/**
+ * Mixin that exposes the distinct keys of a [[LogicalPlan]], as computed by
+ * [[DistinctKeyVisitor]]. For example:
+ * {{{
+ *   SELECT a, b, SUM(c) FROM Tab1 GROUP BY a, b
+ *   // returns a, b
+ * }}}
+ */
+trait LogicalPlanDistinctKeys { this: LogicalPlan =>
+  // Computed on first access by visiting this plan node.
+  lazy val distinctKeys: Set[AttributeSet] = {
+    DistinctKeyVisitor.visit(this)
+  }
+}