[SPARK-37839][SQL] DS V2 supports partial aggregate push-down AVG (#35130)
Avg.java (new file, package `org.apache.spark.sql.connector.expressions.aggregate`):

```java
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.sql.connector.expressions.aggregate;

import org.apache.spark.annotation.Evolving;
import org.apache.spark.sql.connector.expressions.NamedReference;

/**
 * An aggregate function that returns the mean of all the values in a group.
 *
 * @since 3.3.0
 */
@Evolving
public final class Avg implements AggregateFunc {
  private final NamedReference column;
  private final boolean isDistinct;

  public Avg(NamedReference column, boolean isDistinct) {
    this.column = column;
    this.isDistinct = isDistinct;
  }

  public NamedReference column() { return column; }
  public boolean isDistinct() { return isDistinct; }

  @Override
  public String toString() {
    if (isDistinct) {
      return "AVG(DISTINCT " + column.describe() + ")";
    } else {
      return "AVG(" + column.describe() + ")";
    }
  }
}
```
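For context, a minimal sketch of how a connector-facing caller might construct and render this expression. `FieldReference` is assumed as the concrete `NamedReference` (it is the implementation the JDBC test suite below imports); the column name is hypothetical:

```scala
import org.apache.spark.sql.connector.expressions.FieldReference
import org.apache.spark.sql.connector.expressions.aggregate.Avg

// Build AVG(DISTINCT SALARY) as a connector-side aggregate expression.
val avgExpr = new Avg(FieldReference("SALARY"), true)

// A JDBC-style connector can splice the rendered form into generated SQL.
assert(avgExpr.isDistinct)
assert(avgExpr.toString == "AVG(DISTINCT SALARY)")
```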
V2ScanRelationPushDown.scala (package `org.apache.spark.sql.execution.datasources.v2`):

```diff
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.datasources.v2

 import scala.collection.mutable

-import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, Cast, Expression, IntegerLiteral, NamedExpression, PredicateHelper, ProjectionOverSchema, SubqueryExpression}
+import org.apache.spark.sql.catalyst.expressions.{Alias, And, Attribute, AttributeReference, Cast, Divide, Expression, IntegerLiteral, NamedExpression, PredicateHelper, ProjectionOverSchema, SubqueryExpression}
 import org.apache.spark.sql.catalyst.expressions.aggregate
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.planning.ScanOperation

@@ -88,25 +88,49 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
     sHolder.builder match {
       case r: SupportsPushDownAggregates =>
         val aggExprToOutputOrdinal = mutable.HashMap.empty[Expression, Int]
-        var ordinal = 0
-        val aggregates = resultExpressions.flatMap { expr =>
-          expr.collect {
-            // Do not push down duplicated aggregate expressions. For example,
-            // `SELECT max(a) + 1, max(a) + 2 FROM ...`, we should only push down one
-            // `max(a)` to the data source.
-            case agg: AggregateExpression
-                if !aggExprToOutputOrdinal.contains(agg.canonicalized) =>
-              aggExprToOutputOrdinal(agg.canonicalized) = ordinal
-              ordinal += 1
-              agg
-          }
-        }
+        val aggregates = collectAggregates(resultExpressions, aggExprToOutputOrdinal)
         val normalizedAggregates = DataSourceStrategy.normalizeExprs(
           aggregates, sHolder.relation.output).asInstanceOf[Seq[AggregateExpression]]
         val normalizedGroupingExpressions = DataSourceStrategy.normalizeExprs(
           groupingExpressions, sHolder.relation.output)
-        val pushedAggregates = PushDownUtils.pushAggregates(
-          r, normalizedAggregates, normalizedGroupingExpressions)
+        val translatedAggregates = DataSourceStrategy.translateAggregation(
+          normalizedAggregates, normalizedGroupingExpressions)
+        val (finalResultExpressions, finalAggregates, finalTranslatedAggregates) = {
+          if (translatedAggregates.isEmpty ||
+              r.supportCompletePushDown(translatedAggregates.get)) {
+            (resultExpressions, aggregates, translatedAggregates)
+          } else {
+            // The data source doesn't support the complete push-down of this aggregation.
+            // Here we translate `AVG` to `SUM / COUNT`, so that it's more likely to be
+            // pushed, completely or partially.
+            var findAverage = false
+            val newResultExpressions = resultExpressions.map { expr =>
+              expr.transform {
+                case AggregateExpression(avg: aggregate.Average, _, isDistinct, _, _) =>
+                  findAverage = true
+                  val left = addCastIfNeeded(aggregate.Sum(avg.child)
+                    .toAggregateExpression(isDistinct), avg.dataType)
+                  val right = addCastIfNeeded(aggregate.Count(avg.child)
+                    .toAggregateExpression(isDistinct), avg.dataType)
+                  Divide(left, right)
+              }
+            }.asInstanceOf[Seq[NamedExpression]]
+            if (findAverage) {
+              // Because aggregate expressions changed, translate them again.
+              aggExprToOutputOrdinal.clear()
+              val newAggregates =
+                collectAggregates(newResultExpressions, aggExprToOutputOrdinal)
+              val newNormalizedAggregates = DataSourceStrategy.normalizeExprs(
+                newAggregates, sHolder.relation.output).asInstanceOf[Seq[AggregateExpression]]
+              (newResultExpressions, newAggregates, DataSourceStrategy.translateAggregation(
+                newNormalizedAggregates, normalizedGroupingExpressions))
+            } else {
+              (resultExpressions, aggregates, translatedAggregates)
+            }
+          }
+        }
+
+        val pushedAggregates = finalTranslatedAggregates.filter(r.pushAggregation)
         if (pushedAggregates.isEmpty) {
           aggNode // return original plan node
         } else if (!supportPartialAggPushDown(pushedAggregates.get) &&
@@ -129,7 +153,7 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
           //      +- RelationV2[c2#10, min(c1)#21, max(c1)#22]
           // scalastyle:on
           val newOutput = scan.readSchema().toAttributes
-          assert(newOutput.length == groupingExpressions.length + aggregates.length)
+          assert(newOutput.length == groupingExpressions.length + finalAggregates.length)
           val groupAttrs = normalizedGroupingExpressions.zip(newOutput).map {
             case (a: Attribute, b: Attribute) => b.withExprId(a.exprId)
             case (_, b) => b

@@ -164,25 +188,25 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
             Project(projectExpressions, scanRelation)
           } else {
             val plan = Aggregate(
-              output.take(groupingExpressions.length), resultExpressions, scanRelation)
+              output.take(groupingExpressions.length), finalResultExpressions, scanRelation)

             // scalastyle:off
             // Change the optimized logical plan to reflect the pushed down aggregate
             // e.g. TABLE t (c1 INT, c2 INT, c3 INT)
-            //      SELECT min(c1), max(c1) FROM t GROUP BY c2;
+            //      SELECT min(c1), max(c1), avg(c1) FROM t GROUP BY c2;
             // The original logical plan is
-            //   Aggregate [c2#10],[min(c1#9) AS min(c1)#17, max(c1#9) AS max(c1)#18]
+            //   Aggregate [c2#10],[min(c1#9) AS min(c1)#17, max(c1#9) AS max(c1)#18, avg(c1#9) AS avg(c1)#19]
             //   +- RelationV2[c1#9, c2#10] ...
             //
-            // After change the V2ScanRelation output to [c2#10, min(c1)#21, max(c1)#22]
+            // After change the V2ScanRelation output to [c2#10, min(c1)#21, max(c1)#22, sum(c1)#23, count(c1)#24]
             // we have the following
-            //   !Aggregate [c2#10], [min(c1#9) AS min(c1)#17, max(c1#9) AS max(c1)#18]
-            //   +- RelationV2[c2#10, min(c1)#21, max(c1)#22] ...
+            //   !Aggregate [c2#10], [min(c1#9) AS min(c1)#17, max(c1#9) AS max(c1)#18, sum(c1#9)/count(c1#9) AS avg(c1)#19]
+            //   +- RelationV2[c2#10, min(c1)#21, max(c1)#22, sum(c1)#23, count(c1)#24] ...
             //
             // We want to change it to
             // == Optimized Logical Plan ==
-            //   Aggregate [c2#10], [min(min(c1)#21) AS min(c1)#17, max(max(c1)#22) AS max(c1)#18]
-            //   +- RelationV2[c2#10, min(c1)#21, max(c1)#22] ...
+            //   Aggregate [c2#10], [min(min(c1)#21) AS min(c1)#17, max(max(c1)#22) AS max(c1)#18, sum(sum(c1)#23)/sum(count(c1)#24) AS avg(c1)#19]
+            //   +- RelationV2[c2#10, min(c1)#21, max(c1)#22, sum(c1)#23, count(c1)#24] ...
             // scalastyle:on
             plan.transformExpressions {
               case agg: AggregateExpression =>

@@ -210,16 +234,33 @@ object V2ScanRelationPushDown extends Rule[LogicalPlan] with PredicateHelper {
     }
   }

+  private def collectAggregates(
+      resultExpressions: Seq[NamedExpression],
+      aggExprToOutputOrdinal: mutable.HashMap[Expression, Int]): Seq[AggregateExpression] = {
+    var ordinal = 0
+    resultExpressions.flatMap { expr =>
+      expr.collect {
+        // Do not push down duplicated aggregate expressions. For example,
+        // `SELECT max(a) + 1, max(a) + 2 FROM ...`, we should only push down one
+        // `max(a)` to the data source.
+        case agg: AggregateExpression
+            if !aggExprToOutputOrdinal.contains(agg.canonicalized) =>
+          aggExprToOutputOrdinal(agg.canonicalized) = ordinal
+          ordinal += 1
+          agg
+      }
+    }
+  }
+
   private def supportPartialAggPushDown(agg: Aggregation): Boolean = {
     // We don't know the agg buffer of `GeneralAggregateFunc`, so can't do partial agg push down.
     agg.aggregateExpressions().forall(!_.isInstanceOf[GeneralAggregateFunc])
   }

-  private def addCastIfNeeded(aggAttribute: AttributeReference, aggDataType: DataType) =
-    if (aggAttribute.dataType == aggDataType) {
-      aggAttribute
+  private def addCastIfNeeded(expression: Expression, aggDataType: DataType) =
```

A reviewer left a suggested change on the new `addCastIfNeeded` signature, renaming its second parameter:

```diff
-  private def addCastIfNeeded(expression: Expression, aggDataType: DataType) =
+  private def addCastIfNeeded(expression: Expression, expectedDataType: DataType) =
```
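Why the `AVG` to `SUM / COUNT` rewrite enables partial push-down: AVG is not decomposable across partitions, since an average of per-partition averages is generally wrong, whereas per-partition sums and counts recombine exactly. That is what the final plan's `sum(sum(c1))/sum(count(c1))` computes. A minimal standalone sketch in plain Scala; the partition split is hypothetical, and the values mirror the test data below:

```scala
// Five salaries split across two partitions (hypothetical split).
val p1 = Seq(9000.0, 12000.0)
val p2 = Seq(10000.0, 10000.0, 12000.0)

// Wrong: averaging the per-partition averages.
val avgOfAvgs = (p1.sum / p1.size + p2.sum / p2.size) / 2   // ~10583.33

// Right: re-aggregating pushed-down partial SUMs and COUNTs.
val partialSums = Seq(p1.sum, p2.sum)       // Seq(21000.0, 32000.0)
val partialCounts = Seq(p1.size, p2.size)   // Seq(2, 3)
val trueAvg = partialSums.sum / partialCounts.sum   // 10600.0, the AVG over all rows

assert(avgOfAvgs != trueAvg)
```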
JDBCV2Suite.scala:

```diff
@@ -27,7 +27,7 @@ import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, Sort}
 import org.apache.spark.sql.connector.expressions.{FieldReference, NullOrdering, SortDirection, SortValue}
 import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanRelation, V1ScanWrapper}
 import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog
-import org.apache.spark.sql.functions.{lit, sum, udf}
+import org.apache.spark.sql.functions.{avg, count, lit, sum, udf}
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.util.Utils

@@ -874,4 +874,32 @@ class JDBCV2Suite extends QueryTest with SharedSparkSession with ExplainSuiteHelper {
     checkAnswer(df, Seq(Row(2)))
     // scalastyle:on
   }
+
+  test("scan with aggregate push-down: partial push-down SUM, AVG, COUNT") {
+    val df = spark.read
+      .option("partitionColumn", "dept")
+      .option("lowerBound", "0")
+      .option("upperBound", "2")
+      .option("numPartitions", "2")
+      .table("h2.test.employee")
+      .agg(sum($"SALARY").as("sum"), avg($"SALARY").as("avg"), count($"SALARY").as("count"))
+    checkAggregateRemoved(df, false)
+    checkAnswer(df, Seq(Row(53000.00, 10600.000000, 5)))
+
+    val df2 = spark.read
+      .option("partitionColumn", "dept")
+      .option("lowerBound", "0")
+      .option("upperBound", "2")
+      .option("numPartitions", "2")
+      .table("h2.test.employee")
+      .groupBy($"name")
+      .agg(sum($"SALARY").as("sum"), avg($"SALARY").as("avg"), count($"SALARY").as("count"))
+    checkAggregateRemoved(df2, false)
+    checkAnswer(df2, Seq(
+      Row("alex", 12000.00, 12000.000000, 1),
+      Row("amy", 10000.00, 10000.000000, 1),
+      Row("cathy", 9000.00, 9000.000000, 1),
+      Row("david", 10000.00, 10000.000000, 1),
+      Row("jen", 12000.00, 12000.000000, 1)))
+  }
 }
```

On the first `checkAggregateRemoved(df, false)`, a contributor commented: "This only verifies the aggregates are not completely pushed down. We still need to check if the partial push down works, right?"
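One way to address that comment, sketched under the assumption that the JDBC scan reports its pushed-down aggregates in the plan description and that `checkKeywordsExistsInExplain` from the suite's `ExplainSuiteHelper` mixin applies here; the exact `PushedAggregates` rendering is an assumption, not taken from this diff:

```scala
// Hypothetical follow-up assertion inside the test: besides checking that a
// final Aggregate remains (i.e. the push-down is only partial), confirm the
// scan itself carries the rewritten SUM/COUNT aggregates in its explain output.
checkKeywordsExistsInExplain(df, "PushedAggregates: [SUM(SALARY), COUNT(SALARY)]")
```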