[SPARK-31973][SQL] Skip partial aggregates if grouping keys have high cardinality #28804
@@ -2196,6 +2196,13 @@ object SQLConf {
    .checkValue(bit => bit >= 10 && bit <= 30, "The bit value must be in [10, 30].")
    .createWithDefault(16)

  val SKIP_PARTIAL_AGGREGATE_ENABLED =
    buildConf("spark.sql.aggregate.partialaggregate.skip.enabled")
      .internal()
      .doc("Avoid sort/spill to disk during partial aggregation")
      .booleanConf
      .createWithDefault(true)
Member
Could we use a threshold + column stats instead of this boolean config?

Contributor (Author)
I didn't get the threshold part. Can you please elaborate?

Member
I meant the ratio of the distinct row count to the total row count in the group-by key's column stats. For example, if the number of distinct key values is close to the total row count, partial aggregation is unlikely to reduce the data.

Contributor (Author)
Thanks @maropu for explaining, I will make this change.

@maropu This is a very useful suggestion. One issue is that column stats are rarely computed. We came across this work in Hive: https://issues.apache.org/jira/browse/HIVE-291. Hive turns off map-side aggregation (i.e., the partial aggregate becomes a pass-through) in the physical Group-By operator if map-side aggregation does not reduce the entries by at least half, and it looks at the first 100000 rows to decide (ref: patch https://issues.apache.org/jira/secure/attachment/12400257/291.1.txt). Should we do something similar in HashAggregateExec here? Any thoughts on this?

Member
I think whether that approach improves performance depends on IO performance, but the idea looks interesting to me. WDYT? @cloud-fan
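For what it's worth, the two ideas discussed above (a planning-time ratio derived from column stats, and Hive's runtime check after a fixed number of input rows) could look roughly like the sketch below. This is illustration only; the helper names and thresholds (`distinctRatioThreshold`, `checkInterval`, `minReduction`) are hypothetical and not part of this PR.

```scala
// Illustrative sketch only -- not code from this PR. All thresholds and helper
// names (distinctRatioThreshold, checkInterval, minReduction) are hypothetical.

// (1) Planning-time heuristic from column stats: skip partial aggregation when the
// group-by keys are estimated to be nearly unique.
def skipByColumnStats(
    distinctCount: Option[BigInt],
    rowCount: Option[BigInt],
    distinctRatioThreshold: Double = 0.8): Boolean = {
  (distinctCount, rowCount) match {
    case (Some(ndv), Some(rows)) if rows > 0 =>
      ndv.toDouble / rows.toDouble > distinctRatioThreshold
    case _ => false // stats unavailable: keep partial aggregation
  }
}

// (2) Hive-style runtime heuristic (HIVE-291): after observing `checkInterval` input
// rows, turn the partial aggregate into a pass-through if the hash map did not
// shrink the data by at least `minReduction`.
def shouldFallBackToPassThrough(
    rowsSeen: Long,
    hashMapEntries: Long,
    checkInterval: Long = 100000L,
    minReduction: Double = 0.5): Boolean = {
  rowsSeen >= checkInterval && hashMapEntries.toDouble / rowsSeen.toDouble > minReduction
}
```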
  val AVRO_COMPRESSION_CODEC = buildConf("spark.sql.avro.compression.codec")
    .doc("Compression codec used in writing of AVRO files. Supported codecs: " +
      "uncompressed, deflate, snappy, bzip2 and xz. Default codec is snappy.")

@@ -2922,6 +2929,8 @@ class SQLConf extends Serializable with Logging {
  def fastHashAggregateRowMaxCapacityBit: Int = getConf(FAST_HASH_AGGREGATE_MAX_ROWS_CAPACITY_BIT)

  def skipPartialAggregate: Boolean = getConf(SKIP_PARTIAL_AGGREGATE_ENABLED)

  def datetimeJava8ApiEnabled: Boolean = getConf(DATETIME_JAVA8API_ENABLED)

  def uiExplainMode: String = getConf(UI_EXPLAIN_MODE)
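As a usage note (not part of the diff): since the accessor above reads a regular SQL conf, the flag could be toggled per session. A minimal sketch, assuming a running `SparkSession` named `spark`:

```scala
// Minimal sketch (assumes a SparkSession `spark`); not code from this PR.
spark.conf.set("spark.sql.aggregate.partialaggregate.skip.enabled", "true")

// Equivalent SQL form:
spark.sql("SET spark.sql.aggregate.partialaggregate.skip.enabled=true")
```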
@@ -353,4 +353,8 @@ object AggUtils {
    finalAndCompleteAggregate :: Nil
  }

  def areAggExpressionsPartial(modes: Seq[AggregateMode]): Boolean = {
    modes.nonEmpty && modes.forall(_ == Partial)
  }
}
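For intuition, the helper above is expected to behave as in the sketch below (illustration only, not a test from this PR): it returns true only when every aggregate expression is still in Partial mode, the only phase where skipping partial aggregation is safe.

```scala
import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, Partial, PartialMerge}
import org.apache.spark.sql.execution.aggregate.AggUtils

// Illustration of the expected contract; not a test from this PR.
AggUtils.areAggExpressionsPartial(Seq(Partial, Partial))      // true
AggUtils.areAggExpressionsPartial(Seq(Partial, PartialMerge)) // false: merge side must still aggregate
AggUtils.areAggExpressionsPartial(Seq(Final))                 // false: final aggregation must not be skipped
AggUtils.areAggExpressionsPartial(Seq.empty)                  // false: nothing to decide
```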
@@ -63,6 +63,8 @@ case class HashAggregateExec(
  require(HashAggregateExec.supportsAggregate(aggregateBufferAttributes))

  override def needStopCheck: Boolean = skipPartialAggregate

  override lazy val allAttributes: AttributeSeq =
    child.output ++ aggregateBufferAttributes ++ aggregateAttributes ++
      aggregateExpressions.flatMap(_.aggregateFunction.inputAggBufferAttributes)

@@ -72,6 +74,8 @@ case class HashAggregateExec(
    "peakMemory" -> SQLMetrics.createSizeMetric(sparkContext, "peak memory"),
    "spillSize" -> SQLMetrics.createSizeMetric(sparkContext, "spill size"),
    "aggTime" -> SQLMetrics.createTimingMetric(sparkContext, "time in aggregation build"),
    "partialAggSkipped" -> SQLMetrics.createMetric(sparkContext,
      "number of skipped records for partial aggregates"),
    "avgHashProbe" ->
      SQLMetrics.createAverageMetric(sparkContext, "avg hash probe bucket list iters"))

@@ -409,6 +413,11 @@ case class HashAggregateExec(
  private var fastHashMapTerm: String = _
  private var isFastHashMapEnabled: Boolean = false

  private var avoidSpillInPartialAggregateTerm: String = _
  private val skipPartialAggregate = sqlContext.conf.skipPartialAggregate &&
    AggUtils.areAggExpressionsPartial(modes) && find(_.isInstanceOf[ExpandExec]).isEmpty

  private var outputFunc: String = _

  // whether a vectorized hashmap is used instead
  // we have decided to always use the row-based hashmap,
  // but the vectorized hashmap can still be switched on for testing and benchmarking purposes.

@@ -628,6 +637,8 @@ case class HashAggregateExec(
        |${consume(ctx, resultVars)}
       """.stripMargin
    }

    ctx.addNewFunction(funcName,
      s"""
         |private void $funcName(UnsafeRow $keyTerm, UnsafeRow $bufferTerm)
@@ -680,6 +691,10 @@ case class HashAggregateExec(
  private def doProduceWithKeys(ctx: CodegenContext): String = {
Member
Why did you apply this optimization only for the with-key case?

Contributor (Author)
There will be only one key for the map in the without-key case, and the optimization will not apply.

Member
Ah, I see.
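To make the with-key/without-key distinction concrete, here is an illustration (assumes a `SparkSession` named `spark`; not code from this PR): only grouped aggregation builds a per-key hash map whose size grows with key cardinality, so only that path can benefit from skipping partial aggregation.

```scala
// Illustration only (assumes a SparkSession `spark`); not code from this PR.

// With-key: partial aggregation keeps a hash map keyed by `k`; with ~10^6 distinct
// keys the map barely reduces the data, which is the case this PR targets.
val withKeys = spark.range(1000000L)
  .selectExpr("id % 999983 AS k", "id AS v")
  .groupBy("k")
  .sum("v")

// Without-key: there is a single aggregation buffer and no per-key map,
// so there is nothing to spill and the optimization does not apply.
val withoutKeys = spark.range(1000000L)
  .selectExpr("id AS v")
  .agg(Map("v" -> "sum"))
```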
    val initAgg = ctx.addMutableState(CodeGenerator.JAVA_BOOLEAN, "initAgg")
    avoidSpillInPartialAggregateTerm = ctx.
      addMutableState(CodeGenerator.JAVA_BOOLEAN, "avoidPartialAggregate")
    val childrenConsumed = ctx.
      addMutableState(CodeGenerator.JAVA_BOOLEAN, "childrenConsumed")
    if (sqlContext.conf.enableTwoLevelAggMap) {
      enableTwoLevelHashMap(ctx)
    } else if (sqlContext.conf.enableVectorizedHashMap) {

@@ -750,18 +765,19 @@ case class HashAggregateExec(
      finishRegularHashMap
    }

    outputFunc = generateResultFunction(ctx)
    val doAggFuncName = ctx.addNewFunction(doAgg,
      s"""
         |private void $doAgg() throws java.io.IOException {
         |  ${child.asInstanceOf[CodegenSupport].produce(ctx, this)}
         |  $childrenConsumed = true;
         |  $finishHashMap
         |}
       """.stripMargin)

    // generate code for output
    val keyTerm = ctx.freshName("aggKey")
    val bufferTerm = ctx.freshName("aggBuffer")
    val outputFunc = generateResultFunction(ctx)
Member
Why did you move this line into line 771?
    def outputFromFastHashMap: String = {
      if (isFastHashMapEnabled) {

@@ -833,11 +849,18 @@ case class HashAggregateExec(
    s"""
       |if (!$initAgg) {
       |  $initAgg = true;
       |  $avoidSpillInPartialAggregateTerm =
       |    ${Utils.isTesting} && $skipPartialAggregate;
       |  $createFastHashMap
       |  $hashMapTerm = $thisPlan.createHashMap();
       |  long $beforeAgg = System.nanoTime();
       |  $doAggFuncName();
       |  $aggTime.add((System.nanoTime() - $beforeAgg) / $NANOS_PER_MILLIS);
       |  $shouldStopCheckCode;
       |}
       |if (!$childrenConsumed) {
       |  $doAggFuncName();
       |  $shouldStopCheckCode;
       |}
       |// output the result
       |$outputFromFastHashMap
@@ -878,43 +901,51 @@ case class HashAggregateExec(
    }

    val oomeClassName = classOf[SparkOutOfMemoryError].getName

    val findOrInsertRegularHashMap: String =
      s"""
         |// generate grouping key
         |${unsafeRowKeyCode.code}
         |int $unsafeRowKeyHash = ${unsafeRowKeyCode.value}.hashCode();
         |if ($checkFallbackForBytesToBytesMap) {
         |  // try to get the buffer from hash map
         |  $unsafeRowBuffer =
         |    $hashMapTerm.getAggregationBufferFromUnsafeRow($unsafeRowKeys, $unsafeRowKeyHash);
         |}
         |// Can't allocate buffer from the hash map. Spill the map and fallback to sort-based
         |// aggregation after processing all input rows.
         |if ($unsafeRowBuffer == null) {
         |  if ($sorterTerm == null) {
         |    $sorterTerm = $hashMapTerm.destructAndCreateExternalSorter();
         |  } else {
         |    $sorterTerm.merge($hashMapTerm.destructAndCreateExternalSorter());
         |if (!$avoidSpillInPartialAggregateTerm) {
         |  // generate grouping key
         |  ${unsafeRowKeyCode.code}
         |  int $unsafeRowKeyHash = ${unsafeRowKeyCode.value}.hashCode();
         |  if ($checkFallbackForBytesToBytesMap) {
         |    // try to get the buffer from hash map
         |    $unsafeRowBuffer =
         |      $hashMapTerm.getAggregationBufferFromUnsafeRow($unsafeRowKeys, $unsafeRowKeyHash);
         |  }
         |  $resetCounter
         |  // the hash map had be spilled, it should have enough memory now,
         |  // try to allocate buffer again.
         |  $unsafeRowBuffer = $hashMapTerm.getAggregationBufferFromUnsafeRow(
         |    $unsafeRowKeys, $unsafeRowKeyHash);
         |  if ($unsafeRowBuffer == null) {
         |    // failed to allocate the first page
         |    throw new $oomeClassName("No enough memory for aggregation");
         |  // Can't allocate buffer from the hash map. Spill the map and fallback to sort-based
         |  // aggregation after processing all input rows.
         |  if ($unsafeRowBuffer == null && !$avoidSpillInPartialAggregateTerm) {
         |    // If sort/spill to disk is disabled, nothing is done.
         |    // Aggregation buffer is created later
         |    if ($skipPartialAggregate) {
         |      $avoidSpillInPartialAggregateTerm = true;
         |    } else {
         |      if ($sorterTerm == null) {
         |        $sorterTerm = $hashMapTerm.destructAndCreateExternalSorter();
         |      } else {
         |        $sorterTerm.merge($hashMapTerm.destructAndCreateExternalSorter());
         |      }
         |      $resetCounter
         |      // the hash map had be spilled, it should have enough memory now,
         |      // try to allocate buffer again.
         |      $unsafeRowBuffer = $hashMapTerm.getAggregationBufferFromUnsafeRow(
         |        $unsafeRowKeys, $unsafeRowKeyHash);
         |      if ($unsafeRowBuffer == null) {
         |        // failed to allocate the first page
         |        throw new $oomeClassName("No enough memory for aggregation");
         |      }
         |    }
         |  }
         |}
       """.stripMargin
    val partTerm = metricTerm(ctx, "partialAggSkipped")
    val findOrInsertHashMap: String = {
      if (isFastHashMapEnabled) {
      val insertCode = if (isFastHashMapEnabled) {
        // If fast hash map is on, we first generate code to probe and update the fast hash map.
        // If the probe is successful the corresponding fast row buffer will hold the mutable row.
        s"""
           |if ($checkFallbackForGeneratedHashMap) {
           |if ($checkFallbackForGeneratedHashMap && !$avoidSpillInPartialAggregateTerm) {
           |  ${fastRowKeys.map(_.code).mkString("\n")}
           |  if (${fastRowKeys.map("!" + _.isNull).mkString(" && ")}) {
           |    $fastRowBuffer = $fastHashMapTerm.findOrInsert(

@@ -929,6 +960,18 @@ case class HashAggregateExec(
      } else {
        findOrInsertRegularHashMap
      }
      val initExpr = declFunctions.flatMap(f => f.initialValues)
      val emptyBufferKeyCode = GenerateUnsafeProjection.createCode(ctx, initExpr)
      s"""
         |$insertCode
         |// Create an empty aggregation buffer
         |if ($avoidSpillInPartialAggregateTerm) {
         |  ${unsafeRowKeyCode.code}
         |  ${emptyBufferKeyCode.code}
         |  $unsafeRowBuffer = ${emptyBufferKeyCode.value};
         |  $partTerm.add(1);
         |}
         |""".stripMargin
    }

    val inputAttr = aggregateBufferAttributes ++ inputAttributes

@@ -1005,7 +1048,7 @@ case class HashAggregateExec(
    }

    val updateRowInHashMap: String = {
      if (isFastHashMapEnabled) {
      val updateRowinMap = if (isFastHashMapEnabled) {
        if (isVectorizedHashMapEnabled) {
          ctx.INPUT_ROW = fastRowBuffer
          val boundUpdateExprs = updateExprs.map { updateExprsForOneFunc =>

@@ -1080,6 +1123,12 @@ case class HashAggregateExec(
      } else {
        updateRowInRegularHashMap
      }
      s"""
         |$updateRowinMap
         |if ($avoidSpillInPartialAggregateTerm) {
         |  $outputFunc(${unsafeRowKeyCode.value}, $unsafeRowBuffer);
         |}
         |""".stripMargin
    }

    val declareRowBuffer: String = if (isFastHashMapEnabled) {
So this only works for hash aggregate but not the sort-based aggregate?

I believe this heuristic can be applied to sort-based aggregation as well. I started with hash-based aggregation; I will create a new PR for sort-based aggregation.