@@ -26,7 +26,7 @@ import org.apache.spark.SparkFunSuite
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.expressions.aggregate.{Final, Partial}
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
-import org.apache.spark.sql.execution.{FilterExec, RangeExec, SparkPlan, WholeStageCodegenExec}
+import org.apache.spark.sql.execution.{FilterExec, RangeExec, SortExec, SparkPlan, WholeStageCodegenExec}
Contributor: unneeded change

Contributor Author: Removed in the new commit.

import org.apache.spark.sql.execution.aggregate.HashAggregateExec
import org.apache.spark.sql.functions._
import org.apache.spark.sql.internal.SQLConf
@@ -194,10 +194,20 @@ class SQLMetricsSuite extends SparkFunSuite with SQLMetricsTestUtils with Shared
}

test("Sort metrics") {
// Assume the execution plan is
// WholeStageCodegen(nodeId = 0, Range(nodeId = 2) -> Sort(nodeId = 1))
val ds = spark.range(10).sort('id)
testSparkPlanMetrics(ds.toDF(), 2, Map.empty)
// Assume the execution plan with node id is
// Sort(nodeId = 0)
// Exchange(nodeId = 1)
// Project(nodeId = 2)
// LocalTableScan(nodeId = 3)
// Because of SPARK-25267, ConvertToLocalRelation is disabled in the test cases of sql/core,
// so Project here is not collapsed into LocalTableScan.
val df = Seq(1, 3, 2).toDF("id").sort('id)
testSparkPlanMetricsWithPredicates(df, 2, Map(
0L -> (("Sort", Map(
"sort time total (min, med, max)" -> checkPattern(timingMetricPattern),
"peak memory total (min, med, max)" -> checkPattern(sizeMetricPattern),
"spill size total (min, med, max)" -> checkPattern(sizeMetricPattern))))
))
}

test("SortMergeJoin metrics") {
@@ -18,6 +18,7 @@
package org.apache.spark.sql.execution.metric

import java.io.File
import java.util.regex.Pattern

import scala.collection.mutable.HashMap

@@ -40,6 +41,26 @@ trait SQLMetricsTestUtils extends SQLTestUtils {

protected def statusStore: SQLAppStatusStore = spark.sharedState.statusStore

protected val bytes = "([0-9]+(\\.[0-9]+)?) (EiB|PiB|TiB|GiB|MiB|KiB|B)"
Contributor: nit: private? And maybe move it closer to where it is used?

Contributor Author: I have inlined this in an initializer as @srowen suggested.


protected val duration = "([0-9]+(\\.[0-9]+)?) (ms|s|m|h)"
Member: private? Or you can inline this in an initializer below:

protected val sizeMetricPattern = {
  val bytes = ...
  s"\\n$bytes...".r
}

Contributor: ditto

Contributor Author: I have inlined this in an initializer as @srowen suggested.


// "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB)"
Contributor: Shall we say something more here? A line that explains what this is, and then an example; the one you have is fine IMHO.

Contributor Author: I have added more comments.

protected val sizeMetricPattern = Pattern.compile(s"\\n$bytes \\($bytes, $bytes, $bytes\\)")
Member: Add .r to the end of these strings to make them a scala.util.matching.Regex automatically. That's more idiomatic for Scala. No need to import and use Java's Pattern.

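For illustration, a minimal self-contained sketch of the suggested style; the bytes building block and the sample value are taken from this diff, and the object wrapper is just scaffolding for the example:

import scala.util.matching.Regex

object RegexStyleSketch {
  // Building block copied from the diff above.
  private val bytes = "([0-9]+(\\.[0-9]+)?) (EiB|PiB|TiB|GiB|MiB|KiB|B)"

  // Inlined in an initializer and compiled with .r, as suggested: no
  // java.util.regex.Pattern import is needed.
  val sizeMetricPattern: Regex = s"\\n$bytes \\($bytes, $bytes, $bytes\\)".r

  def main(args: Array[String]): Unit = {
    // A rendered size metric value, as in the comment below.
    val sample = "\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB)"
    // Regex#pattern exposes the underlying java.util.regex.Pattern.
    assert(sizeMetricPattern.pattern.matcher(sample).matches())
  }
}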

// "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms)"
protected val timingMetricPattern =
Pattern.compile(s"\\n$duration \\($duration, $duration, $duration\\)")

/** Generate a function to check the specified pattern.
Contributor: nit: in the next line

*
* @param pattern a pattern
Contributor: not very useful, we can remove it

Contributor Author: Removed the checkPattern method.

* @return a function to check the specified pattern
*/
protected def checkPattern(pattern: Pattern): (Any => Boolean) = {
Member: Is this method really needed? The only place it's used is the very specific method for testing metrics, and that always provides a regex. Just provide a map to regexes that you check against, rather than whole predicates?

Or, consider not compiling the regexes above and keeping them as string patterns. Then the predicate you pass is just something like sizeMetricPattern.matches(_). It means compiling the regex on every check, but in this test context that's no big deal.

That would help limit the complexity of all this.

Contributor Author: I'd like to take the 2nd option.
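A minimal sketch of that second option, assuming the bytes and duration strings from the diff above are kept uncompiled; the sample values come from the comments in this file:

object StringPatternPredicates {
  val bytes = "([0-9]+(\\.[0-9]+)?) (EiB|PiB|TiB|GiB|MiB|KiB|B)"
  val duration = "([0-9]+(\\.[0-9]+)?) (ms|s|m|h)"

  // Kept as plain strings; the regex is recompiled on every check, which is
  // no big deal in a test context.
  val sizeMetricPattern = s"\\n$bytes \\($bytes, $bytes, $bytes\\)"
  val timingMetricPattern = s"\\n$duration \\($duration, $duration, $duration\\)"

  // String#matches compiles the pattern and requires the whole input to match.
  val sizePredicate: Any => Boolean = _.toString.matches(sizeMetricPattern)
  val timingPredicate: Any => Boolean = _.toString.matches(timingMetricPattern)

  def main(args: Array[String]): Unit = {
    assert(sizePredicate("\n96.2 MiB (32.1 MiB, 32.1 MiB, 32.1 MiB)"))
    assert(timingPredicate("\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms)"))
    assert(!sizePredicate("96.2 MiB")) // must match the full "total (min, med, max)" shape
  }
}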

(in: Any) => pattern.matcher(in.toString).matches()
}

/**
* Get execution metrics for the SQL execution and verify metrics values.
*
@@ -198,6 +219,32 @@ trait SQLMetricsTestUtils extends SQLTestUtils {
}
Contributor: We can update this to avoid code duplication and reuse testSparkPlanMetrics.

Contributor Author: I have updated testSparkPlanMetrics to invoke testSparkPlanMetricsWithPredicates to avoid code duplication.

}
}

/**
* Call `df.collect()` and verify if the collected metrics satisfy the specified predicates.
* @param df `DataFrame` to run
* @param expectedNumOfJobs number of jobs that will run
* @param expectedMetricsPredicates the expected metrics predicates. The format is
Contributor: nit: go to 100 chars, and the next line has bad indentation.

Contributor Author (@seancxmao, Dec 28, 2018): Metric values are usually numbers, so predicates can be more natural for checking them than regular expressions, which are better suited to text matching. For simple metric values, helper functions are not needed. However, timing and size metric values are a little more complex:

  • size metric value example: "\n96.2 MB (32.1 MB, 32.1 MB, 32.1 MB)"
  • timing metric value example: "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms)"

With helper functions, we first extract the stats (via the timingMetricStats or sizeMetricStats method) and can then apply predicates to any of them (all stats, or any single one). timingMetricAllStatsShould and sizeMetricAllStatsShould are not strictly required; they are syntactic sugar to eliminate boilerplate, since timing and size metrics are used so frequently. If we want to check a single value (e.g. sum >= 0), we can provide a predicate like:

timingMetricStats(_)(0)._1 >= 0

BTW, maybe timing and size metric values should be stored in a more structured way rather than as pure text (even with "\n" in the values).
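For illustration only, a hypothetical sketch of the stats-extraction idea described here; the helper name comes from this comment, but the implementation below is an assumption, not the PR's code:

object MetricStatsSketch {
  // Hypothetical: parse a rendered size metric value such as
  // "\n96.2 MB (32.1 MB, 32.1 MB, 32.1 MB)" into (value, unit) pairs,
  // ordered as (total, min, med, max).
  def sizeMetricStats(metricValue: Any): Seq[(Float, String)] = {
    val stat = "([0-9]+(?:\\.[0-9]+)?) ([A-Za-z]+)".r
    stat.findAllMatchIn(metricValue.toString)
      .map(m => (m.group(1).toFloat, m.group(2)))
      .toSeq
  }

  def main(args: Array[String]): Unit = {
    // A predicate in the spirit of the one quoted above: the first stat
    // (the total) should be non-negative.
    val totalNonNegative: Any => Boolean = v => sizeMetricStats(v).head._1 >= 0
    assert(totalNonNegative("\n96.2 MB (32.1 MB, 32.1 MB, 32.1 MB)"))
  }
}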

Contributor Author: Yes, the indentation was not right. I have fixed it in the new commit.

Contributor: My point is: as of now, pattern matching is enough for what we need to check, and we do not have a use case where we actually need to parse the exact values. Going that way, we can simplify this PR and considerably reduce the size of this change, so I think we should do it. If in the future we need something like what you propose here, because we want to check the actual values, we can introduce these methods then. But as of now this can be skipped IMO.

Member: This does look like a load of additional code, and I think it duplicates some existing code in Utils? Is it really necessary just to make some basic assertions about metric values?

Contributor Author: @mgaido91 I agree, thanks for your detailed and clear explanation. Checking metric values does make things unnecessarily complex.

@srowen As @mgaido91 said, it is currently not necessary to check metric values; pattern matching is enough, and we can eliminate these methods. As for code duplication, the methods here do not duplicate the code in Utils. Utils provides a bunch of methods to convert between strings and byte counts, where the byte counts are of type Long. The byte values in metric strings, however, are of type Float, e.g. 96.2 MB.

Contributor Author: Hi, I have switched to pattern matching and also removed unnecessary helper methods in the new commit.

* `nodeId -> (operatorName, metric name -> metric value predicate)`.
Contributor: nit: indent

Contributor Author: Fixed indentation.

*/
protected def testSparkPlanMetricsWithPredicates(
    df: DataFrame,
    expectedNumOfJobs: Int,
    expectedMetricsPredicates: Map[Long, (String, Map[String, Any => Boolean])]): Unit = {
  val optActualMetrics =
    getSparkPlanMetrics(df, expectedNumOfJobs, expectedMetricsPredicates.keySet)
  optActualMetrics.foreach { actualMetrics =>
    assert(expectedMetricsPredicates.keySet === actualMetrics.keySet)
    for (nodeId <- expectedMetricsPredicates.keySet) {
Member: It might be a little cleaner to iterate over (key, value) pairs here and below, rather than iterate over the keys and then get the values:

for ((nodeId, (expectedNodeName, expectedMetricsPredicatesMap)) <- expectedMetricsPredicates) {

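A self-contained sketch of that pair-wise iteration; the types mirror the expectedMetricsPredicates map in this diff, and the demo data is made up for the example:

object PairIterationSketch {
  // nodeId -> (operatorName, metricName -> predicate), as in the diff.
  type Predicates = Map[Long, (String, Map[String, Any => Boolean])]
  // nodeId -> (operatorName, metricName -> actual metric value).
  type Actuals = Map[Long, (String, Map[String, Any])]

  def check(expected: Predicates, actual: Actuals): Unit = {
    assert(expected.keySet == actual.keySet)
    // Destructure (key, value) pairs directly instead of iterating over the
    // keySet and looking each value up again.
    for ((nodeId, (expectedNodeName, predicates)) <- expected) {
      val (actualNodeName, actualValues) = actual(nodeId)
      assert(expectedNodeName == actualNodeName)
      for ((metricName, predicate) <- predicates) {
        assert(predicate(actualValues(metricName)))
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val metric = "sort time total (min, med, max)"
    check(
      Map(0L -> (("Sort", Map(metric -> ((_: Any) => true))))),
      Map(0L -> (("Sort", Map(metric -> "\n2.0 ms (1.0 ms, 1.0 ms, 1.0 ms)")))))
  }
}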
      val (expectedNodeName, expectedMetricsPredicatesMap) = expectedMetricsPredicates(nodeId)
      val (actualNodeName, actualMetricsMap) = actualMetrics(nodeId)
      assert(expectedNodeName === actualNodeName)
      for (metricName <- expectedMetricsPredicatesMap.keySet) {
Member: You can use a similar iteration over the map here to avoid the keySet and the get.

Contributor Author: Changed in the new commit.

        assert(expectedMetricsPredicatesMap(metricName)(actualMetricsMap(metricName)))
      }
    }
  }
}
Member: Do we need to put these helper functions here? These functions are only used by test("Sort metrics") now ...

Contributor Author: Yes, currently these functions are only used by test("Sort metrics"). What SQLMetricsSuite has been checking so far are almost exclusively integer metrics (e.g. "number of output rows", "records read", ...). However, we should also check non-integer metrics, such as timing and size metrics. These metrics all share the "total (min, med, max)" format, so these helper functions could be used to check all of them. Please see the screenshot I posted above for more timing and size metric examples (shuffle write, shuffle read, ...).

Contributor: I think we can actually remove all of them for now. We can just check that the metrics are defined, since we are not really checking their values (the only one for which we ensure something is the peak memory...). I'd propose defining a testSparkPlanMetricsPattern which is basically the same as testSparkPlanMetrics, but instead of providing a value for each metric, we pass a pattern. What do you think?

Contributor Author (@seancxmao, Dec 28, 2018): It's a great idea to add a method similar to testSparkPlanMetrics. Let me try. I'd like to slightly change the method name to testSparkPlanMetricsWithPredicates, since we are actually passing in predicates.

Contributor Author: As for checking metrics, checking ">= 0" is better than just checking whether a metric is defined, because a size or timing SQLMetric can be initialized to a non-zero value, e.g. -1:

def createSizeMetric(sc: SparkContext, name: String): SQLMetric = {
  // The final result of this metric in physical operator UI may look like:
  // data size total (min, med, max):
  // 100GB (100MB, 1GB, 10GB)
  val acc = new SQLMetric(SIZE_METRIC, -1)
  acc.register(sc, name = Some(s"$name total (min, med, max)"), countFailedValues = false)
  acc
}

def createTimingMetric(sc: SparkContext, name: String): SQLMetric = {
  // The final result of this metric in physical operator UI may look like:
  // duration(min, med, max):
  // 5s (800ms, 1s, 2s)
  val acc = new SQLMetric(TIMING_METRIC, -1)
  acc.register(sc, name = Some(s"$name total (min, med, max)"), countFailedValues = false)
  acc
}

Contributor Author (@seancxmao, Dec 28, 2018): In a new commit, I have added SQLMetricsTestUtils#testSparkPlanMetricsWithPredicates. This way, we simply provide a test spec in test("Sort metrics"), which makes the test case declarative rather than procedural.

To simplify timing and size metric testing, I added two common predicates, timingMetricAllStatsShould and sizeMetricAllStatsShould. These can be used for other metrics as well, as long as they are timing or size metrics.

And I also modified the original testSparkPlanMetrics to make it a special case of testSparkPlanMetricsWithPredicates, where each expected metric value is converted to an equality predicate (see the sketch below). This eliminates duplicate code, since the two methods are otherwise almost identical.
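A sketch of that conversion, assuming it sits in SQLMetricsTestUtils next to the method shown in this diff; it mirrors the description above, not necessarily the exact committed code:

// testSparkPlanMetrics becomes a special case of
// testSparkPlanMetricsWithPredicates: each expected metric value is wrapped
// into an equality predicate.
protected def testSparkPlanMetrics(
    df: DataFrame,
    expectedNumOfJobs: Int,
    expectedMetrics: Map[Long, (String, Map[String, Any])]): Unit = {
  val expectedMetricsPredicates = expectedMetrics.map {
    case (nodeId, (nodeName, nodeMetrics)) =>
      nodeId -> ((nodeName, nodeMetrics.map { case (metricName, expectedValue) =>
        metricName -> ((actualValue: Any) => actualValue == expectedValue)
      }))
  }
  testSparkPlanMetricsWithPredicates(df, expectedNumOfJobs, expectedMetricsPredicates)
}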

}

