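Refactor SubExprEliminationBenchmark to generate its benchmark cases from a Seq of configurations instead of duplicating each case.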

viirya · viirya · commit 930ffe22c990 · 2020-11-23T18:26:39.000-08:00
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-jdk11-results.txt
@@ -5,21 +5,21 @@ Benchmark for performance of subexpression elimination
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6
 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-from_json as subExpr in Project:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
--------------------------------------------------------------------------------------------------------------------------
-subexpressionElimination off, codegen on           22605          22935         291          0.0   226047196.5       1.0X
-subexpressionElimination off, codegen off          21811          22151         303          0.0   218105716.6       1.0X
-subexpressionElimination on, codegen on             1353           1385          36          0.0    13531011.3      16.7X
-subexpressionElimination on, codegen off            1237           1260          20          0.0    12368657.3      18.3X
+from_json as subExpr in Project:          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+subExprElimination false, codegen: true           26447          27127         605          0.0   264467933.4       1.0X
+subExprElimination false, codegen: false          25673          26035         546          0.0   256732419.1       1.0X
+subExprElimination true, codegen: true             1384           1448         102          0.0    13842910.3      19.1X
+subExprElimination true, codegen: false            1244           1347         123          0.0    12442389.3      21.3X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 11.0.9+11 on Mac OS X 10.15.6
 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-from_json as subExpr in Filter:            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
--------------------------------------------------------------------------------------------------------------------------
-subexpressionElimination off, codegen on           32792          33101         282          0.0   327922763.5       1.0X
-subexpressionElimination off, codegen off          32809          33433         550          0.0   328088662.6       1.0X
-subexpressionElimination on, codegen on            18173          18828         869          0.0   181734709.5       1.8X
-subexpressionElimination on, codegen off           33695          33951         287          0.0   336950807.7       1.0X
+from_json as subExpr in Filter:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+subExprElimination false, codegen: true           34631          35449         833          0.0   346309884.0       1.0X
+subExprElimination false, codegen: false          34480          34851         353          0.0   344798490.4       1.0X
+subExprElimination true, codegen: true            16618          16811         291          0.0   166176642.6       2.1X
+subExprElimination true, codegen: false           34316          34667         310          0.0   343157094.7       1.0X
 
 
diff --git a/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt b/sql/core/benchmarks/SubExprEliminationBenchmark-results.txt
@@ -5,21 +5,21 @@ Benchmark for performance of subexpression elimination
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6
 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-from_json as subExpr in Project:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
--------------------------------------------------------------------------------------------------------------------------
-subexpressionElimination off, codegen on           25887          26105         326          0.0   258868246.6       1.0X
-subexpressionElimination off, codegen off          25131          25454         522          0.0   251309329.7       1.0X
-subexpressionElimination on, codegen on             2230           2340         106          0.0    22302959.3      11.6X
-subexpressionElimination on, codegen off            2185           2254          64          0.0    21852694.0      11.8X
+from_json as subExpr in Project:          Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+subExprElimination false, codegen: true           22767          23240         424          0.0   227665316.7       1.0X
+subExprElimination false, codegen: false          22869          23351         465          0.0   228693464.1       1.0X
+subExprElimination true, codegen: true             1328           1340          10          0.0    13280056.2      17.1X
+subExprElimination true, codegen: false            1248           1276          31          0.0    12476135.1      18.2X
 
 Preparing data for benchmarking ...
 OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.15.6
 Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz
-from_json as subExpr in Filter:            Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
--------------------------------------------------------------------------------------------------------------------------
-subexpressionElimination off, codegen on           42687          42805         111          0.0   426873372.5       1.0X
-subexpressionElimination off, codegen off          43606          45108        1613          0.0   436055236.3       1.0X
-subexpressionElimination on, codegen on            29761          30563         704          0.0   297614324.4       1.4X
-subexpressionElimination on, codegen off           41676          42598         955          0.0   416758112.3       1.0X
+from_json as subExpr in Filter:           Best Time(ms)   Avg Time(ms)   Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+subExprElimination false, codegen: true           37691          38846        1004          0.0   376913767.9       1.0X
+subExprElimination false, codegen: false          37852          39124        1103          0.0   378517745.5       1.0X
+subExprElimination true, codegen: true            22900          23085         202          0.0   229000242.5       1.6X
+subExprElimination true, codegen: false           38298          38598         374          0.0   382978731.3       1.0X
 
 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/SubExprEliminationBenchmark.scala
@@ -52,57 +52,26 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
         from_json('value, schema).getField(s"col$idx")
       }
 
-      // We only benchmark subexpression performance under codegen/non-codegen, so disabling
-      // json optimization.
-      benchmark.addCase("subexpressionElimination off, codegen on", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .select(cols: _*)
-          df.write.mode("overwrite").format("noop").save()
-        }
-      }
-
-      benchmark.addCase("subexpressionElimination off, codegen off", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .select(cols: _*)
-          df.write.mode("overwrite").format("noop").save()
-        }
-      }
-
-      benchmark.addCase("subexpressionElimination on, codegen on", numIters) { _ =>
-        withSQLConf(
-            SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true",
-            SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true",
-            SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
+      Seq(
+        ("false", "true", "CODEGEN_ONLY"),
+        ("false", "false", "NO_CODEGEN"),
+        ("true", "true", "CODEGEN_ONLY"),
+        ("true", "false", "NO_CODEGEN")
+      ).foreach { case (subExprEliminationEnabled, codegenEnabled, codegenFactory) =>
+        // We only benchmark subexpression performance under codegen/non-codegen, so disabling
+        // json optimization.
+        val caseName = s"subExprElimination $subExprEliminationEnabled, codegen: $codegenEnabled"
+        benchmark.addCase(caseName, numIters) { _ =>
+          withSQLConf(
+            SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> subExprEliminationEnabled,
+            SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled,
+            SQLConf.CODEGEN_FACTORY_MODE.key -> codegenFactory,
             SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .select(cols: _*)
-          df.write.mode("overwrite").format("noop").save()
-        }
-      }
-
-      benchmark.addCase("subexpressionElimination on, codegen off", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .select(cols: _*)
-          df.write.mode("overwrite").format("noop").save()
+            val df = spark.read
+              .text(path.getAbsolutePath)
+              .select(cols: _*)
+            df.write.mode("overwrite").format("noop").save()
+          }
         }
       }
 
@@ -122,57 +91,26 @@ object SubExprEliminationBenchmark extends SqlBasedBenchmark {
         (from_json('value, schema).getField(s"col$idx") >= Literal(100000)).expr
       }.asInstanceOf[Seq[Expression]].reduce(Or)
 
-      // We only benchmark subexpression performance under codegen/non-codegen, so disabling
-      // json optimization.
-      benchmark.addCase("subexpressionElimination off, codegen on", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .where(Column(predicate))
-          df.write.mode("overwrite").format("noop").save()
-        }
-      }
-
-      benchmark.addCase("subexpressionElimination off, codegen off", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "false",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .where(Column(predicate))
-          df.write.mode("overwrite").format("noop").save()
-        }
-      }
-
-      benchmark.addCase("subexpressionElimination on, codegen on", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "CODEGEN_ONLY",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .where(Column(predicate))
-          df.write.mode("overwrite").format("noop").save()
-        }
-      }
-
-      benchmark.addCase("subexpressionElimination on, codegen off", numIters) { _ =>
-        withSQLConf(
-          SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> "true",
-          SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false",
-          SQLConf.CODEGEN_FACTORY_MODE.key -> "NO_CODEGEN",
-          SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
-          val df = spark.read
-            .text(path.getAbsolutePath)
-            .where(Column(predicate))
-          df.write.mode("overwrite").format("noop").save()
+      Seq(
+        ("false", "true", "CODEGEN_ONLY"),
+        ("false", "false", "NO_CODEGEN"),
+        ("true", "true", "CODEGEN_ONLY"),
+        ("true", "false", "NO_CODEGEN")
+      ).foreach { case (subExprEliminationEnabled, codegenEnabled, codegenFactory) =>
+        // Only subexpression elimination under codegen/non-codegen is benchmarked, so json
+        // optimization is disabled. Each case is named after its own configuration values.
+        val caseName = s"subExprElimination $subExprEliminationEnabled, codegen: $codegenEnabled"
+        benchmark.addCase(caseName, numIters) { _ =>
+          withSQLConf(
+            SQLConf.SUBEXPRESSION_ELIMINATION_ENABLED.key -> subExprEliminationEnabled,
+            SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> codegenEnabled,
+            SQLConf.CODEGEN_FACTORY_MODE.key -> codegenFactory,
+            SQLConf.JSON_EXPRESSION_OPTIMIZATION.key -> "false") {
+            val df = spark.read
+              .text(path.getAbsolutePath)
+              .where(Column(predicate))
+            df.write.mode("overwrite").format("noop").save()
+          }
         }
       }