[SPARK-25847][SQL][TEST] Refactor JSONBenchmarks to use main method

heary-cao · dongjoon-hyun · HyukjinKwon · commit f6ff6329eee7 · 2018-10-31T10:28:17.000+08:00
## What changes were proposed in this pull request? Refactor JSONBenchmark to use the main method. Run it with spark-submit: `bin/spark-submit --class org.apache.spark.sql.execution.datasources.json.JSONBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/core/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar` Generate the benchmark result: `SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.datasources.json.JSONBenchmark"` ## How was this patch tested? Manual tests. Closes #22844 from heary-cao/JSONBenchmarks. Lead-authored-by: caoxuewen <cao.xuewen@zte.com.cn> Co-authored-by: heary <cao.xuewen@zte.com.cn> Co-authored-by: Dongjoon Hyun <dongjoon@apache.org> Signed-off-by: hyukjinkwon <gurwls223@apache.org>
diff --git a/sql/core/benchmarks/JSONBenchmark-results.txt b/sql/core/benchmarks/JSONBenchmark-results.txt
@@ -0,0 +1,37 @@
+================================================================================================
+Benchmark for performance of JSON parsing
+================================================================================================
+
+Preparing data for benchmarking ...
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+JSON schema inferring:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+No encoding                                 62946 / 63310          1.6         629.5       1.0X
+UTF-8 is set                              112814 / 112866          0.9        1128.1       0.6X
+
+Preparing data for benchmarking ...
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+JSON per-line parsing:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+No encoding                                 16468 / 16553          6.1         164.7       1.0X
+UTF-8 is set                                16420 / 16441          6.1         164.2       1.0X
+
+Preparing data for benchmarking ...
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+JSON parsing of wide lines:              Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+No encoding                                 39789 / 40053          0.3        3978.9       1.0X
+UTF-8 is set                                39505 / 39584          0.3        3950.5       1.0X
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Count a dataset with 10 columns:         Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Select 10 columns + count()                 15997 / 16015          0.6        1599.7       1.0X
+Select 1 column + count()                   13280 / 13326          0.8        1328.0       1.2X
+count()                                       3006 / 3021          3.3         300.6       5.3X
+
+
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala
@@ -16,32 +16,31 @@
  */
 package org.apache.spark.sql.execution.datasources.json
 
-import java.io.File
-
-import org.apache.spark.SparkConf
 import org.apache.spark.benchmark.Benchmark
-import org.apache.spark.sql.{Row, SparkSession}
-import org.apache.spark.sql.catalyst.plans.SQLHelper
+import org.apache.spark.sql.Row
+import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark
 import org.apache.spark.sql.functions.lit
 import org.apache.spark.sql.types._
 
 /**
  * The benchmarks aims to measure performance of JSON parsing when encoding is set and isn't.
- * To run this:
- *  spark-submit --class <this class> --jars <spark sql test jar>
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class> --jars <spark core test jar>,
+ *        <spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/test:runMain <this class>"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
+ *      Results will be written to "benchmarks/JSONBenchmark-results.txt".
+ * }}}
  */
-object JSONBenchmarks extends SQLHelper {
-  val conf = new SparkConf()
-
-  val spark = SparkSession.builder
-    .master("local[1]")
-    .appName("benchmark-json-datasource")
-    .config(conf)
-    .getOrCreate()
+
+object JSONBenchmark extends SqlBasedBenchmark {
   import spark.implicits._
 
   def schemaInferring(rowsNum: Int): Unit = {
-    val benchmark = new Benchmark("JSON schema inferring", rowsNum)
+    val benchmark = new Benchmark("JSON schema inferring", rowsNum, output = output)
 
     withTempPath { path =>
       // scalastyle:off println
@@ -65,21 +64,12 @@ object JSONBenchmarks extends SQLHelper {
           .json(path.getAbsolutePath)
       }
 
-      /*
-      Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
-      Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
-
-      JSON schema inferring:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-      ---------------------------------------------------------------------------------------------
-      No encoding                              45908 / 46480          2.2         459.1       1.0X
-      UTF-8 is set                             68469 / 69762          1.5         684.7       0.7X
-      */
       benchmark.run()
     }
   }
 
   def perlineParsing(rowsNum: Int): Unit = {
-    val benchmark = new Benchmark("JSON per-line parsing", rowsNum)
+    val benchmark = new Benchmark("JSON per-line parsing", rowsNum, output = output)
 
     withTempPath { path =>
       // scalastyle:off println
@@ -107,21 +97,12 @@ object JSONBenchmarks extends SQLHelper {
           .count()
       }
 
-      /*
-      Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
-      Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
-
-      JSON per-line parsing:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-      ---------------------------------------------------------------------------------------------
-      No encoding                               9982 / 10237         10.0          99.8       1.0X
-      UTF-8 is set                             16373 / 16806          6.1         163.7       0.6X
-      */
       benchmark.run()
     }
   }
 
   def perlineParsingOfWideColumn(rowsNum: Int): Unit = {
-    val benchmark = new Benchmark("JSON parsing of wide lines", rowsNum)
+    val benchmark = new Benchmark("JSON parsing of wide lines", rowsNum, output = output)
 
     withTempPath { path =>
       // scalastyle:off println
@@ -156,22 +137,14 @@ object JSONBenchmarks extends SQLHelper {
           .count()
       }
 
-      /*
-      Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
-      Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
-
-      JSON parsing of wide lines:           Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-      ---------------------------------------------------------------------------------------------
-      No encoding                              26038 / 26386          0.4        2603.8       1.0X
-      UTF-8 is set                             28343 / 28557          0.4        2834.3       0.9X
-      */
       benchmark.run()
     }
   }
 
   def countBenchmark(rowsNum: Int): Unit = {
     val colsNum = 10
-    val benchmark = new Benchmark(s"Count a dataset with $colsNum columns", rowsNum)
+    val benchmark =
+      new Benchmark(s"Count a dataset with $colsNum columns", rowsNum, output = output)
 
     withTempPath { path =>
       val fields = Seq.tabulate(colsNum)(i => StructField(s"col$i", IntegerType))
@@ -195,23 +168,16 @@ object JSONBenchmarks extends SQLHelper {
         ds.count()
       }
 
-      /*
-      Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz
-
-      Count a dataset with 10 columns:      Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-      ---------------------------------------------------------------------------------------------
-      Select 10 columns + count()               9961 / 10006          1.0         996.1       1.0X
-      Select 1 column + count()                  8355 / 8470          1.2         835.5       1.2X
-      count()                                    2104 / 2156          4.8         210.4       4.7X
-      */
       benchmark.run()
     }
   }
 
-  def main(args: Array[String]): Unit = {
-    schemaInferring(100 * 1000 * 1000)
-    perlineParsing(100 * 1000 * 1000)
-    perlineParsingOfWideColumn(10 * 1000 * 1000)
-    countBenchmark(10 * 1000 * 1000)
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    runBenchmark("Benchmark for performance of JSON parsing") {
+      schemaInferring(100 * 1000 * 1000)
+      perlineParsing(100 * 1000 * 1000)
+      perlineParsingOfWideColumn(10 * 1000 * 1000)
+      countBenchmark(10 * 1000 * 1000)
+    }
   }
 }