Skip to content

Commit f6ff632

Browse files
heary-cao and dongjoon-hyun
authored and committed
[SPARK-25847][SQL][TEST] Refactor JSONBenchmarks to use main method
## What changes were proposed in this pull request?

Refactor JSONBenchmark to use the main method.

Run with spark-submit: `bin/spark-submit --class org.apache.spark.sql.execution.datasources.json.JSONBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar ./sql/core/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar`

Generate benchmark result: `SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain org.apache.spark.sql.execution.datasources.json.JSONBenchmark"`

## How was this patch tested?

Manual tests.

Closes #22844 from heary-cao/JSONBenchmarks.

Lead-authored-by: caoxuewen <[email protected]>
Co-authored-by: heary <[email protected]>
Co-authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: hyukjinkwon <[email protected]>
1 parent 891032d commit f6ff632

File tree

2 files changed

+63
-60
lines changed

2 files changed

+63
-60
lines changed
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
================================================================================================
2+
Benchmark for performance of JSON parsing
3+
================================================================================================
4+
5+
Preparing data for benchmarking ...
6+
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
7+
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
8+
JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
9+
------------------------------------------------------------------------------------------------
10+
No encoding 62946 / 63310 1.6 629.5 1.0X
11+
UTF-8 is set 112814 / 112866 0.9 1128.1 0.6X
12+
13+
Preparing data for benchmarking ...
14+
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
15+
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
16+
JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
17+
------------------------------------------------------------------------------------------------
18+
No encoding 16468 / 16553 6.1 164.7 1.0X
19+
UTF-8 is set 16420 / 16441 6.1 164.2 1.0X
20+
21+
Preparing data for benchmarking ...
22+
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
23+
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
24+
JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
25+
------------------------------------------------------------------------------------------------
26+
No encoding 39789 / 40053 0.3 3978.9 1.0X
27+
UTF-8 is set 39505 / 39584 0.3 3950.5 1.0X
28+
29+
OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
30+
Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
31+
Count a dataset with 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
32+
------------------------------------------------------------------------------------------------
33+
Select 10 columns + count() 15997 / 16015 0.6 1599.7 1.0X
34+
Select 1 column + count() 13280 / 13326 0.8 1328.0 1.2X
35+
count() 3006 / 3021 3.3 300.6 5.3X
36+
37+

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmarks.scala renamed to sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala

Lines changed: 26 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -16,32 +16,31 @@
1616
*/
1717
package org.apache.spark.sql.execution.datasources.json
1818

19-
import java.io.File
20-
21-
import org.apache.spark.SparkConf
2219
import org.apache.spark.benchmark.Benchmark
23-
import org.apache.spark.sql.{Row, SparkSession}
24-
import org.apache.spark.sql.catalyst.plans.SQLHelper
20+
import org.apache.spark.sql.Row
21+
import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark
2522
import org.apache.spark.sql.functions.lit
2623
import org.apache.spark.sql.types._
2724

2825
/**
2926
* The benchmark aims to measure the performance of JSON parsing when an encoding is set and when it isn't.
30-
* To run this:
31-
* spark-submit --class <this class> --jars <spark sql test jar>
27+
* To run this benchmark:
28+
* {{{
29+
* 1. without sbt:
30+
* bin/spark-submit --class <this class> --jars <spark core test jar>,
31+
* <spark catalyst test jar> <spark sql test jar>
32+
* 2. build/sbt "sql/test:runMain <this class>"
33+
* 3. generate result:
34+
* SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
35+
* Results will be written to "benchmarks/JSONBenchmark-results.txt".
36+
* }}}
3237
*/
33-
object JSONBenchmarks extends SQLHelper {
34-
val conf = new SparkConf()
35-
36-
val spark = SparkSession.builder
37-
.master("local[1]")
38-
.appName("benchmark-json-datasource")
39-
.config(conf)
40-
.getOrCreate()
38+
39+
object JSONBenchmark extends SqlBasedBenchmark {
4140
import spark.implicits._
4241

4342
def schemaInferring(rowsNum: Int): Unit = {
44-
val benchmark = new Benchmark("JSON schema inferring", rowsNum)
43+
val benchmark = new Benchmark("JSON schema inferring", rowsNum, output = output)
4544

4645
withTempPath { path =>
4746
// scalastyle:off println
@@ -65,21 +64,12 @@ object JSONBenchmarks extends SQLHelper {
6564
.json(path.getAbsolutePath)
6665
}
6766

68-
/*
69-
Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
70-
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
71-
72-
JSON schema inferring: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
73-
---------------------------------------------------------------------------------------------
74-
No encoding 45908 / 46480 2.2 459.1 1.0X
75-
UTF-8 is set 68469 / 69762 1.5 684.7 0.7X
76-
*/
7767
benchmark.run()
7868
}
7969
}
8070

8171
def perlineParsing(rowsNum: Int): Unit = {
82-
val benchmark = new Benchmark("JSON per-line parsing", rowsNum)
72+
val benchmark = new Benchmark("JSON per-line parsing", rowsNum, output = output)
8373

8474
withTempPath { path =>
8575
// scalastyle:off println
@@ -107,21 +97,12 @@ object JSONBenchmarks extends SQLHelper {
10797
.count()
10898
}
10999

110-
/*
111-
Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
112-
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
113-
114-
JSON per-line parsing: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
115-
---------------------------------------------------------------------------------------------
116-
No encoding 9982 / 10237 10.0 99.8 1.0X
117-
UTF-8 is set 16373 / 16806 6.1 163.7 0.6X
118-
*/
119100
benchmark.run()
120101
}
121102
}
122103

123104
def perlineParsingOfWideColumn(rowsNum: Int): Unit = {
124-
val benchmark = new Benchmark("JSON parsing of wide lines", rowsNum)
105+
val benchmark = new Benchmark("JSON parsing of wide lines", rowsNum, output = output)
125106

126107
withTempPath { path =>
127108
// scalastyle:off println
@@ -156,22 +137,14 @@ object JSONBenchmarks extends SQLHelper {
156137
.count()
157138
}
158139

159-
/*
160-
Java HotSpot(TM) 64-Bit Server VM 1.8.0_172-b11 on Mac OS X 10.13.5
161-
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
162-
163-
JSON parsing of wide lines: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
164-
---------------------------------------------------------------------------------------------
165-
No encoding 26038 / 26386 0.4 2603.8 1.0X
166-
UTF-8 is set 28343 / 28557 0.4 2834.3 0.9X
167-
*/
168140
benchmark.run()
169141
}
170142
}
171143

172144
def countBenchmark(rowsNum: Int): Unit = {
173145
val colsNum = 10
174-
val benchmark = new Benchmark(s"Count a dataset with $colsNum columns", rowsNum)
146+
val benchmark =
147+
new Benchmark(s"Count a dataset with $colsNum columns", rowsNum, output = output)
175148

176149
withTempPath { path =>
177150
val fields = Seq.tabulate(colsNum)(i => StructField(s"col$i", IntegerType))
@@ -195,23 +168,16 @@ object JSONBenchmarks extends SQLHelper {
195168
ds.count()
196169
}
197170

198-
/*
199-
Intel(R) Core(TM) i7-7700HQ CPU @ 2.80GHz
200-
201-
Count a dataset with 10 columns: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
202-
---------------------------------------------------------------------------------------------
203-
Select 10 columns + count() 9961 / 10006 1.0 996.1 1.0X
204-
Select 1 column + count() 8355 / 8470 1.2 835.5 1.2X
205-
count() 2104 / 2156 4.8 210.4 4.7X
206-
*/
207171
benchmark.run()
208172
}
209173
}
210174

211-
def main(args: Array[String]): Unit = {
212-
schemaInferring(100 * 1000 * 1000)
213-
perlineParsing(100 * 1000 * 1000)
214-
perlineParsingOfWideColumn(10 * 1000 * 1000)
215-
countBenchmark(10 * 1000 * 1000)
175+
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
176+
runBenchmark("Benchmark for performance of JSON parsing") {
177+
schemaInferring(100 * 1000 * 1000)
178+
perlineParsing(100 * 1000 * 1000)
179+
perlineParsingOfWideColumn(10 * 1000 * 1000)
180+
countBenchmark(10 * 1000 * 1000)
181+
}
216182
}
217183
}

0 commit comments

Comments
 (0)