Changes from 6 commits
10 changes: 10 additions & 0 deletions external/avro/benchmarks/AvroWriteBenchmark-results.txt
@@ -0,0 +1,10 @@
Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
Avro writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 2287 / 2295 6.9 145.4 1.0X
Output Single Double Column 2494 / 2538 6.3 158.6 0.9X
Output Int and String Column 5555 / 5587 2.8 353.2 0.4X
Output Partitions 3928 / 4157 4.0 249.7 0.6X
Output Buckets 5374 / 5441 2.9 341.7 0.4X

@@ -19,22 +19,17 @@ package org.apache.spark.sql.execution.benchmark

/**
* Benchmark to measure Avro data sources write performance.
* Usage:
* 1. with spark-submit: bin/spark-submit --class <this class> <spark sql test jar>
* 2. with sbt: build/sbt "avro/test:runMain <this class>"
* To run this benchmark:
* {{{
* 1. without sbt: bin/spark-submit --class <this class>
@wangyum (Member) commented on Oct 29, 2018:


Please add avro:

bin/spark-submit --class <this class> --jars <spark core test jar>,<spark catalyst test jar>,<spark sql test jar> <spark avro test jar>

Follow-up comment (Member):

I hit an exception when running:

bin/spark-submit --class org.apache.spark.sql.execution.benchmark.AvroWriteBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/core/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar ./external/avro/target/spark-avro_2.11-3.0.0-SNAPSHOT-tests.jar
Exception in thread "main" org.apache.spark.sql.AnalysisException: Failed to find data source: Avro. Avro is built-in but external data source module since Spark 2.4. Please deploy the application as per the deployment section of "Apache Avro Data Source Guide".;
	at org.apache.spark.sql.execution.datasources.DataSource$.lookupDataSource(DataSource.scala:647)
	at org.apache.spark.sql.execution.datasources.DataSource.providingClass$lzycompute(DataSource.scala:94)
	at org.apache.spark.sql.execution.datasources.DataSource.providingClass(DataSource.scala:93)
	at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:313)
	at org.apache.spark.sql.execution.command.CreateDataSourceTableCommand.run(createDataSourceTables.scala:78)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult$lzycompute(commands.scala:70)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.sideEffectResult(commands.scala:68)
	at org.apache.spark.sql.execution.command.ExecutedCommandExec.executeCollect(commands.scala:79)
	at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:195)
	at org.apache.spark.sql.Dataset$$anonfun$6.apply(Dataset.scala:195)
......

@yucai (Contributor, Author) commented on Oct 30, 2018:


@wangyum Good catch! I think it needs <spark avro jar>, added.
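
For reference, the corrected invocation would be the failing command above with the spark-avro (non-test) jar added to --jars; the exact path below is an assumption by analogy with the other target paths in this thread:

bin/spark-submit --class org.apache.spark.sql.execution.benchmark.AvroWriteBenchmark --jars ./core/target/spark-core_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/catalyst/target/spark-catalyst_2.11-3.0.0-SNAPSHOT-tests.jar,./sql/core/target/spark-sql_2.11-3.0.0-SNAPSHOT-tests.jar,./external/avro/target/spark-avro_2.11-3.0.0-SNAPSHOT.jar ./external/avro/target/spark-avro_2.11-3.0.0-SNAPSHOT-tests.jar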

* --jars <spark core test jar>,<spark catalyst test jar>,<spark avro jar> <spark sql test jar>
* 2. build/sbt "avro/test:runMain <this class>"
* 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain <this class>"
* Results will be written to "benchmarks/AvroWriteBenchmark-results.txt".
* }}}
*/
object AvroWriteBenchmark extends DataSourceWriteBenchmark {
def main(args: Array[String]): Unit = {
/*
Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
Avro writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 2481 / 2499 6.3 157.8 1.0X
Output Single Double Column 2705 / 2710 5.8 172.0 0.9X
Output Int and String Column 5539 / 5639 2.8 352.2 0.4X
Output Partitions 4613 / 5004 3.4 293.3 0.5X
Output Buckets 5554 / 5561 2.8 353.1 0.4X
*/
runBenchmark("Avro")
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
runDataSourceBenchmark("Avro")
}
}
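
The refactoring removes each benchmark's own main method in favor of a shared harness that calls runBenchmarkSuite. A minimal sketch of the shape such a base might have, assuming (inferred from this diff and the generated results files, not taken from the PR itself) that it owns the output stream and the SPARK_GENERATE_BENCHMARK_FILES switch:

import java.io.{File, FileOutputStream, OutputStream}

abstract class BenchmarkBase {
  // Where result tables are written; None means stdout.
  var output: Option[OutputStream] = None

  // Each concrete benchmark implements its measurements here.
  def runBenchmarkSuite(mainArgs: Array[String]): Unit

  def main(args: Array[String]): Unit = {
    // SPARK_GENERATE_BENCHMARK_FILES=1 redirects results into
    // benchmarks/<ClassName>-results.txt, matching the files added in this PR.
    if (sys.env.contains("SPARK_GENERATE_BENCHMARK_FILES")) {
      val file = new File(s"benchmarks/${getClass.getSimpleName.stripSuffix("$")}-results.txt")
      output = Some(new FileOutputStream(file))
    }
    runBenchmarkSuite(args)
    output.foreach(_.close())
  }
}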
60 changes: 60 additions & 0 deletions sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt
@@ -0,0 +1,60 @@
================================================================================================
Parquet writer benchmark
================================================================================================

Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
Parquet writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 2049 / 2120 7.7 130.2 1.0X
Output Single Double Column 2194 / 2203 7.2 139.5 0.9X
Output Int and String Column 5704 / 5715 2.8 362.7 0.4X
Output Partitions 3727 / 3856 4.2 237.0 0.5X
Output Buckets 5119 / 5361 3.1 325.4 0.4X


================================================================================================
ORC writer benchmark
================================================================================================

Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
ORC writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 1109 / 1116 14.2 70.5 1.0X
Output Single Double Column 1366 / 1378 11.5 86.8 0.8X
Output Int and String Column 5303 / 5318 3.0 337.2 0.2X
Output Partitions 3078 / 3472 5.1 195.7 0.4X
Output Buckets 4374 / 4398 3.6 278.1 0.3X


================================================================================================
JSON writer benchmark
================================================================================================

Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
JSON writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 1806 / 1813 8.7 114.8 1.0X
Output Single Double Column 2530 / 2538 6.2 160.8 0.7X
Output Int and String Column 5311 / 5344 3.0 337.6 0.3X
Output Partitions 3524 / 3580 4.5 224.1 0.5X
Output Buckets 4661 / 4723 3.4 296.3 0.4X


================================================================================================
CSV writer benchmark
================================================================================================

Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6
Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz
CSV writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 2863 / 2896 5.5 182.0 1.0X
Output Single Double Column 3233 / 3238 4.9 205.6 0.9X
Output Int and String Column 6805 / 6822 2.3 432.6 0.4X
Output Partitions 4873 / 4886 3.2 309.8 0.6X
Output Buckets 6733 / 6757 2.3 428.1 0.4X


@@ -18,62 +18,40 @@ package org.apache.spark.sql.execution.benchmark

/**
* Benchmark to measure built-in data sources write performance.
* By default it measures 4 data source format: Parquet, ORC, JSON, CSV. Run it with spark-submit:
* spark-submit --class <this class> <spark sql test jar>
* Or with sbt:
* build/sbt "sql/test:runMain <this class>"
* To run this benchmark:
* {{{
* By default it measures 4 data source formats: Parquet, ORC, JSON, CSV.
* 1. without sbt: bin/spark-submit --class <this class>
* --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
* 2. build/sbt "sql/test:runMain <this class>"
* 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
* Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt".
*
* To measure specified formats, run it with arguments.
* 1. without sbt:
* bin/spark-submit --class <this class> <spark sql test jar> format1 [format2] [...]
* 2. build/sbt "sql/test:runMain <this class> format1 [format2] [...]"
* 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt
* "sql/test:runMain <this class> format1 [format2] [...]"
* Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt".
* }}}
*
* To measure specified formats, run it with arguments:
* spark-submit --class <this class> <spark sql test jar> format1 [format2] [...]
* Or with sbt:
* build/sbt "sql/test:runMain <this class> format1 [format2] [...]"
*/
object BuiltInDataSourceWriteBenchmark extends DataSourceWriteBenchmark {
def main(args: Array[String]): Unit = {
val formats: Seq[String] = if (args.isEmpty) {
override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
val formats: Seq[String] = if (mainArgs.isEmpty) {
Seq("Parquet", "ORC", "JSON", "CSV")
} else {
args
mainArgs
}

spark.conf.set("spark.sql.parquet.compression.codec", "snappy")
spark.conf.set("spark.sql.orc.compression.codec", "snappy")
/*
Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
Parquet writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 1815 / 1932 8.7 115.4 1.0X
Output Single Double Column 1877 / 1878 8.4 119.3 1.0X
Output Int and String Column 6265 / 6543 2.5 398.3 0.3X
Output Partitions 4067 / 4457 3.9 258.6 0.4X
Output Buckets 5608 / 5820 2.8 356.6 0.3X

ORC writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 1201 / 1239 13.1 76.3 1.0X
Output Single Double Column 1542 / 1600 10.2 98.0 0.8X
Output Int and String Column 6495 / 6580 2.4 412.9 0.2X
Output Partitions 3648 / 3842 4.3 231.9 0.3X
Output Buckets 5022 / 5145 3.1 319.3 0.2X

JSON writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 1988 / 2093 7.9 126.4 1.0X
Output Single Double Column 2854 / 2911 5.5 181.4 0.7X
Output Int and String Column 6467 / 6653 2.4 411.1 0.3X
Output Partitions 4548 / 5055 3.5 289.1 0.4X
Output Buckets 5664 / 5765 2.8 360.1 0.4X

CSV writer benchmark: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative
------------------------------------------------------------------------------------------------
Output Single Int Column 3025 / 3190 5.2 192.3 1.0X
Output Single Double Column 3575 / 3634 4.4 227.3 0.8X
Output Int and String Column 7313 / 7399 2.2 464.9 0.4X
Output Partitions 5105 / 5190 3.1 324.6 0.6X
Output Buckets 6986 / 6992 2.3 444.1 0.4X
*/
formats.foreach { format =>
runBenchmark(format)
runBenchmark(s"$format writer benchmark") {
runDataSourceBenchmark(format)
}
}
}
}
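
Each format now runs inside runBenchmark(s"$format writer benchmark") { ... }, which is what emits the ====-delimited section headers seen in BuiltInDataSourceWriteBenchmark-results.txt above. A sketch of that grouping helper, assuming it sits next to `output` in the base harness (an inference, not this PR's code):

def runBenchmark(benchmarkName: String)(func: => Any): Unit = {
  val separator = "=" * 96  // banner width guessed from the results file
  val banner = s"\n$separator\n$benchmarkName\n$separator\n"
  output match {
    case Some(out) => out.write(banner.getBytes)
    case None      => println(banner)
  }
  func  // run the measurement body under the banner
}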
@@ -16,18 +16,9 @@
*/
package org.apache.spark.sql.execution.benchmark

import org.apache.spark.SparkConf
import org.apache.spark.benchmark.Benchmark
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.internal.SQLConf

trait DataSourceWriteBenchmark {
val conf = new SparkConf()
.setAppName("DataSourceWriteBenchmark")
.setIfMissing("spark.master", "local[1]")
.set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")

val spark = SparkSession.builder.config(conf).getOrCreate()
trait DataSourceWriteBenchmark extends SqlBasedBenchmark {

val tempTable = "temp"
val numRows = 1024 * 1024 * 15
@@ -75,7 +66,7 @@ trait DataSourceWriteBenchmark {
}
}

def runBenchmark(format: String): Unit = {
def runDataSourceBenchmark(format: String): Unit = {
val tableInt = "tableInt"
val tableDouble = "tableDouble"
val tableIntString = "tableIntString"
@@ -84,7 +75,7 @@ trait DataSourceWriteBenchmark {
withTempTable(tempTable) {
spark.range(numRows).createOrReplaceTempView(tempTable)
withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) {
val benchmark = new Benchmark(s"$format writer benchmark", numRows)
val benchmark = new Benchmark(s"$format writer benchmark", numRows, output = output)
writeNumeric(tableInt, format, benchmark, "Int")
writeNumeric(tableDouble, format, benchmark, "Double")
writeIntString(tableIntString, format, benchmark)
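
With the hand-rolled SparkConf and SparkSession deleted, the trait now leans on SqlBasedBenchmark for its session and for the output stream handed to each Benchmark. A sketch of the assumed shape, where the session settings merely mirror the deleted defaults:

import org.apache.spark.sql.SparkSession

trait SqlBasedBenchmark extends BenchmarkBase {
  // Single-threaded local session, mirroring the removed
  // setIfMissing("spark.master", "local[1]") default.
  val spark: SparkSession = SparkSession.builder()
    .master("local[1]")
    .appName(getClass.getCanonicalName)
    .getOrCreate()
}

Passing output = output into new Benchmark(...) then routes each result table into the generated results file when SPARK_GENERATE_BENCHMARK_FILES is set, and to stdout otherwise.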