Commit 40037e6

Merge pull request apache#33 from sameeragarwal/fix

resolve merge conflicts in vectorized parquet reader

2 parents b0cd621 + 93f2cef

9 files changed: 84 additions & 382 deletions

sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/UnsafeRowParquetRecordReader.java
  renamed to sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java

Lines changed: 42 additions & 299 deletions
Large diffs are not rendered by default.

sql/core/src/main/scala/org/apache/spark/sql/execution/command/commands.scala

Lines changed: 9 additions & 0 deletions
@@ -160,6 +160,15 @@ case class SetCommand(kv: Option[(String, Option[String])]) extends RunnableCommand
         }
         (keyValueOutput, runFunc)
 
+      case Some((SQLConf.Deprecated.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED, Some(value))) =>
+        val runFunc = (sqlContext: SQLContext) => {
+          logWarning(
+            s"Property ${SQLConf.Deprecated.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED} is " +
+              s"deprecated and will be ignored. Vectorized parquet reader will be used instead.")
+          Seq(Row(SQLConf.PARQUET_VECTORIZED_READER_ENABLED, "true"))
+        }
+        (keyValueOutput, runFunc)
+
       // Configures a single property.
       case Some((key, Some(value))) =>
         val runFunc = (sqlContext: SQLContext) => {
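
With this branch in place, SET on the removed flag becomes a warn-and-redirect no-op. A minimal sketch of the new behavior, assuming an active SQLContext (the warning text is the one added in the hunk above):

  sqlContext.sql("SET spark.sql.parquet.enableUnsafeRowRecordReader=false")
  // Logs: "Property spark.sql.parquet.enableUnsafeRowRecordReader is deprecated and
  //        will be ignored. Vectorized parquet reader will be used instead."
  // Returns a single key/value row: (spark.sql.parquet.enableVectorizedReader, true)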

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/SqlNewHadoopRDD.scala

Lines changed: 8 additions & 12 deletions
@@ -33,9 +33,9 @@ import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.executor.DataReadMethod
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.SQLContext
-import org.apache.spark.sql.execution.datasources.parquet.UnsafeRowParquetRecordReader
+import org.apache.spark.sql.execution.datasources.parquet.VectorizedParquetRecordReader
+import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.{SerializableConfiguration, ShutdownHookManager}
 

@@ -99,8 +99,6 @@ private[spark] class SqlNewHadoopRDD[V: ClassTag](
 
   // If true, enable using the custom RecordReader for parquet. This only works for
   // a subset of the types (no complex types).
-  protected val enableUnsafeRowParquetReader: Boolean =
-    sqlContext.getConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key).toBoolean
   protected val enableVectorizedParquetReader: Boolean =
     sqlContext.getConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key).toBoolean
   protected val enableWholestageCodegen: Boolean =

@@ -174,19 +172,17 @@ private[spark] class SqlNewHadoopRDD[V: ClassTag](
        * fails (for example, unsupported schema), try with the normal reader.
        * TODO: plumb this through a different way?
        */
-      if (enableUnsafeRowParquetReader &&
+      if (enableVectorizedParquetReader &&
           format.getClass.getName == "org.apache.parquet.hadoop.ParquetInputFormat") {
-        val parquetReader: UnsafeRowParquetRecordReader = new UnsafeRowParquetRecordReader()
+        val parquetReader: VectorizedParquetRecordReader = new VectorizedParquetRecordReader()
         if (!parquetReader.tryInitialize(
             split.serializableHadoopSplit.value, hadoopAttemptContext)) {
           parquetReader.close()
         } else {
           reader = parquetReader.asInstanceOf[RecordReader[Void, V]]
-          if (enableVectorizedParquetReader) {
-            parquetReader.resultBatch()
-            // Whole stage codegen (PhysicalRDD) is able to deal with batches directly
-            if (enableWholestageCodegen) parquetReader.enableReturningBatches();
-          }
+          parquetReader.resultBatch()
+          // Whole stage codegen (PhysicalRDD) is able to deal with batches directly
+          if (enableWholestageCodegen) parquetReader.enableReturningBatches()
         }
       }
 

@@ -203,7 +199,7 @@ private[spark] class SqlNewHadoopRDD[V: ClassTag](
       private[this] var finished = false
 
       override def hasNext: Boolean = {
-        if (context.isInterrupted) {
+        if (context.isInterrupted()) {
           throw new TaskKilledException
         }
         if (!finished && !havePair) {
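
Taken together, these hunks leave a single code path: try the vectorized reader first and silently fall back when initialization fails. A condensed sketch of the resulting logic; the names follow the hunks above, but the final fallback to the InputFormat's own reader happens in surrounding code not shown here, so that call is an assumption:

  var reader: RecordReader[Void, V] = null
  if (enableVectorizedParquetReader &&
      format.getClass.getName == "org.apache.parquet.hadoop.ParquetInputFormat") {
    val parquetReader = new VectorizedParquetRecordReader()
    if (!parquetReader.tryInitialize(
        split.serializableHadoopSplit.value, hadoopAttemptContext)) {
      parquetReader.close()  // e.g. unsupported schema: use the normal reader instead
    } else {
      reader = parquetReader.asInstanceOf[RecordReader[Void, V]]
      parquetReader.resultBatch()  // materialize the reusable ColumnarBatch
      // Whole stage codegen (PhysicalRDD) can consume batches directly
      if (enableWholestageCodegen) parquetReader.enableReturningBatches()
    }
  }
  if (reader == null) {
    // assumed fallback, from code outside this hunk: the stock reader from the InputFormat
    reader = format.createRecordReader(split.serializableHadoopSplit.value, hadoopAttemptContext)
  }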

sql/core/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 1 addition & 5 deletions
@@ -350,11 +350,6 @@ object SQLConf {
     "option must be set in Hadoop Configuration. 2. This option overrides " +
     "\"spark.sql.sources.outputCommitterClass\".")
 
-  val PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED = booleanConf(
-    key = "spark.sql.parquet.enableUnsafeRowRecordReader",
-    defaultValue = Some(true),
-    doc = "Enables using the custom ParquetUnsafeRowRecordReader.")
-
   val PARQUET_VECTORIZED_READER_ENABLED = booleanConf(
     key = "spark.sql.parquet.enableVectorizedReader",
     defaultValue = Some(true),

@@ -532,6 +527,7 @@ object SQLConf {
     val CODEGEN_ENABLED = "spark.sql.codegen"
     val UNSAFE_ENABLED = "spark.sql.unsafe.enabled"
     val SORTMERGE_JOIN = "spark.sql.planner.sortMergeJoin"
+    val PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED = "spark.sql.parquet.enableUnsafeRowRecordReader"
   }
 }
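
After this change the vectorized reader is controlled by a single switch, spark.sql.parquet.enableVectorizedReader, which defaults to true. A small usage sketch, assuming a SQLContext in scope (this is the same pattern the test suites below apply via withSQLConf):

  // Fall back to the non-vectorized, row-at-a-time Parquet reader:
  sqlContext.setConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "false")
  // ... run queries ...
  // Restore the default:
  sqlContext.setConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key, "true")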

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetEncodingSuite.scala

Lines changed: 2 additions & 2 deletions
@@ -36,7 +36,7 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSQLContext
       List.fill(n)(ROW).toDF.repartition(1).write.parquet(dir.getCanonicalPath)
       val file = SpecificParquetRecordReaderBase.listDirectory(dir).toArray.head
 
-      val reader = new UnsafeRowParquetRecordReader
+      val reader = new VectorizedParquetRecordReader
       reader.initialize(file.asInstanceOf[String], null)
       val batch = reader.resultBatch()
       assert(reader.nextBatch())

@@ -61,7 +61,7 @@ class ParquetEncodingSuite extends ParquetCompatibilityTest with SharedSQLContext
       data.repartition(1).write.parquet(dir.getCanonicalPath)
       val file = SpecificParquetRecordReaderBase.listDirectory(dir).toArray.head
 
-      val reader = new UnsafeRowParquetRecordReader
+      val reader = new VectorizedParquetRecordReader
       reader.initialize(file.asInstanceOf[String], null)
       val batch = reader.resultBatch()
       assert(reader.nextBatch())
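
For reference, the direct-read API these tests exercise: initialize the reader with a file path and an optional projection, obtain the reusable ColumnarBatch once via resultBatch(), then call nextBatch() until it returns false. A minimal sketch; batch.numRows() is assumed from the ColumnarBatch API of this vintage:

  val reader = new VectorizedParquetRecordReader
  try {
    reader.initialize(file.asInstanceOf[String], null)  // null = read all columns
    val batch = reader.resultBatch()                    // reused across nextBatch() calls
    var totalRows = 0L
    while (reader.nextBatch()) {                        // refills `batch` in place
      totalRows += batch.numRows()
    }
  } finally {
    reader.close()
  }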

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala

Lines changed: 3 additions & 3 deletions
@@ -57,7 +57,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext
     val output = predicate.collect { case a: Attribute => a }.distinct
 
     withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
-      withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> "false") {
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
         val query = df
           .select(output.map(e => Column(e)): _*)
           .where(Column(predicate))

@@ -446,7 +446,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext
   test("SPARK-11661 Still pushdown filters returned by unhandledFilters") {
     import testImplicits._
     withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
-      withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> "false") {
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
         withTempPath { dir =>
           val path = s"${dir.getCanonicalPath}/part=1"
           (1 to 3).map(i => (i, i.toString)).toDF("a", "b").write.parquet(path)

@@ -520,7 +520,7 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContext
   test("SPARK-11164: test the parquet filter in") {
     import testImplicits._
     withSQLConf(SQLConf.PARQUET_FILTER_PUSHDOWN_ENABLED.key -> "true") {
-      withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> "false") {
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
         withTempPath { dir =>
           val path = s"${dir.getCanonicalPath}/table1"
           (1 to 5).map(i => (i.toFloat, i%3)).toDF("a", "b").write.parquet(path)

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetIOSuite.scala

Lines changed: 7 additions & 7 deletions
@@ -656,7 +656,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
       var hash1: Int = 0
       var hash2: Int = 0
       (false :: true :: Nil).foreach { v =>
-        withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> v.toString) {
+        withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> v.toString) {
           val df = sqlContext.read.parquet(dir.getCanonicalPath)
           val rows = df.queryExecution.toRdd.map(_.copy()).collect()
           val unsafeRows = rows.map(_.asInstanceOf[UnsafeRow])

@@ -672,13 +672,13 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
     }
   }
 
-  test("UnsafeRowParquetRecordReader - direct path read") {
-    val data = (0 to 10).map(i => (i, ((i + 'a').toChar.toString)))
+  test("VectorizedParquetRecordReader - direct path read") {
+    val data = (0 to 10).map(i => (i, (i + 'a').toChar.toString))
     withTempPath { dir =>
       sqlContext.createDataFrame(data).repartition(1).write.parquet(dir.getCanonicalPath)
       val file = SpecificParquetRecordReaderBase.listDirectory(dir).get(0);
       {
-        val reader = new UnsafeRowParquetRecordReader
+        val reader = new VectorizedParquetRecordReader
         try {
           reader.initialize(file, null)
           val result = mutable.ArrayBuffer.empty[(Int, String)]

@@ -695,7 +695,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
 
       // Project just one column
       {
-        val reader = new UnsafeRowParquetRecordReader
+        val reader = new VectorizedParquetRecordReader
         try {
           reader.initialize(file, ("_2" :: Nil).asJava)
           val result = mutable.ArrayBuffer.empty[(String)]

@@ -711,7 +711,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
 
       // Project columns in opposite order
      {
-        val reader = new UnsafeRowParquetRecordReader
+        val reader = new VectorizedParquetRecordReader
         try {
           reader.initialize(file, ("_2" :: "_1" :: Nil).asJava)
           val result = mutable.ArrayBuffer.empty[(String, Int)]

@@ -728,7 +728,7 @@ class ParquetIOSuite extends QueryTest with ParquetTest with SharedSQLContext {
 
       // Empty projection
       {
-        val reader = new UnsafeRowParquetRecordReader
+        val reader = new VectorizedParquetRecordReader
         try {
           reader.initialize(file, List[String]().asJava)
           var result = 0

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetReadBenchmark.scala

Lines changed: 8 additions & 53 deletions
@@ -82,38 +82,17 @@ object ParquetReadBenchmark {
     }
 
     sqlBenchmark.addCase("SQL Parquet MR") { iter =>
-      withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> "false") {
-        sqlContext.sql("select sum(id) from tempTable").collect()
-      }
-    }
-
-    sqlBenchmark.addCase("SQL Parquet Non-Vectorized") { iter =>
       withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
         sqlContext.sql("select sum(id) from tempTable").collect()
       }
     }
 
     val files = SpecificParquetRecordReaderBase.listDirectory(dir).toArray
-    // Driving the parquet reader directly without Spark.
-    parquetReaderBenchmark.addCase("ParquetReader Non-Vectorized") { num =>
-      var sum = 0L
-      files.map(_.asInstanceOf[String]).foreach { p =>
-        val reader = new UnsafeRowParquetRecordReader
-        reader.initialize(p, ("id" :: Nil).asJava)
-
-        while (reader.nextKeyValue()) {
-          val record = reader.getCurrentValue.asInstanceOf[InternalRow]
-          if (!record.isNullAt(0)) sum += record.getInt(0)
-        }
-        reader.close()
-      }
-    }
-
     // Driving the parquet reader in batch mode directly.
     parquetReaderBenchmark.addCase("ParquetReader Vectorized") { num =>
       var sum = 0L
       files.map(_.asInstanceOf[String]).foreach { p =>
-        val reader = new UnsafeRowParquetRecordReader
+        val reader = new VectorizedParquetRecordReader
         try {
           reader.initialize(p, ("id" :: Nil).asJava)
           val batch = reader.resultBatch()

@@ -136,7 +115,7 @@ object ParquetReadBenchmark {
     parquetReaderBenchmark.addCase("ParquetReader Vectorized -> Row") { num =>
       var sum = 0L
       files.map(_.asInstanceOf[String]).foreach { p =>
-        val reader = new UnsafeRowParquetRecordReader
+        val reader = new VectorizedParquetRecordReader
         try {
           reader.initialize(p, ("id" :: Nil).asJava)
           val batch = reader.resultBatch()

@@ -159,17 +138,15 @@ object ParquetReadBenchmark {
     -------------------------------------------------------------------------------------------
     SQL Parquet Vectorized                    215 /  262         73.0          13.7       1.0X
     SQL Parquet MR                           1946 / 2083          8.1         123.7       0.1X
-    SQL Parquet Non-Vectorized               1079 / 1213         14.6          68.6       0.2X
     */
     sqlBenchmark.run()
 
     /*
     Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
     Parquet Reader Single Int Column    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     -------------------------------------------------------------------------------------------
-    ParquetReader Non-Vectorized              610 /  737         25.8          38.8       1.0X
-    ParquetReader Vectorized                  123 /  152        127.8           7.8       5.0X
-    ParquetReader Vectorized -> Row           165 /  180         95.2          10.5       3.7X
+    ParquetReader Vectorized                  123 /  152        127.8           7.8       1.0X
+    ParquetReader Vectorized -> Row           165 /  180         95.2          10.5       0.7X
     */
     parquetReaderBenchmark.run()
   }

@@ -191,41 +168,19 @@ object ParquetReadBenchmark {
     }
 
     benchmark.addCase("SQL Parquet MR") { iter =>
-      withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> "false") {
-        sqlContext.sql("select sum(c1), sum(length(c2)) from tempTable").collect
-      }
-    }
-
-    benchmark.addCase("SQL Parquet Non-vectorized") { iter =>
       withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
         sqlContext.sql("select sum(c1), sum(length(c2)) from tempTable").collect
       }
     }
 
     val files = SpecificParquetRecordReaderBase.listDirectory(dir).toArray
-    benchmark.addCase("ParquetReader Non-vectorized") { num =>
-      var sum1 = 0L
-      var sum2 = 0L
-      files.map(_.asInstanceOf[String]).foreach { p =>
-        val reader = new UnsafeRowParquetRecordReader
-        reader.initialize(p, null)
-        while (reader.nextKeyValue()) {
-          val record = reader.getCurrentValue.asInstanceOf[InternalRow]
-          if (!record.isNullAt(0)) sum1 += record.getInt(0)
-          if (!record.isNullAt(1)) sum2 += record.getUTF8String(1).numBytes()
-        }
-        reader.close()
-      }
-    }
 
     /*
     Intel(R) Core(TM) i7-4870HQ CPU @ 2.50GHz
     Int and String Scan:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
     -------------------------------------------------------------------------------------------
     SQL Parquet Vectorized                    628 /  720         16.7          59.9       1.0X
     SQL Parquet MR                           1905 / 2239          5.5         181.7       0.3X
-    SQL Parquet Non-vectorized               1429 / 1732          7.3         136.3       0.4X
-    ParquetReader Non-vectorized              989 / 1357         10.6          94.3       0.6X
     */
     benchmark.run()
   }

@@ -247,7 +202,7 @@ object ParquetReadBenchmark {
     }
 
     benchmark.addCase("SQL Parquet MR") { iter =>
-      withSQLConf(SQLConf.PARQUET_UNSAFE_ROW_RECORD_READER_ENABLED.key -> "false") {
+      withSQLConf(SQLConf.PARQUET_VECTORIZED_READER_ENABLED.key -> "false") {
         sqlContext.sql("select sum(length(c1)) from tempTable").collect
       }
     }

@@ -293,7 +248,7 @@ object ParquetReadBenchmark {
     Read data column                          191 /  250         82.1          12.2       1.0X
     Read partition column                      82 /   86        192.4           5.2       2.3X
     Read both columns                         220 /  248         71.5          14.0       0.9X
-      */
+    */
     benchmark.run()
   }
 }

@@ -319,7 +274,7 @@ object ParquetReadBenchmark {
     benchmark.addCase("PR Vectorized") { num =>
       var sum = 0
       files.map(_.asInstanceOf[String]).foreach { p =>
-        val reader = new UnsafeRowParquetRecordReader
+        val reader = new VectorizedParquetRecordReader
         try {
           reader.initialize(p, ("c1" :: "c2" :: Nil).asJava)
           val batch = reader.resultBatch()

@@ -340,7 +295,7 @@ object ParquetReadBenchmark {
     benchmark.addCase("PR Vectorized (Null Filtering)") { num =>
      var sum = 0L
       files.map(_.asInstanceOf[String]).foreach { p =>
-        val reader = new UnsafeRowParquetRecordReader
+        val reader = new VectorizedParquetRecordReader
         try {
           reader.initialize(p, ("c1" :: "c2" :: Nil).asJava)
           val batch = reader.resultBatch()
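
The "Vectorized -> Row" cases above measure the cost of flattening each ColumnarBatch back into rows. A sketch of that consumption pattern; batch.rowIterator() is assumed from the ColumnarBatch API of this vintage, and `p` stands for one parquet file path as in the benchmark:

  import scala.collection.JavaConverters._

  val reader = new VectorizedParquetRecordReader
  try {
    reader.initialize(p, ("id" :: Nil).asJava)  // project only the "id" column
    val batch = reader.resultBatch()
    var sum = 0L
    while (reader.nextBatch()) {
      val it = batch.rowIterator()              // row view over the current batch
      while (it.hasNext) {
        val row = it.next()
        if (!row.isNullAt(0)) sum += row.getInt(0)
      }
    }
  } finally {
    reader.close()
  }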

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/HiveComparisonTest.scala

Lines changed: 4 additions & 1 deletion
@@ -409,7 +409,10 @@ abstract class HiveComparisonTest
           }
 
           try {
-            new TestHive.QueryExecution(convertedSQL)
+            val queryExecution = new TestHive.QueryExecution(convertedSQL)
+            // Trigger the analysis of this converted SQL query.
+            queryExecution.analyzed
+            queryExecution
           } catch {
             case NonFatal(e) => fail(
               s"""Failed to analyze the converted SQL string:
