-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-28213][SQL] Replace ColumnarBatchScan with equivalent from Columnar #25008
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
8c285e5
1d70026
a9e8aea
86dd5a0
2cce2fa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,10 +37,11 @@ import org.apache.spark.sql.execution.datasources.parquet.{ParquetFileFormat => | |
| import org.apache.spark.sql.execution.metric.SQLMetrics | ||
| import org.apache.spark.sql.sources.{BaseRelation, Filter} | ||
| import org.apache.spark.sql.types.StructType | ||
| import org.apache.spark.sql.vectorized.ColumnarBatch | ||
| import org.apache.spark.util.Utils | ||
| import org.apache.spark.util.collection.BitSet | ||
|
|
||
| trait DataSourceScanExec extends LeafExecNode with CodegenSupport { | ||
| trait DataSourceScanExec extends LeafExecNode { | ||
| val relation: BaseRelation | ||
| val tableIdentifier: Option[TableIdentifier] | ||
|
|
||
|
|
@@ -69,6 +70,12 @@ trait DataSourceScanExec extends LeafExecNode with CodegenSupport { | |
| private def redact(text: String): String = { | ||
| Utils.redact(sqlContext.sessionState.conf.stringRedactionPattern, text) | ||
| } | ||
|
|
||
| /** | ||
| * The data being read in. This is to provide input to the tests in a way compatible with | ||
| * [[InputRDDCodegen]] which all implementations used to extend. | ||
| */ | ||
| def inputRDDs(): Seq[RDD[InternalRow]] | ||
| } | ||
|
|
||
| /** Physical plan node for scanning data from a relation. */ | ||
|
|
@@ -141,11 +148,11 @@ case class FileSourceScanExec( | |
| optionalBucketSet: Option[BitSet], | ||
| dataFilters: Seq[Expression], | ||
| override val tableIdentifier: Option[TableIdentifier]) | ||
| extends DataSourceScanExec with ColumnarBatchScan { | ||
| extends DataSourceScanExec { | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this change making all
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Correct, but there were only 2 things that the code gen was doing. Either convert ColumnarBatch into UnsafeRows or to convert whatever other rows were being returned by the DataSourceScanExec into UnsafeRows. The ColumnarBatch conversion is now covered by ColumnarToRowExec. The row to row conversion is covered by UnsafeProjections that are either inserted as a part of this patch or were already in the code, so we ended up doing a double conversion. |
||
|
|
||
| // Note that some vals referring the file-based relation are lazy intentionally | ||
| // so that this plan can be canonicalized on executor side too. See SPARK-23731. | ||
| override lazy val supportsBatch: Boolean = { | ||
| override lazy val supportsColumnar: Boolean = { | ||
| relation.fileFormat.supportBatch(relation.sparkSession, schema) | ||
| } | ||
|
|
||
|
|
@@ -275,7 +282,7 @@ case class FileSourceScanExec( | |
| Map( | ||
| "Format" -> relation.fileFormat.toString, | ||
| "ReadSchema" -> requiredSchema.catalogString, | ||
| "Batched" -> supportsBatch.toString, | ||
| "Batched" -> supportsColumnar.toString, | ||
| "PartitionFilters" -> seqToString(partitionFilters), | ||
| "PushedFilters" -> seqToString(pushedDownFilters), | ||
| "DataFilters" -> seqToString(dataFilters), | ||
|
|
@@ -302,7 +309,7 @@ case class FileSourceScanExec( | |
| withSelectedBucketsCount | ||
| } | ||
|
|
||
| private lazy val inputRDD: RDD[InternalRow] = { | ||
| lazy val inputRDD: RDD[InternalRow] = { | ||
| val readFile: (PartitionedFile) => Iterator[InternalRow] = | ||
| relation.fileFormat.buildReaderWithPartitionValues( | ||
| sparkSession = relation.sparkSession, | ||
|
|
@@ -334,29 +341,30 @@ case class FileSourceScanExec( | |
| "scanTime" -> SQLMetrics.createTimingMetric(sparkContext, "scan time")) | ||
|
|
||
| protected override def doExecute(): RDD[InternalRow] = { | ||
| if (supportsBatch) { | ||
| // in the case of fallback, this batched scan should never fail because of: | ||
| // 1) only primitive types are supported | ||
| // 2) the number of columns should be smaller than spark.sql.codegen.maxFields | ||
| WholeStageCodegenExec(this)(codegenStageId = 0).execute() | ||
| } else { | ||
| val numOutputRows = longMetric("numOutputRows") | ||
|
|
||
| if (needsUnsafeRowConversion) { | ||
| inputRDD.mapPartitionsWithIndexInternal { (index, iter) => | ||
| val proj = UnsafeProjection.create(schema) | ||
| proj.initialize(index) | ||
| iter.map( r => { | ||
| numOutputRows += 1 | ||
| proj(r) | ||
| }) | ||
| } | ||
| } else { | ||
| inputRDD.map { r => | ||
| val numOutputRows = longMetric("numOutputRows") | ||
|
|
||
| if (needsUnsafeRowConversion) { | ||
| inputRDD.mapPartitionsWithIndexInternal { (index, iter) => | ||
| val proj = UnsafeProjection.create(schema) | ||
| proj.initialize(index) | ||
| iter.map( r => { | ||
| numOutputRows += 1 | ||
| r | ||
| } | ||
| proj(r) | ||
| }) | ||
| } | ||
| } else { | ||
| inputRDD.map { r => | ||
| numOutputRows += 1 | ||
| r | ||
| } | ||
| } | ||
| } | ||
|
|
||
| protected override def doExecuteColumnar(): RDD[ColumnarBatch] = { | ||
| val numOutputRows = longMetric("numOutputRows") | ||
| inputRDD.asInstanceOf[RDD[ColumnarBatch]].map { batch => | ||
| numOutputRows += batch.numRows() | ||
| batch | ||
| } | ||
| } | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ColumnarToRowExec's comment also mentions ColumnarBatchScan. If you would like to remove all references to ColumnarBatchScan... There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Great catch. I thought I got rid of all of them. Will grep through again.