[SPARK-24991][SQL] use InternalRow in DataSourceWriter #21948
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 2 commits
```diff
@@ -50,4 +50,15 @@ public interface DataWriterFactory<T> extends Serializable {
    * this ID will always be 0.
    */
   DataWriter<T> createDataWriter(int partitionId, long taskId, long epochId);
+
+  /**
+   * When true, Spark will reuse the same data object instance when sending data to the data writer,
+   * for better performance. Data writers should carefully handle the data objects if it's reused,
+   * e.g. do not buffer the data objects in a list. By default it returns false for safety, data
+   * sources can override it if their data writers immediately write the data object to somewhere
+   * else like a memory buffer or disk.
+   */
+  default boolean reuseDataObject() {
+    return false;
+  }
 }
```
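To illustrate the contract `reuseDataObject()` adds: a factory may opt in only if its writer fully consumes each row inside `write()` before returning. Below is a minimal sketch, not part of this PR (`LongColumnWriterFactory` and `LongColumnCommitted` are hypothetical names), of a writer that extracts the column value immediately and so can safely return `true`:

```scala
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataWriter, DataWriterFactory, WriterCommitMessage}

// Hypothetical commit message type, for illustration only.
case object LongColumnCommitted extends WriterCommitMessage

class LongColumnWriterFactory extends DataWriterFactory[InternalRow] {

  // Safe to opt in: the writer below copies the value out of the row inside
  // write(), so it never keeps a reference to the (possibly reused) row object.
  override def reuseDataObject(): Boolean = true

  override def createDataWriter(
      partitionId: Int,
      taskId: Long,
      epochId: Long): DataWriter[InternalRow] = new DataWriter[InternalRow] {

    // Buffers extracted values, never the incoming row objects themselves.
    private val values = ArrayBuffer.empty[Long]

    override def write(record: InternalRow): Unit = values += record.getLong(0)

    override def commit(): WriterCommitMessage = LongColumnCommitted

    override def abort(): Unit = values.clear()
  }
}
```

A writer that instead buffered the `InternalRow` objects themselves would have to keep the default `false`, so that Spark copies each row before handing it over.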
This file was deleted.
```diff
@@ -50,11 +50,7 @@ case class WriteToDataSourceV2Exec(writer: DataSourceWriter, query: SparkPlan) e
   override def output: Seq[Attribute] = Nil

   override protected def doExecute(): RDD[InternalRow] = {
-    val writeTask = writer match {
-      case w: SupportsWriteInternalRow => w.createInternalRowWriterFactory()
-      case _ => new InternalRowDataWriterFactory(writer.createWriterFactory(), query.schema)
-    }
-
+    val writeTask = writer.createWriterFactory()
     val useCommitCoordinator = writer.useCommitCoordinator
     val rdd = query.execute()
     val messages = new Array[WriterCommitMessage](rdd.partitions.length)
```
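With `SupportsWriteInternalRow` gone, `createWriterFactory()` itself must hand back a `DataWriterFactory[InternalRow]`, as the simplified `doExecute` implies. A hypothetical end-to-end sketch, reusing the `LongColumnWriterFactory` from the previous example:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataSourceWriter, DataWriterFactory, WriterCommitMessage}

// Hypothetical source writer, for illustration: no Row-based variant and no
// separate InternalRow capability interface, just one InternalRow factory.
class LongColumnSourceWriter extends DataSourceWriter {

  override def createWriterFactory(): DataWriterFactory[InternalRow] =
    new LongColumnWriterFactory

  override def commit(messages: Array[WriterCommitMessage]): Unit = ()

  override def abort(messages: Array[WriterCommitMessage]): Unit = ()
}
```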
```diff
@@ -113,11 +109,15 @@ object DataWritingSparkTask extends Logging {
     val attemptId = context.attemptNumber()
     val epochId = Option(context.getLocalProperty(MicroBatchExecution.BATCH_ID_KEY)).getOrElse("0")
     val dataWriter = writeTask.createDataWriter(partId, taskId, epochId.toLong)
+    val copyIfNeeded: InternalRow => InternalRow =
+      if (writeTask.reuseDataObject()) identity else _.copy()

     // write the data and commit this writer.
     Utils.tryWithSafeFinallyAndFailureCallbacks(block = {
       while (iter.hasNext) {
-        dataWriter.write(iter.next())
+        // Internally Spark reuse the same UnsafeRow instance when producing output rows, here we
+        // copy it to avoid troubles at data source side.
+        dataWriter.write(copyIfNeeded(iter.next()))
       }

       val msg = if (useCommitCoordinator) {
```
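The `copyIfNeeded` guard exists because Spark's operators often mutate and re-emit a single `UnsafeRow`. A standalone sketch of the failure mode it prevents, using `GenericInternalRow` to stand in for the reused row:

```scala
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow

// A toy iterator that mutates and re-emits one shared row, the way Spark
// reuses a single UnsafeRow instance when producing output rows.
val shared = new GenericInternalRow(Array[Any](0L))
def rows: Iterator[InternalRow] = (1L to 3L).iterator.map { v =>
  shared.update(0, v)
  shared // same object every time, only its contents change
}

// Buffering the objects directly keeps three references to the same row,
// so every entry ends up reflecting the last value written.
val wrong = rows.toVector.map(_.getLong(0))                // Vector(3, 3, 3)

// Copying first, which is what copyIfNeeded does unless the factory opts
// into reuse, snapshots each row's contents.
val right = rows.map(_.copy()).toVector.map(_.getLong(0))  // Vector(1, 2, 3)
```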
```diff
@@ -155,27 +155,3 @@ object DataWritingSparkTask extends Logging {
     })
   }
 }
-
-class InternalRowDataWriterFactory(
-    rowWriterFactory: DataWriterFactory[Row],
-    schema: StructType) extends DataWriterFactory[InternalRow] {
-
-  override def createDataWriter(
-      partitionId: Int,
-      taskId: Long,
-      epochId: Long): DataWriter[InternalRow] = {
-    new InternalRowDataWriter(
-      rowWriterFactory.createDataWriter(partitionId, taskId, epochId),
-      RowEncoder.apply(schema).resolveAndBind())
-  }
-}
-
-class InternalRowDataWriter(rowWriter: DataWriter[Row], encoder: ExpressionEncoder[Row])
-  extends DataWriter[InternalRow] {
-
-  override def write(record: InternalRow): Unit = rowWriter.write(encoder.fromRow(record))
-
-  override def commit(): WriterCommitMessage = rowWriter.commit()
-
-  override def abort(): Unit = rowWriter.abort()
-}
```
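With this bridge deleted, a source that used to receive `Row` values must read the catalyst representation itself rather than rely on an encoder round-trip. A hypothetical migration sketch (`NameListWriter` is not part of this PR); note that string columns arrive as `UTF8String`:

```scala
import scala.collection.mutable.ArrayBuffer

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.v2.writer.{DataWriter, WriterCommitMessage}

// Hypothetical writer that previously called row.getString(0) on a Row.
class NameListWriter extends DataWriter[InternalRow] {

  private val names = ArrayBuffer.empty[String]

  override def write(record: InternalRow): Unit = {
    // getUTF8String returns catalyst's UTF8String; toString materializes a
    // java.lang.String, so nothing here retains the incoming row object.
    names += record.getUTF8String(0).toString
  }

  override def commit(): WriterCommitMessage = new WriterCommitMessage {}

  override def abort(): Unit = names.clear()
}
```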
```diff
@@ -89,8 +89,7 @@ class RateStreamContinuousReader(options: DataSourceOptions) extends ContinuousR
         start.runTimeMs,
         i,
         numPartitions,
-        perPartitionRate)
-        .asInstanceOf[InputPartition[InternalRow]]
+        perPartitionRate): InputPartition[InternalRow]

     }.asJava
   }
```
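This last hunk replaces a runtime `asInstanceOf` cast with a type ascription, a compile-time checked upcast. A standalone sketch of the difference, with hypothetical `Partition` types:

```scala
trait Partition[T]
class IntPartition extends Partition[Int]

// Type ascription: a compile-checked upcast, so the collection's element
// type is inferred as Partition[Int] with no runtime cast involved.
val viaAscription = Seq(1, 2).map { _ => (new IntPartition): Partition[Int] }

// asInstanceOf compiles regardless of the types involved and only fails
// when evaluated, e.g. this line would throw ClassCastException at runtime:
// val bogus = "not a partition".asInstanceOf[Partition[Int]]
```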
Review comment on the `reuseDataObject()` documentation:

Nit: in `if it's reused`, the `it` is ambiguous. Maybe change to `if they are reused`?