Closed · Changes from 1 commit

46 commits:
c281373  [SPARK-29248][SQL] Pass in number of partitions to WriteBuilder (edrevo, Sep 27, 2019)
903d863  move numpartitions to physicalbuildinfo (edrevo, Nov 15, 2019)
c6b95f5  Merge branch 'master' of https://github.com/apache/spark into add-par… (edrevo, Nov 15, 2019)
30c8800  fixes (edrevo, Nov 15, 2019)
15096b2  fixes (edrevo, Nov 15, 2019)
389afee  lint fixes (edrevo, Nov 15, 2019)
4f10e54  [SPARK-29655][SQL] Read bucketed tables obeys spark.sql.shuffle.parti… (wangyum, Nov 15, 2019)
ca4e894  PR feedback (edrevo, Nov 15, 2019)
16e3c4a  more pr feedback (edrevo, Nov 15, 2019)
ee4784b  [SPARK-26499][SQL][FOLLOW-UP] Replace `update` with `setByte` for Byt… (maropu, Nov 15, 2019)
1521889  [SPARK-29902][DOC][MINOR] Add listener event queue capacity configura… (shahidki31, Nov 15, 2019)
848bdfa  [SPARK-29829][SQL] SHOW TABLE EXTENDED should do multi-catalog resolu… (planga82, Nov 15, 2019)
c0507e0  [SPARK-29833][YARN] Add FileNotFoundException check for spark.yarn.jars (ulysses-you, Nov 16, 2019)
7720781  [SPARK-29127][SQL][PYTHON] Add a clue for Python related version info… (HyukjinKwon, Nov 16, 2019)
16e7195  [SPARK-29834][SQL] DESC DATABASE should look up catalog like v2 commands (fuwhu, Nov 16, 2019)
6d6b233  [SPARK-29343][SQL][FOLLOW-UP] Remove floating-point Sum/Average/Centr… (maropu, Nov 16, 2019)
1112fc6  [SPARK-29867][ML][PYTHON] Add __repr__ in Python ML Models (huaxingao, Nov 16, 2019)
f77c10d  [SPARK-29923][SQL][TESTS] Set io.netty.tryReflectionSetAccessible for… (dongjoon-hyun, Nov 16, 2019)
40ea4a1  [SPARK-29807][SQL] Rename "spark.sql.ansi.enabled" to "spark.sql.dial… (xuanyuanking, Nov 16, 2019)
d0470d6  [MINOR][TESTS] Ignore GitHub Action and AppVeyor file changes in testing (dongjoon-hyun, Nov 16, 2019)
5336473  [SPARK-29476][WEBUI] add tooltip for Thread (PavithraRamachandran, Nov 16, 2019)
e88267c  [SPARK-29928][SQL][TESTS] Check parsing timestamps up to microsecond … (MaxGekk, Nov 17, 2019)
cc12cf6  [SPARK-29378][R] Upgrade SparkR to use Arrow 0.15 API (dongjoon-hyun, Nov 17, 2019)
388a737  [SPARK-29858][SQL] ALTER DATABASE (SET DBPROPERTIES) should look up c… (fuwhu, Nov 17, 2019)
a9959be  [SPARK-29456][WEBUI] Improve tooltip for Session Statistics Table col… (PavithraRamachandran, Nov 17, 2019)
e1fc38b  [SPARK-29932][R][TESTS] lint-r should do non-zero exit in case of errors (dongjoon-hyun, Nov 17, 2019)
5eb8973  [SPARK-29930][SQL] Remove SQL configs declared to be removed in Spark… (MaxGekk, Nov 17, 2019)
c5f644c  [SPARK-16872][ML][PYSPARK] Impl Gaussian Naive Bayes Classifier (zhengruifeng, Nov 18, 2019)
d83cacf  [SPARK-29907][SQL] Move DELETE/UPDATE/MERGE relative rules to dmlStat… (Nov 18, 2019)
f280c6a  [SPARK-29378][R][FOLLOW-UP] Remove manual installation of Arrow depen… (HyukjinKwon, Nov 18, 2019)
42f8f79  [SPARK-29936][R] Fix SparkR lint errors and add lint-r GitHub Action (dongjoon-hyun, Nov 18, 2019)
9ff8ac7  javadoc fixes (edrevo, Nov 18, 2019)
ee3bd6d  [SPARK-25694][SQL] Add a config for `URL.setURLStreamHandlerFactory` (jiangzho, Nov 18, 2019)
7391237  [SPARK-29020][SQL] Improving array_sort behaviour (Nov 18, 2019)
5cebe58  [SPARK-29783][SQL] Support SQL Standard/ISO_8601 output style for int… (yaooqinn, Nov 18, 2019)
50f6d93  [SPARK-29870][SQL] Unify the logic of multi-units interval string to … (yaooqinn, Nov 18, 2019)
c32e228  [SPARK-29859][SQL] ALTER DATABASE (SET LOCATION) should look up catal… (fuwhu, Nov 18, 2019)
ae6b711  [SPARK-29941][SQL] Add ansi type aliases for char and decimal (yaooqinn, Nov 18, 2019)
ea010a2  [SPARK-29873][SQL][TEST][FOLLOWUP] set operations should not escape w… (yaooqinn, Nov 18, 2019)
9514b82  [SPARK-29777][SPARKR] SparkR::cleanClosure aggressively removes a fun… (falaki, Nov 19, 2019)
8469614  [SPARK-25694][SQL][FOLLOW-UP] Move 'spark.sql.defaultUrlStreamHandler… (HyukjinKwon, Nov 19, 2019)
882f54b  [SPARK-29870][SQL][FOLLOW-UP] Keep CalendarInterval's toString (HyukjinKwon, Nov 19, 2019)
28a502c  [SPARK-28527][FOLLOW-UP][SQL][TEST] Add guides for ThriftServerQueryT… (wangyum, Nov 19, 2019)
a834dba  Revert "[SPARK-29644][SQL] Corrected ShortType and ByteType mapping t… (shivsood, Nov 19, 2019)
3d45779  [SPARK-29728][SQL] Datasource V2: Support ALTER TABLE RENAME TO (imback82, Nov 19, 2019)
118d81f  [SPARK-29248][SQL] Add PhysicalWriteInfo with number of partitions (edrevo, Nov 19, 2019)

Files changed:
@@ -22,7 +22,7 @@ import org.apache.hadoop.fs.FileStatus

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro.AvroUtils
import org.apache.spark.sql.connector.write.WriteBuilder
import org.apache.spark.sql.connector.write.{WriteBuilder, WriteInfo}
import org.apache.spark.sql.execution.datasources.FileFormat
import org.apache.spark.sql.execution.datasources.v2.FileTable
import org.apache.spark.sql.types.{DataType, StructType}
@@ -42,8 +42,10 @@ case class AvroTable(
override def inferSchema(files: Seq[FileStatus]): Option[StructType] =
AvroUtils.inferSchema(sparkSession, options.asScala.toMap, files)

override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder =
new AvroWriteBuilder(options, paths, formatName, supportsDataType)
override def newWriteBuilder(options: CaseInsensitiveStringMap,
writeInfo: WriteInfo): WriteBuilder =
new AvroWriteBuilder(
options, paths, formatName, supportsDataType, writeInfo)

override def supportsDataType(dataType: DataType): Boolean = AvroUtils.supportsDataType(dataType)

@@ -19,6 +19,7 @@ package org.apache.spark.sql.v2.avro
import org.apache.hadoop.mapreduce.Job

import org.apache.spark.sql.avro.AvroUtils
import org.apache.spark.sql.connector.write.WriteInfo
import org.apache.spark.sql.execution.datasources.OutputWriterFactory
import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder
import org.apache.spark.sql.internal.SQLConf
@@ -29,8 +30,9 @@ class AvroWriteBuilder(
options: CaseInsensitiveStringMap,
paths: Seq[String],
formatName: String,
supportsDataType: DataType => Boolean)
extends FileWriteBuilder(options, paths, formatName, supportsDataType) {
supportsDataType: DataType => Boolean,
writeInfo: WriteInfo)
extends FileWriteBuilder(options, paths, formatName, supportsDataType, writeInfo) {
override def prepareWrite(
sqlConf: SQLConf,
job: Job,
@@ -33,7 +33,7 @@ import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
import org.apache.spark.sql.connector.catalog.{SupportsRead, SupportsWrite, Table, TableCapability, TableProvider}
import org.apache.spark.sql.connector.read.{Batch, Scan, ScanBuilder}
import org.apache.spark.sql.connector.read.streaming.{ContinuousStream, MicroBatchStream}
import org.apache.spark.sql.connector.write.{BatchWrite, WriteBuilder}
import org.apache.spark.sql.connector.write.{BatchWrite, WriteBuilder, WriteInfo}
import org.apache.spark.sql.connector.write.streaming.StreamingWrite
import org.apache.spark.sql.execution.streaming.{Sink, Source}
import org.apache.spark.sql.sources._
@@ -392,18 +392,14 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister
override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder =
() => new KafkaScan(options)

override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = {
override def newWriteBuilder(options: CaseInsensitiveStringMap,
writeInfo: WriteInfo): WriteBuilder = {
new WriteBuilder {
private var inputSchema: StructType = _
private val inputSchema: StructType = writeInfo.schema()
private val topic = Option(options.get(TOPIC_OPTION_KEY)).map(_.trim)
private val producerParams =
kafkaParamsForProducer(CaseInsensitiveMap(options.asScala.toMap))

override def withInputDataSchema(schema: StructType): WriteBuilder = {
this.inputSchema = schema
this
}

override def buildForBatch(): BatchWrite = {
assert(inputSchema != null)
new KafkaBatchWrite(topic, producerParams, inputSchema)
@@ -21,6 +21,7 @@

import org.apache.spark.annotation.Experimental;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.connector.write.WriteInfo;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

@@ -32,10 +33,11 @@
* {@link StagingTableCatalog#stageCreate(Identifier, StructType, Transform[], Map)} or
* {@link StagingTableCatalog#stageReplace(Identifier, StructType, Transform[], Map)} to prepare the
* table for being written to. This table should usually implement {@link SupportsWrite}. A new
* writer will be constructed via {@link SupportsWrite#newWriteBuilder(CaseInsensitiveStringMap)},
* and the write will be committed. The job concludes with a call to {@link #commitStagedChanges()},
* at which point implementations are expected to commit the table's metadata into the metastore
* along with the data that was written by the writes from the write builder this table created.
* writer will be constructed via
* {@link SupportsWrite#newWriteBuilder(CaseInsensitiveStringMap, WriteInfo)}, and the write will
* be committed. The job concludes with a call to {@link #commitStagedChanges()}, at which point
* implementations are expected to commit the table's metadata into the metastore along with the
* data that was written by the writes from the write builder this table created.
*/
@Experimental
public interface StagedTable extends Table {
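The javadoc above lays out the staged-commit protocol only in prose. Below is a rough sketch of the CREATE TABLE AS SELECT flow it describes, assuming the two-argument newWriteBuilder from this PR; the stagedCtas helper and its error handling are illustrative, not code from the PR.

```scala
import org.apache.spark.sql.connector.catalog.{Identifier, StagedTable, StagingTableCatalog, SupportsWrite}
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.connector.write.WriteInfo
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

object StagedCtasSketch {
  def stagedCtas(
      catalog: StagingTableCatalog,
      ident: Identifier,
      schema: StructType,
      partitions: Array[Transform],
      properties: java.util.Map[String, String],
      options: CaseInsensitiveStringMap,
      info: WriteInfo): Unit = {
    // Stage the table: nothing is visible in the metastore yet.
    val staged: StagedTable = catalog.stageCreate(ident, schema, partitions, properties)
    try {
      // Build the write through the new two-argument entry point.
      val batchWrite = staged.asInstanceOf[SupportsWrite].newWriteBuilder(options, info).buildForBatch()
      // ... Spark runs the write job against batchWrite and commits it here ...
      // Publish the table metadata together with the written data.
      staged.commitStagedChanges()
    } catch {
      case e: Throwable =>
        staged.abortStagedChanges()
        throw e
    }
  }
}
```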
@@ -21,6 +21,7 @@

import org.apache.spark.annotation.Experimental;
import org.apache.spark.sql.connector.expressions.Transform;
import org.apache.spark.sql.connector.write.WriteInfo;
import org.apache.spark.sql.catalyst.analysis.NoSuchNamespaceException;
import org.apache.spark.sql.catalyst.analysis.NoSuchTableException;
import org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException;
@@ -39,9 +40,9 @@
* TABLE AS SELECT operation, if the catalog does not implement this trait, the planner will first
* drop the table via {@link TableCatalog#dropTable(Identifier)}, then create the table via
* {@link TableCatalog#createTable(Identifier, StructType, Transform[], Map)}, and then perform
* the write via {@link SupportsWrite#newWriteBuilder(CaseInsensitiveStringMap)}. However, if the
* write operation fails, the catalog will have already dropped the table, and the planner cannot
* roll back the dropping of the table.
* the write via {@link SupportsWrite#newWriteBuilder(CaseInsensitiveStringMap, WriteInfo)}.
* However, if the write operation fails, the catalog will have already dropped the table, and the
* planner cannot roll back the dropping of the table.
* <p>
* If the catalog implements this plugin, the catalog can implement the methods to "stage" the
* creation and the replacement of a table. After the table's
@@ -20,11 +20,12 @@
import org.apache.spark.annotation.Experimental;
import org.apache.spark.sql.connector.write.BatchWrite;
import org.apache.spark.sql.connector.write.WriteBuilder;
import org.apache.spark.sql.connector.write.WriteInfo;
import org.apache.spark.sql.util.CaseInsensitiveStringMap;

/**
* A mix-in interface of {@link Table}, to indicate that it's writable. This adds
* {@link #newWriteBuilder(CaseInsensitiveStringMap)} that is used to create a write
* {@link #newWriteBuilder(CaseInsensitiveStringMap, WriteInfo)} that is used to create a write
* for batch or streaming.
*/
@Experimental
@@ -34,5 +35,5 @@ public interface SupportsWrite extends Table {
* Returns a {@link WriteBuilder} which can be used to create {@link BatchWrite}. Spark will call
* this method to configure each data source write.
*/
WriteBuilder newWriteBuilder(CaseInsensitiveStringMap options);
WriteBuilder newWriteBuilder(CaseInsensitiveStringMap options, WriteInfo info);
}
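For connector authors the change above is one extra parameter: the input schema, query id, and partition count arrive when the builder is created instead of through builder callbacks. Here is a minimal sketch of a writable table against the proposed signature; SinglePartitionTable and its one-partition restriction are hypothetical, used only to show why knowing numPartitions this early can matter.

```scala
import java.util

import org.apache.spark.sql.connector.catalog.{SupportsWrite, TableCapability}
import org.apache.spark.sql.connector.write.{BatchWrite, WriteBuilder, WriteInfo}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

class SinglePartitionTable extends SupportsWrite {
  override def name(): String = "single-partition-table"
  override def schema(): StructType = new StructType()
  override def capabilities(): util.Set[TableCapability] =
    util.EnumSet.of(TableCapability.BATCH_WRITE)

  override def newWriteBuilder(
      options: CaseInsensitiveStringMap,
      info: WriteInfo): WriteBuilder = {
    // Schema, query id and partition count are all known up front;
    // no withInputDataSchema/withQueryId round trips are needed.
    require(info.numPartitions() == 1,
      s"expected a single input partition, got ${info.numPartitions()}")
    new WriteBuilder {
      override def buildForBatch(): BatchWrite =
        throw new UnsupportedOperationException("write path omitted in this sketch")
    }
  }
}
```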
@@ -33,28 +33,6 @@
@Evolving
public interface WriteBuilder {

/**
* Passes the `queryId` from Spark to data source. `queryId` is a unique string of the query. It's
* possible that there are many queries running at the same time, or a query is restarted and
* resumed. {@link BatchWrite} can use this id to identify the query.
*
* @return a new builder with the `queryId`. By default it returns `this`, which means the given
* `queryId` is ignored. Please override this method to take the `queryId`.
*/
default WriteBuilder withQueryId(String queryId) {
return this;
}

/**
* Passes the schema of the input data from Spark to data source.
*
* @return a new builder with the `schema`. By default it returns `this`, which means the given
* `schema` is ignored. Please override this method to take the `schema`.
*/
default WriteBuilder withInputDataSchema(StructType schema) {
return this;
}

/**
* Returns a {@link BatchWrite} to write data to batch source. By default this method throws
* exception, data sources must overwrite this method to provide an implementation, if the
@@ -0,0 +1,30 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.connector.write;

import org.apache.spark.annotation.Experimental;
import org.apache.spark.sql.types.StructType;

@Experimental
public interface WriteInfo {
String queryId();

StructType schema();

int numPartitions();
}
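This interface replaces the two default methods deleted from WriteBuilder above: state that used to be pushed into the builder through callbacks becomes plain input when the builder is created. A rough before/after for a third-party builder (MyWriteBuilder is illustrative, not part of the PR):

```scala
import org.apache.spark.sql.connector.write.{WriteBuilder, WriteInfo}
import org.apache.spark.sql.types.StructType

// Before: schema and query id arrived through mutable builder callbacks.
//
//   class MyWriteBuilder extends WriteBuilder {
//     private var schema: StructType = _
//     private var queryId: String = _
//     override def withInputDataSchema(s: StructType): WriteBuilder = { schema = s; this }
//     override def withQueryId(id: String): WriteBuilder = { queryId = id; this }
//   }
//
// After: they are immutable values read from WriteInfo at construction time.
class MyWriteBuilder(info: WriteInfo) extends WriteBuilder {
  private val schema: StructType = info.schema()
  private val queryId: String = info.queryId()
  private val numPartitions: Int = info.numPartitions()
  // buildForBatch()/buildForStreaming() would consume these fields; omitted here.
}
```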
@@ -0,0 +1,24 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.sql.connector.write

import org.apache.spark.sql.types.StructType

private[sql] case class WriteInfoImpl(queryId: String,
schema: StructType,
numPartitions: Int) extends WriteInfo
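WriteInfoImpl is the Spark-internal carrier for these values. The sketch below shows how the write path is expected to populate it, mirroring the V1FallbackWriters change further down; the buildWriter helper is illustrative, and since the case class is private[sql] it is only reachable from Spark's own sql packages.

```scala
package org.apache.spark.sql.connector.write

import java.util.UUID

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.catalog.SupportsWrite
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

object WriteInfoSketch {
  def buildWriter(
      table: SupportsWrite,
      options: CaseInsensitiveStringMap,
      schema: StructType,
      rdd: RDD[InternalRow]): WriteBuilder = {
    // Query id, input schema and the physical partition count of the RDD
    // being written are bundled once and handed to the table.
    val info: WriteInfo = WriteInfoImpl(
      queryId = UUID.randomUUID().toString,
      schema = schema,
      numPartitions = rdd.getNumPartitions)
    table.newWriteBuilder(options, info)
  }
}
```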
@@ -93,7 +93,7 @@ class InMemoryTable(
override def createReaderFactory(): PartitionReaderFactory = BufferedRowsReaderFactory
}

override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = {
override def newWriteBuilder(options: CaseInsensitiveStringMap, info: WriteInfo): WriteBuilder = {
InMemoryTable.maybeSimulateFailedTableWrite(options)

new WriteBuilder with SupportsTruncate with SupportsOverwrite with SupportsDynamicOverwrite {
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.analysis.{CannotReplaceMissingTableExceptio
import org.apache.spark.sql.connector.catalog._
import org.apache.spark.sql.connector.expressions.Transform
import org.apache.spark.sql.connector.read.ScanBuilder
import org.apache.spark.sql.connector.write.WriteBuilder
import org.apache.spark.sql.connector.write.{WriteBuilder, WriteInfo}
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.util.CaseInsensitiveStringMap

@@ -88,8 +88,9 @@ class StagingInMemoryTableCatalog extends InMemoryTableCatalog with StagingTable

override def capabilities(): util.Set[TableCapability] = delegateTable.capabilities

override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = {
delegateTable.newWriteBuilder(options)
override def newWriteBuilder(options: CaseInsensitiveStringMap,
writeInfo: WriteInfo): WriteBuilder = {
delegateTable.newWriteBuilder(options, writeInfo)
}

override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = {
@@ -23,7 +23,7 @@ import scala.collection.JavaConverters._

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.connector.catalog.{SupportsWrite, Table, TableCapability, TableProvider}
import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, SupportsTruncate, WriteBuilder, WriterCommitMessage}
import org.apache.spark.sql.connector.write.{BatchWrite, DataWriter, DataWriterFactory, SupportsTruncate, WriteBuilder, WriteInfo, WriterCommitMessage}
import org.apache.spark.sql.connector.write.streaming.{StreamingDataWriterFactory, StreamingWrite}
import org.apache.spark.sql.sources.DataSourceRegister
import org.apache.spark.sql.types.StructType
@@ -39,7 +39,8 @@ class NoopDataSource extends TableProvider with DataSourceRegister {
}

private[noop] object NoopTable extends Table with SupportsWrite {
override def newWriteBuilder(options: CaseInsensitiveStringMap): WriteBuilder = NoopWriteBuilder
override def newWriteBuilder(options: CaseInsensitiveStringMap,
writeInfo: WriteInfo): WriteBuilder = NoopWriteBuilder
override def name(): String = "noop-table"
override def schema(): StructType = new StructType()
override def capabilities(): util.Set[TableCapability] = {
@@ -229,7 +229,7 @@ object DataSourceV2Strategy extends Strategy with PredicateHelper {
case AppendData(r: DataSourceV2Relation, query, writeOptions, _) =>
r.table.asWritable match {
case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) =>
AppendDataExecV1(v1, writeOptions.asOptions, query) :: Nil
AppendDataExecV1(v1, writeOptions.asOptions, planLater(query)) :: Nil
case v2 =>
AppendDataExec(v2, writeOptions.asOptions, planLater(query)) :: Nil
}
@@ -242,7 +242,7 @@
}.toArray
r.table.asWritable match {
case v1 if v1.supports(TableCapability.V1_BATCH_WRITE) =>
OverwriteByExpressionExecV1(v1, filters, writeOptions.asOptions, query) :: Nil
OverwriteByExpressionExecV1(v1, filters, writeOptions.asOptions, planLater(query)) :: Nil
case v2 =>
OverwriteByExpressionExec(v2, filters, writeOptions.asOptions, planLater(query)) :: Nil
}
@@ -30,7 +30,7 @@ import org.apache.spark.internal.io.FileCommitProtocol
import org.apache.spark.sql.{AnalysisException, SaveMode, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, DateTimeUtils}
import org.apache.spark.sql.connector.write.{BatchWrite, WriteBuilder}
import org.apache.spark.sql.connector.write.{BatchWrite, WriteBuilder, WriteInfo}
import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, DataSource, OutputWriterFactory, WriteJobDescription}
import org.apache.spark.sql.execution.metric.SQLMetric
import org.apache.spark.sql.internal.SQLConf
@@ -43,21 +43,12 @@ abstract class FileWriteBuilder(
options: CaseInsensitiveStringMap,
paths: Seq[String],
formatName: String,
supportsDataType: DataType => Boolean) extends WriteBuilder {
private var schema: StructType = _
private var queryId: String = _
supportsDataType: DataType => Boolean,
writeInfo: WriteInfo) extends WriteBuilder {
private val schema = writeInfo.schema()
private val queryId = writeInfo.queryId()
private var mode: SaveMode = _

override def withInputDataSchema(schema: StructType): WriteBuilder = {
this.schema = schema
this
}

override def withQueryId(queryId: String): WriteBuilder = {
this.queryId = queryId
this
}

def mode(mode: SaveMode): WriteBuilder = {
this.mode = mode
this
@@ -21,12 +21,10 @@ import java.util.UUID

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Dataset
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.connector.catalog.SupportsWrite
import org.apache.spark.sql.connector.write.{SupportsOverwrite, SupportsTruncate, V1WriteBuilder, WriteBuilder}
import org.apache.spark.sql.connector.write.{SupportsOverwrite, SupportsTruncate, V1WriteBuilder, WriteBuilder, WriteInfo, WriteInfoImpl}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.sources.{AlwaysTrue, Filter, InsertableRelation}
import org.apache.spark.sql.util.CaseInsensitiveStringMap
@@ -39,7 +37,7 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap
case class AppendDataExecV1(
table: SupportsWrite,
writeOptions: CaseInsensitiveStringMap,
plan: LogicalPlan) extends V1FallbackWriters {
query: SparkPlan) extends V1FallbackWriters {

override protected def doExecute(): RDD[InternalRow] = {
writeWithV1(newWriteBuilder().buildForV1Write())
@@ -61,7 +59,7 @@ case class OverwriteByExpressionExecV1(
table: SupportsWrite,
deleteWhere: Array[Filter],
writeOptions: CaseInsensitiveStringMap,
plan: LogicalPlan) extends V1FallbackWriters {
query: SparkPlan) extends V1FallbackWriters {

private def isTruncate(filters: Array[Filter]): Boolean = {
filters.length == 1 && filters(0).isInstanceOf[AlwaysTrue]
@@ -82,7 +80,7 @@
}

/** Some helper interfaces that use V2 write semantics through the V1 writer interface. */
sealed trait V1FallbackWriters extends SupportsV1Write {
sealed trait V1FallbackWriters extends SupportsV1Write { this: SupportsV1Write =>
override def output: Seq[Attribute] = Nil
override final def children: Seq[SparkPlan] = Nil

@@ -98,22 +96,22 @@ sealed trait V1FallbackWriters extends SupportsV1Write {
}

protected def newWriteBuilder(): V1WriteBuilder = {
val writeBuilder = table.newWriteBuilder(writeOptions)
.withInputDataSchema(plan.schema)
.withQueryId(UUID.randomUUID().toString)
val writeInfo: WriteInfo = WriteInfoImpl(
queryId = UUID.randomUUID().toString,
schema = query.schema,
numPartitions = rdd.getNumPartitions)
val writeBuilder = table.newWriteBuilder(writeOptions, writeInfo)

writeBuilder.asV1Builder
}
}

/**
* A trait that allows Tables that use V1 Writer interfaces to append data.
*/
trait SupportsV1Write extends SparkPlan {
// TODO: We should be able to work on SparkPlans at this point.
def plan: LogicalPlan

trait SupportsV1Write extends SparkPlan with WriteBase {
protected def writeWithV1(relation: InsertableRelation): RDD[InternalRow] = {
relation.insert(Dataset.ofRows(sqlContext.sparkSession, plan), overwrite = false)
relation.insert(sqlContext.internalCreateDataFrame(rdd, query.schema), overwrite = false)
sparkContext.emptyRDD
}
}