
Commit 8f9c5ac

rdblue authored and mccheah committed
[SPARK-26811][SQL] Add capabilities to v2.Table
This adds a new method, `capabilities`, to `v2.Table` that returns a set of `TableCapability`. Capabilities are used to fail queries during analysis checks, such as `V2WriteSupportCheck`, when a table does not support an operation like truncation.

Existing tests cover regressions; a new analysis suite, `V2WriteSupportCheckSuite`, was added for the new capability checks.

Closes apache#24012 from rdblue/SPARK-26811-add-capabilities.

Authored-by: Ryan Blue <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
1 parent 1609b3f commit 8f9c5ac

32 files changed

Lines changed: 417 additions & 114 deletions
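
Before the per-file diff, a minimal sketch of the pattern this commit introduces: analysis-time checks read `Table#capabilities()` and reject plans that need a capability the table does not declare. The helper name and the plain `UnsupportedOperationException` below are illustrative only; the real `V2WriteSupportCheck` raises an analysis error and is not reproduced here.

import org.apache.spark.sql.sources.v2.{Table, TableCapability}

// Illustrative, simplified version of a capability-based analysis check.
object CapabilityCheckSketch {
  def requireTruncate(table: Table): Unit = {
    if (!table.capabilities().contains(TableCapability.TRUNCATE)) {
      throw new UnsupportedOperationException(
        "Table does not support truncate in batch mode")
    }
  }
}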

external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala

Lines changed: 3 additions & 1 deletion
@@ -18,7 +18,7 @@
 package org.apache.spark.sql.kafka010
 
 import java.{util => ju}
-import java.util.{Locale, UUID}
+import java.util.{Collections, Locale, UUID}
 
 import scala.collection.JavaConverters._
 
@@ -358,6 +358,8 @@ private[kafka010] class KafkaSourceProvider extends DataSourceRegister
 
   override def schema(): StructType = KafkaOffsetReader.kafkaSchema
 
+  override def capabilities(): ju.Set[TableCapability] = Collections.emptySet()
+
   override def newScanBuilder(options: CaseInsensitiveStringMap): ScanBuilder = new ScanBuilder {
     override def build(): Scan = new KafkaScan(options)
   }
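
The Kafka table declares no capabilities here. For a table that does advertise support, the set can be built with `java.util.EnumSet`; the snippet below is illustrative only and not part of this diff:

import java.{util => ju}
import org.apache.spark.sql.sources.v2.TableCapability
import org.apache.spark.sql.sources.v2.TableCapability._

// A table advertising batch reads and append writes could return something like:
val caps: ju.Set[TableCapability] = ju.EnumSet.of(BATCH_READ, BATCH_WRITE)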

sql/core/src/main/java/org/apache/spark/sql/sources/v2/SupportsBatchRead.java

Lines changed: 0 additions & 34 deletions
This file was deleted.

sql/core/src/main/java/org/apache/spark/sql/sources/v2/SupportsBatchWrite.java

Lines changed: 0 additions & 33 deletions
This file was deleted.

sql/core/src/main/java/org/apache/spark/sql/sources/v2/SupportsRead.java

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
  * {@link #newScanBuilder(CaseInsensitiveStringMap)} that is used to create a scan for batch,
  * micro-batch, or continuous processing.
  */
-interface SupportsRead extends Table {
+public interface SupportsRead extends Table {
 
   /**
    * Returns a {@link ScanBuilder} which can be used to build a {@link Scan}. Spark will call this

sql/core/src/main/java/org/apache/spark/sql/sources/v2/SupportsWrite.java

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@
  * {@link #newWriteBuilder(CaseInsensitiveStringMap)} that is used to create a write
  * for batch or streaming.
  */
-interface SupportsWrite extends Table {
+public interface SupportsWrite extends Table {
 
   /**
    * Returns a {@link WriteBuilder} which can be used to create {@link BatchWrite}. Spark will call

sql/core/src/main/java/org/apache/spark/sql/sources/v2/Table.java

Lines changed: 9 additions & 5 deletions
@@ -20,16 +20,15 @@
 import org.apache.spark.annotation.Evolving;
 import org.apache.spark.sql.types.StructType;
 
+import java.util.Set;
+
 /**
  * An interface representing a logical structured data set of a data source. For example, the
  * implementation can be a directory on the file system, a topic of Kafka, or a table in the
  * catalog, etc.
  * <p>
- * This interface can mixin the following interfaces to support different operations:
- * </p>
- * <ul>
- *   <li>{@link SupportsBatchRead}: this table can be read in batch queries.</li>
- * </ul>
+ * This interface can mixin the following interfaces to support different operations, like
+ * {@code SupportsRead}.
  */
 @Evolving
 public interface Table {
@@ -45,4 +44,9 @@ public interface Table {
    * empty schema can be returned here.
    */
   StructType schema();
+
+  /**
+   * Returns the set of capabilities for this table.
+   */
+  Set<TableCapability> capabilities();
 }
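
Every existing `Table` implementation now has one more method to provide. One way to satisfy the new contract in Scala is a small mixin; the trait below is a hypothetical helper for illustration, not something this commit adds:

import java.{util => ju}
import org.apache.spark.sql.sources.v2.{Table, TableCapability}

// Hypothetical mixin: gives any Table implementation a fixed capability set.
trait FixedCapabilities extends Table {
  protected def fixedCapabilities: ju.Set[TableCapability]
  override def capabilities(): ju.Set[TableCapability] = fixedCapabilities
}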
sql/core/src/main/java/org/apache/spark/sql/sources/v2/TableCapability.java

Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.sources.v2;
+
+import org.apache.spark.annotation.Experimental;
+
+/**
+ * Capabilities that can be provided by a {@link Table} implementation.
+ * <p>
+ * Tables use {@link Table#capabilities()} to return a set of capabilities. Each capability signals
+ * to Spark that the table supports a feature identified by the capability. For example, returning
+ * {@code BATCH_READ} allows Spark to read from the table using a batch scan.
+ */
+@Experimental
+public enum TableCapability {
+  /**
+   * Signals that the table supports reads in batch execution mode.
+   */
+  BATCH_READ,
+
+  /**
+   * Signals that the table supports append writes in batch execution mode.
+   * <p>
+   * Tables that return this capability must support appending data and may also support additional
+   * write modes, like {@link #TRUNCATE}, {@link #OVERWRITE_BY_FILTER}, and
+   * {@link #OVERWRITE_DYNAMIC}.
+   */
+  BATCH_WRITE,
+
+  /**
+   * Signals that the table can be truncated in a write operation.
+   * <p>
+   * Truncating a table removes all existing rows.
+   * <p>
+   * See {@link org.apache.spark.sql.sources.v2.writer.SupportsTruncate}.
+   */
+  TRUNCATE,
+
+  /**
+   * Signals that the table can replace existing data that matches a filter with appended data in
+   * a write operation.
+   * <p>
+   * See {@link org.apache.spark.sql.sources.v2.writer.SupportsOverwrite}.
+   */
+  OVERWRITE_BY_FILTER,
+
+  /**
+   * Signals that the table can dynamically replace existing data partitions with appended data in
+   * a write operation.
+   * <p>
+   * See {@link org.apache.spark.sql.sources.v2.writer.SupportsDynamicOverwrite}.
+   */
+  OVERWRITE_DYNAMIC
+}
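
The write-related constants compose as the Javadoc above describes: `BATCH_WRITE` is the baseline append capability, and `TRUNCATE`, `OVERWRITE_BY_FILTER`, and `OVERWRITE_DYNAMIC` are optional extras on top of it. A short sketch of reading those flags off a table (illustrative code, not part of the commit):

import org.apache.spark.sql.sources.v2.{Table, TableCapability}
import org.apache.spark.sql.sources.v2.TableCapability._

// Lists the batch write modes a table advertises, following the Javadoc above.
object WriteModeSketch {
  def supportedWriteModes(table: Table): Seq[String] = {
    val caps = table.capabilities()
    Seq(
      BATCH_WRITE -> "append",
      TRUNCATE -> "truncate",
      OVERWRITE_BY_FILTER -> "overwrite by filter",
      OVERWRITE_DYNAMIC -> "dynamic partition overwrite"
    ).collect { case (cap, mode) if caps.contains(cap) => mode }
  }
}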

sql/core/src/main/java/org/apache/spark/sql/sources/v2/reader/Scan.java

Lines changed: 3 additions & 4 deletions
@@ -21,7 +21,6 @@
 import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousStream;
 import org.apache.spark.sql.sources.v2.reader.streaming.MicroBatchStream;
 import org.apache.spark.sql.types.StructType;
-import org.apache.spark.sql.sources.v2.SupportsBatchRead;
 import org.apache.spark.sql.sources.v2.SupportsContinuousRead;
 import org.apache.spark.sql.sources.v2.SupportsMicroBatchRead;
 import org.apache.spark.sql.sources.v2.Table;
@@ -33,8 +32,8 @@
  * This logical representation is shared between batch scan, micro-batch streaming scan and
  * continuous streaming scan. Data sources must implement the corresponding methods in this
  * interface, to match what the table promises to support. For example, {@link #toBatch()} must be
- * implemented, if the {@link Table} that creates this {@link Scan} implements
- * {@link SupportsBatchRead}.
+ * implemented, if the {@link Table} that creates this {@link Scan} returns BATCH_READ support in
+ * its {@link Table#capabilities()}.
  * </p>
  */
 @Evolving
@@ -62,7 +61,7 @@ default String description() {
   /**
    * Returns the physical representation of this scan for batch query. By default this method throws
    * exception, data sources must overwrite this method to provide an implementation, if the
-   * {@link Table} that creates this scan implements {@link SupportsBatchRead}.
+   * {@link Table} that creates this returns batch read support in its {@link Table#capabilities()}.
    *
    * @throws UnsupportedOperationException
    */
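
The Scan Javadoc now ties `toBatch()` to the BATCH_READ capability instead of the deleted `SupportsBatchRead` marker. As an illustrative sketch, assuming `readSchema()` is the only abstract method on `Scan` at this commit, a scan for a table that does not declare BATCH_READ can simply leave the default `toBatch()` in place:

import org.apache.spark.sql.sources.v2.reader.Scan
import org.apache.spark.sql.types.StructType

// Illustrative scan that never needs toBatch(): valid as long as the owning
// table does not report BATCH_READ in its capabilities().
class MinimalScan(schema: StructType) extends Scan {
  override def readSchema(): StructType = schema
  // toBatch() keeps its default implementation, which throws
  // UnsupportedOperationException if Spark ever calls it.
}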

sql/core/src/main/java/org/apache/spark/sql/sources/v2/writer/WriteBuilder.java

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,6 @@
 package org.apache.spark.sql.sources.v2.writer;
 
 import org.apache.spark.annotation.Evolving;
-import org.apache.spark.sql.sources.v2.SupportsBatchWrite;
 import org.apache.spark.sql.sources.v2.Table;
 import org.apache.spark.sql.sources.v2.writer.streaming.StreamingWrite;
 import org.apache.spark.sql.types.StructType;
@@ -58,7 +57,8 @@ default WriteBuilder withInputDataSchema(StructType schema) {
   /**
    * Returns a {@link BatchWrite} to write data to batch source. By default this method throws
    * exception, data sources must overwrite this method to provide an implementation, if the
-   * {@link Table} that creates this scan implements {@link SupportsBatchWrite}.
+   * {@link Table} that creates this write returns BATCH_WRITE support in its
+   * {@link Table#capabilities()}.
    *
    * Note that, the returned {@link BatchWrite} can be null if the implementation supports SaveMode,
    * to indicate that no writing is needed. We can clean it up after removing

sql/core/src/main/scala/org/apache/spark/sql/DataFrameReader.scala

Lines changed: 4 additions & 2 deletions
@@ -37,8 +37,9 @@ import org.apache.spark.sql.execution.datasources.DataSource
 import org.apache.spark.sql.execution.datasources.csv._
 import org.apache.spark.sql.execution.datasources.jdbc._
 import org.apache.spark.sql.execution.datasources.json.TextInputJsonDataSource
-import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2Utils, FileDataSourceV2, FileTable}
+import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2Relation, DataSourceV2Utils, FileDataSourceV2}
 import org.apache.spark.sql.sources.v2._
+import org.apache.spark.sql.sources.v2.TableCapability._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.unsafe.types.UTF8String
@@ -221,8 +222,9 @@ class DataFrameReader private[sql](sparkSession: SparkSession) extends Logging {
        case Some(schema) => provider.getTable(dsOptions, schema)
        case _ => provider.getTable(dsOptions)
      }
+     import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Implicits._
      table match {
-       case _: SupportsBatchRead =>
+       case _: SupportsRead if table.supports(BATCH_READ) =>
          Dataset.ofRows(sparkSession, DataSourceV2Relation.create(table, dsOptions))
 
        case _ => loadV1Source(paths: _*)
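
The `table.supports(BATCH_READ)` guard comes from the `DataSourceV2Implicits` import added above. It is an internal helper, but its effect is assumed here to be a plain membership test on the capability set:

import org.apache.spark.sql.sources.v2.{Table, TableCapability}

// Roughly what the supports(...) check above boils down to.
def supports(table: Table, capability: TableCapability): Boolean =
  table.capabilities().contains(capability)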
