[SPARK-4574][SQL] Adding support for defining schema in foreign DDL commands. #3431
Changes from 20 commits
First file in the diff (the DDL parser):

```diff
@@ -17,16 +17,15 @@
 package org.apache.spark.sql.sources

-import org.apache.spark.Logging
-import org.apache.spark.sql.SQLContext
-import org.apache.spark.sql.execution.RunnableCommand
-import org.apache.spark.util.Utils
-
 import scala.language.implicitConversions
 import scala.util.parsing.combinator.lexical.StdLexical
 import scala.util.parsing.combinator.syntactical.StandardTokenParsers
 import scala.util.parsing.combinator.PackratParsers

+import org.apache.spark.Logging
+import org.apache.spark.sql.SQLContext
+import org.apache.spark.sql.catalyst.types._
+import org.apache.spark.sql.execution.RunnableCommand
+import org.apache.spark.util.Utils
+import org.apache.spark.sql.catalyst.plans.logical._
+import org.apache.spark.sql.catalyst.SqlLexical
```
```diff
@@ -44,6 +43,14 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi
     }
   }

+  def parseType(input: String): DataType = {
+    phrase(dataType)(new lexical.Scanner(input)) match {
+      case Success(r, x) => r
+      case x =>
+        sys.error(s"Unsupported dataType: $x")
+    }
+  }
+
   protected case class Keyword(str: String)

   protected implicit def asParser(k: Keyword): Parser[String] =
```
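The new `parseType` helper turns a DDL type string into a Catalyst `DataType` via the `dataType` parser added further down. A minimal sketch of how it could be exercised (illustrative only; `DDLParser` is `private[sql]`, so assume this runs from code inside the `org.apache.spark.sql` package):

```scala
import org.apache.spark.sql.catalyst.types._
import org.apache.spark.sql.sources.DDLParser

val parser = new DDLParser

// Primitive, nested, and struct types all map onto Catalyst types:
assert(parser.parseType("INT") == IntegerType)
assert(parser.parseType("ARRAY<STRING>") == ArrayType(StringType))
assert(parser.parseType("MAP<INT, BIGINT>") == MapType(IntegerType, LongType))
assert(parser.parseType("STRUCT<a:INT, b:STRING>") ==
  StructType(Seq(StructField("a", IntegerType), StructField("b", StringType))))
```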
```diff
@@ -55,6 +62,24 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi
   protected val USING = Keyword("USING")
   protected val OPTIONS = Keyword("OPTIONS")

+  // Data types.
+  protected val STRING = Keyword("STRING")
+  protected val BINARY = Keyword("BINARY")
+  protected val BOOLEAN = Keyword("BOOLEAN")
+  protected val TINYINT = Keyword("TINYINT")
+  protected val SMALLINT = Keyword("SMALLINT")
+  protected val INT = Keyword("INT")
+  protected val BIGINT = Keyword("BIGINT")
+  protected val FLOAT = Keyword("FLOAT")
+  protected val DOUBLE = Keyword("DOUBLE")
+  protected val DECIMAL = Keyword("DECIMAL")
+  protected val DATE = Keyword("DATE")
+  protected val TIMESTAMP = Keyword("TIMESTAMP")
+  protected val VARCHAR = Keyword("VARCHAR")
+  protected val ARRAY = Keyword("ARRAY")
+  protected val MAP = Keyword("MAP")
+  protected val STRUCT = Keyword("STRUCT")
+
   // Use reflection to find the reserved words defined in this class.
   protected val reservedWords =
     this.getClass
```
```diff
@@ -67,26 +92,92 @@ private[sql] class DDLParser extends StandardTokenParsers with PackratParsers wi
   protected lazy val ddl: Parser[LogicalPlan] = createTable

   /**
-   * CREATE TEMPORARY TABLE avroTable
+   * `CREATE TEMPORARY TABLE avroTable
    * USING org.apache.spark.sql.avro
-   * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")
+   * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")`
+   * or
+   * `CREATE TEMPORARY TABLE avroTable(intField int, stringField string...)
+   * USING org.apache.spark.sql.avro
+   * OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")`
    */
   protected lazy val createTable: Parser[LogicalPlan] =
-    CREATE ~ TEMPORARY ~ TABLE ~> ident ~ (USING ~> className) ~ (OPTIONS ~> options) ^^ {
-      case tableName ~ provider ~ opts =>
-        CreateTableUsing(tableName, provider, opts)
+    (
+      CREATE ~ TEMPORARY ~ TABLE ~> ident
+        ~ (tableCols).? ~ (USING ~> className) ~ (OPTIONS ~> options) ^^ {
+        case tableName ~ columns ~ provider ~ opts =>
+          val userSpecifiedSchema = columns.flatMap(fields => Some(StructType(fields)))
+          CreateTableUsing(tableName, userSpecifiedSchema, provider, opts)
     }
+    )
+
+  protected lazy val tableCols: Parser[Seq[StructField]] = "(" ~> repsep(column, ",") <~ ")"

   protected lazy val options: Parser[Map[String, String]] =
     "(" ~> repsep(pair, ",") <~ ")" ^^ { case s: Seq[(String, String)] => s.toMap }

   protected lazy val className: Parser[String] = repsep(ident, ".") ^^ { case s => s.mkString(".")}

   protected lazy val pair: Parser[(String, String)] = ident ~ stringLit ^^ { case k ~ v => (k,v) }

+  protected lazy val column: Parser[StructField] =
+    ident ~ dataType ^^ { case columnName ~ typ =>
+      StructField(columnName, typ)
+    }
+
+  protected lazy val primitiveType: Parser[DataType] =
+    STRING ^^^ StringType |
+    BINARY ^^^ BinaryType |
+    BOOLEAN ^^^ BooleanType |
+    TINYINT ^^^ ByteType |
+    SMALLINT ^^^ ShortType |
+    INT ^^^ IntegerType |
+    BIGINT ^^^ LongType |
+    FLOAT ^^^ FloatType |
+    DOUBLE ^^^ DoubleType |
+    fixedDecimalType |                  // decimal with precision/scale
+    DECIMAL ^^^ DecimalType.Unlimited | // decimal with no precision/scale
+    DATE ^^^ DateType |
+    TIMESTAMP ^^^ TimestampType |
+    VARCHAR ~ "(" ~ numericLit ~ ")" ^^^ StringType
+
+  protected lazy val fixedDecimalType: Parser[DataType] =
+    (DECIMAL ~ "(" ~> numericLit) ~ ("," ~> numericLit <~ ")") ^^ {
+      case precision ~ scale => DecimalType(precision.toInt, scale.toInt)
+    }
+
+  protected lazy val arrayType: Parser[DataType] =
+    ARRAY ~> "<" ~> dataType <~ ">" ^^ {
+      case tpe => ArrayType(tpe)
+    }
+
+  protected lazy val mapType: Parser[DataType] =
+    MAP ~> "<" ~> dataType ~ "," ~ dataType <~ ">" ^^ {
+      case t1 ~ _ ~ t2 => MapType(t1, t2)
+    }
+
+  protected lazy val structField: Parser[StructField] =
+    ident ~ ":" ~ dataType ^^ {
+      case fieldName ~ _ ~ tpe => StructField(fieldName, tpe, nullable = true)
+    }
+
+  protected lazy val structType: Parser[DataType] =
+    (STRUCT ~> "<" ~> repsep(structField, ",") <~ ">" ^^ {
+      case fields => new StructType(fields)
+    }) |
+    (STRUCT ~> "<>" ^^ {
+      case fields => new StructType(Nil)
+    })
+
+  private[sql] lazy val dataType: Parser[DataType] =
+    arrayType |
+    mapType |
+    structType |
+    primitiveType
 }

 private[sql] case class CreateTableUsing(
```
Inline comment on this line:

> **Contributor:** It would be nice to move this

> **Author:** ok, i will move it
```diff
     tableName: String,
+    userSpecifiedSchema: Option[StructType],
     provider: String,
     options: Map[String, String]) extends RunnableCommand {
```
```diff
@@ -99,8 +190,16 @@ private[sql] case class CreateTableUsing(
         sys.error(s"Failed to load class for data source: $provider")
       }
     }
-    val dataSource = clazz.newInstance().asInstanceOf[org.apache.spark.sql.sources.RelationProvider]
-    val relation = dataSource.createRelation(sqlContext, new CaseInsensitiveMap(options))
+    val relation = clazz.newInstance match {
+      case dataSource: org.apache.spark.sql.sources.RelationProvider =>
+        dataSource
+          .asInstanceOf[org.apache.spark.sql.sources.RelationProvider]
+          .createRelation(sqlContext, new CaseInsensitiveMap(options))
+      case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider =>
+        dataSource
+          .asInstanceOf[org.apache.spark.sql.sources.SchemaRelationProvider]
+          .createRelation(sqlContext, new CaseInsensitiveMap(options), userSpecifiedSchema)
+    }

     sqlContext.baseRelationToSchemaRDD(relation).registerTempTable(tableName)
     Seq.empty
```
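Together, the new grammar and the dispatch above let a user attach an explicit schema to an external table. A hedged end-to-end sketch, reusing the Avro source and path from the doc comment earlier (both are examples, not shipped with this patch) and assuming an existing `sqlContext` whose SQL parsing falls back to this `DDLParser`:

```scala
// sqlContext: an existing org.apache.spark.sql.SQLContext
sqlContext.sql(
  """CREATE TEMPORARY TABLE avroTable(intField int, stringField string)
    |USING org.apache.spark.sql.avro
    |OPTIONS (path "../hive/src/test/resources/data/files/episodes.avro")
  """.stripMargin)

sqlContext.sql("SELECT intField, stringField FROM avroTable").collect()
```

One observation on the dispatch: the `clazz.newInstance match` has no catch-all case, so a provider class that implements neither trait would fail with a raw `scala.MatchError` rather than a descriptive error.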
Second file in the diff (the data source interfaces):
```diff
@@ -18,7 +18,7 @@ package org.apache.spark.sql.sources
 import org.apache.spark.annotation.{Experimental, DeveloperApi}
 import org.apache.spark.rdd.RDD
-import org.apache.spark.sql.{SQLConf, Row, SQLContext, StructType}
+import org.apache.spark.sql.{Row, SQLContext, StructType}
 import org.apache.spark.sql.catalyst.expressions.{Expression, Attribute}

 /**
@@ -44,6 +44,33 @@ trait RelationProvider {
   def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation
 }

+/**
+ * ::DeveloperApi::
+ * Implemented by objects that produce relations for a specific kind of data source. Used when
+ * Spark SQL is given a DDL operation with:
+ * 1. a USING clause, to specify the implementing SchemaRelationProvider
+ * 2. a user-defined schema, which users may optionally supply when creating a table
+ *
+ * Users may specify the fully qualified class name of a given data source. When that class is
+ * not found Spark SQL will append the class name `DefaultSource` to the path, allowing for
+ * less verbose invocation. For example, 'org.apache.spark.sql.json' would resolve to the
+ * data source 'org.apache.spark.sql.json.DefaultSource'.
+ *
+ * A new instance of this class will be instantiated each time a DDL call is made.
+ */
+@DeveloperApi
+trait SchemaRelationProvider {
+  /**
+   * Returns a new base relation with the given parameters and a user-defined schema.
+   * Note: the parameters' keywords are case insensitive and this insensitivity is enforced
+   * by the Map that is passed to the function.
+   */
+  def createRelation(
+      sqlContext: SQLContext,
+      parameters: Map[String, String],
+      schema: Option[StructType]): BaseRelation
```
Inline comment on the `schema` parameter:

> **Contributor:** Why is this an option? We have two traits, and `Option` is not very friendly to Java.

> **Author:** My initial idea was to stay compatible with the old trait; since we will have two traits, I will fix this.
```diff
+}

 /**
  * ::DeveloperApi::
  * Represents a collection of tuples with a known schema. Classes that extend BaseRelation must
```
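To make the new trait concrete, here is a minimal hypothetical data source written against the signature shown above (all names are invented for illustration; the test sources added by this PR may differ):

```scala
package org.example.dummy // hypothetical package

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext, StructType, StructField, StringType}
import org.apache.spark.sql.sources.{SchemaRelationProvider, TableScan}

// A toy relation: `size` rows of a single string column, unless the user
// supplied a schema in the DDL, in which case that schema is reported instead.
case class DummyRelation(size: Int, userSchema: Option[StructType])(
    @transient val sqlContext: SQLContext) extends TableScan {

  override def schema: StructType =
    userSchema.getOrElse(StructType(StructField("value", StringType, nullable = true) :: Nil))

  override def buildScan(): RDD[Row] =
    sqlContext.sparkContext.parallelize(1 to size).map(i => Row(s"row_$i"))
}

class DefaultSource extends SchemaRelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: Option[StructType]) =
    DummyRelation(parameters("size").toInt, schema)(sqlContext)
}
```

Registered this way, `CREATE TEMPORARY TABLE t(value string) USING org.example.dummy OPTIONS (size "3")` would resolve to `org.example.dummy.DefaultSource` via the class-resolution rule described in the scaladoc above.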
Review thread:

> We cannot change the function signature, otherwise we will break existing libraries. Instead I think we need to create a new interface, `SchemaRelationProvider` maybe?

> Or use a default value for schema: `schema: Option[StructType] = None`?

> Default values do not preserve binary compatibility, only source compatibility.
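To unpack that last comment (my illustration, not part of the thread): a default argument is sugar applied at the call site, so it only rescues code that is recompiled; already-compiled implementations and callers still reference the old method descriptor.

```scala
// Illustrative only: why `schema: ... = None` is merely source-compatible.
// String stand-ins replace the real Spark types to keep the sketch self-contained.
trait ProviderV2 {
  // The default compiles to this three-argument method plus a synthetic
  // helper (roughly `createRelation$default$3 = None`); no two-argument
  // createRelation exists in the bytecode anymore.
  def createRelation(
      sqlContext: String,
      parameters: Map[String, String],
      schema: Option[String] = None): String
}
// A library compiled against the old two-argument trait implements and links a
// method that no longer exists, so it fails at load time with
// AbstractMethodError/NoSuchMethodError even though its source would still
// compile -- which is why the thread settles on a separate trait instead.
```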