[SPARK-27946][SQL] Hive DDL to Spark DDL conversion USING "show create table" #24938
@@ -43,7 +43,7 @@ import org.apache.spark.sql.execution.datasources.v2.csv.CSVDataSourceV2
 import org.apache.spark.sql.execution.datasources.v2.json.JsonDataSourceV2
 import org.apache.spark.sql.execution.datasources.v2.orc.OrcDataSourceV2
 import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetDataSourceV2
-import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.{HiveSerDe, SQLConf}
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util.SchemaUtils
@@ -940,7 +940,95 @@ case class ShowPartitionsCommand(
   }
 }

-case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableCommand {
+/**
+ * Provides common utilities between `ShowCreateTableCommand` and `ShowCreateTableAsSparkCommand`.
+ */
+trait ShowCreateTableCommandBase {
+
+  protected val table: TableIdentifier
+
+  protected def showTableLocation(metadata: CatalogTable, builder: StringBuilder): Unit = {
+    if (metadata.tableType == EXTERNAL) {
+      metadata.storage.locationUri.foreach { location =>
+        builder ++= s"LOCATION '${escapeSingleQuotedString(CatalogUtils.URIToString(location))}'\n"
+      }
+    }
+  }
+
+  protected def showTableComment(metadata: CatalogTable, builder: StringBuilder): Unit = {
+    metadata
+      .comment
+      .map("COMMENT '" + escapeSingleQuotedString(_) + "'\n")
+      .foreach(builder.append)
+  }
+
+  protected def showTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = {
+    if (metadata.properties.nonEmpty) {
+      val props = metadata.properties.map { case (key, value) =>
+        s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'"
+      }
+
+      builder ++= props.mkString("TBLPROPERTIES (\n  ", ",\n  ", "\n)\n")
+    }
+  }
+
+  protected def showDataSourceTableDataColumns(
+      metadata: CatalogTable, builder: StringBuilder): Unit = {
+    val columns = metadata.schema.fields.map(_.toDDL)
+    builder ++= columns.mkString("(", ", ", ")\n")
+  }
+
+  protected def showDataSourceTableOptions(metadata: CatalogTable, builder: StringBuilder): Unit = {
+    builder ++= s"USING ${metadata.provider.get}\n"
+
+    val dataSourceOptions = SQLConf.get.redactOptions(metadata.storage.properties).map {
+      case (key, value) => s"${quoteIdentifier(key)} '${escapeSingleQuotedString(value)}'"
+    }
+
+    if (dataSourceOptions.nonEmpty) {
+      builder ++= "OPTIONS (\n"
+      builder ++= dataSourceOptions.mkString("  ", ",\n  ", "\n")
+      builder ++= ")\n"
+    }
+  }
+
+  protected def showDataSourceTableNonDataColumns(
+      metadata: CatalogTable, builder: StringBuilder): Unit = {
+    val partCols = metadata.partitionColumnNames
+    if (partCols.nonEmpty) {
+      builder ++= s"PARTITIONED BY ${partCols.mkString("(", ", ", ")")}\n"
+    }
+
+    metadata.bucketSpec.foreach { spec =>
+      if (spec.bucketColumnNames.nonEmpty) {
+        builder ++= s"CLUSTERED BY ${spec.bucketColumnNames.mkString("(", ", ", ")")}\n"
+
+        if (spec.sortColumnNames.nonEmpty) {
+          builder ++= s"SORTED BY ${spec.sortColumnNames.mkString("(", ", ", ")")}\n"
+        }
+
+        builder ++= s"INTO ${spec.numBuckets} BUCKETS\n"
+      }
+    }
+  }
+
+  protected def showCreateDataSourceTable(metadata: CatalogTable): String = {
+    val builder = StringBuilder.newBuilder
+
+    builder ++= s"CREATE TABLE ${table.quotedString} "
+    showDataSourceTableDataColumns(metadata, builder)
+    showDataSourceTableOptions(metadata, builder)
+    showDataSourceTableNonDataColumns(metadata, builder)
+    showTableComment(metadata, builder)
+    showTableLocation(metadata, builder)
+    showTableProperties(metadata, builder)
+
+    builder.toString()
+  }
+}
+
+case class ShowCreateTableCommand(table: TableIdentifier)
+  extends RunnableCommand with ShowCreateTableCommandBase {
   override val output: Seq[Attribute] = Seq(
     AttributeReference("createtab_stmt", StringType, nullable = false)()
   )
@@ -1041,7 +1129,7 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableCommand {
     storage.serde.foreach { serde =>
       builder ++= s"ROW FORMAT SERDE '$serde'\n"

-      val serdeProps = metadata.storage.properties.map {
+      val serdeProps = SQLConf.get.redactOptions(metadata.storage.properties).map {
         case (key, value) =>
           s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'"
       }
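A note on the `redactOptions` change above: serde and storage properties can carry credentials, so they are now passed through Spark's redaction machinery before being echoed into the generated DDL. A minimal sketch of the effect (key names hypothetical; which keys match is governed by the redaction regex confs):

import org.apache.spark.sql.internal.SQLConf

// Values whose keys match the configured redaction patterns (e.g. "password"
// under the defaults) are replaced instead of being echoed back in the DDL.
val props = Map("path" -> "/warehouse/t", "password" -> "hunter2")
SQLConf.get.redactOptions(props).foreach(println)
// (path,/warehouse/t)
// (password,*********(redacted))   <- illustrative redacted form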
@@ -1061,83 +1149,86 @@ case class ShowCreateTableCommand(table: TableIdentifier) extends RunnableCommand {
       }
     }
   }
+}

-  private def showTableLocation(metadata: CatalogTable, builder: StringBuilder): Unit = {
-    if (metadata.tableType == EXTERNAL) {
-      metadata.storage.locationUri.foreach { location =>
-        builder ++= s"LOCATION '${escapeSingleQuotedString(CatalogUtils.URIToString(location))}'\n"
-      }
-    }
-  }
-
-  private def showTableComment(metadata: CatalogTable, builder: StringBuilder): Unit = {
-    metadata
-      .comment
-      .map("COMMENT '" + escapeSingleQuotedString(_) + "'\n")
-      .foreach(builder.append)
-  }
-
-  private def showTableProperties(metadata: CatalogTable, builder: StringBuilder): Unit = {
-    if (metadata.properties.nonEmpty) {
-      val props = metadata.properties.map { case (key, value) =>
-        s"'${escapeSingleQuotedString(key)}' = '${escapeSingleQuotedString(value)}'"
-      }
-
-      builder ++= props.mkString("TBLPROPERTIES (\n  ", ",\n  ", "\n)\n")
-    }
-  }
-
-  private def showCreateDataSourceTable(metadata: CatalogTable): String = {
-    val builder = StringBuilder.newBuilder
-
-    builder ++= s"CREATE TABLE ${table.quotedString} "
-    showDataSourceTableDataColumns(metadata, builder)
-    showDataSourceTableOptions(metadata, builder)
-    showDataSourceTableNonDataColumns(metadata, builder)
-    showTableComment(metadata, builder)
-    showTableLocation(metadata, builder)
-    showTableProperties(metadata, builder)
-
-    builder.toString()
-  }
-
-  private def showDataSourceTableDataColumns(
-      metadata: CatalogTable, builder: StringBuilder): Unit = {
-    val columns = metadata.schema.fields.map(_.toDDL)
-    builder ++= columns.mkString("(", ", ", ")\n")
-  }
-
-  private def showDataSourceTableOptions(metadata: CatalogTable, builder: StringBuilder): Unit = {
-    builder ++= s"USING ${metadata.provider.get}\n"
-
-    val dataSourceOptions = SQLConf.get.redactOptions(metadata.storage.properties).map {
-      case (key, value) => s"${quoteIdentifier(key)} '${escapeSingleQuotedString(value)}'"
-    }
-
-    if (dataSourceOptions.nonEmpty) {
-      builder ++= "OPTIONS (\n"
-      builder ++= dataSourceOptions.mkString("  ", ",\n  ", "\n")
-      builder ++= ")\n"
-    }
-  }
-
-  private def showDataSourceTableNonDataColumns(
-      metadata: CatalogTable, builder: StringBuilder): Unit = {
-    val partCols = metadata.partitionColumnNames
-    if (partCols.nonEmpty) {
-      builder ++= s"PARTITIONED BY ${partCols.mkString("(", ", ", ")")}\n"
-    }
-
-    metadata.bucketSpec.foreach { spec =>
-      if (spec.bucketColumnNames.nonEmpty) {
-        builder ++= s"CLUSTERED BY ${spec.bucketColumnNames.mkString("(", ", ", ")")}\n"
-
-        if (spec.sortColumnNames.nonEmpty) {
-          builder ++= s"SORTED BY ${spec.sortColumnNames.mkString("(", ", ", ")")}\n"
-        }
-
-        builder ++= s"INTO ${spec.numBuckets} BUCKETS\n"
-      }
-    }
-  }
-}
+/**
+ * This command generates Spark DDL for a Hive table.
+ *
+ * The syntax of using this command in SQL is:
+ * {{{
+ *   SHOW CREATE TABLE table_identifier AS SPARK;
+ * }}}
+ */
+case class ShowCreateTableAsSparkCommand(table: TableIdentifier)
+  extends RunnableCommand with ShowCreateTableCommandBase {
+  override val output: Seq[Attribute] = Seq(
+    AttributeReference("sparktab_stmt", StringType, nullable = false)()
+  )
+
+  override def run(sparkSession: SparkSession): Seq[Row] = {
+    val catalog = sparkSession.sessionState.catalog
+    val tableMetadata = catalog.getTableMetadata(table)
+
+    val stmt = if (DDLUtils.isDatasourceTable(tableMetadata)) {
+      throw new AnalysisException(
+        s"$table is already a Spark data source table. Use `SHOW CREATE TABLE` instead.")
+    } else {
+      if (tableMetadata.unsupportedFeatures.nonEmpty) {
+        throw new AnalysisException(
+          "Failed to execute SHOW CREATE TABLE AS SPARK against table " +
+            s"${tableMetadata.identifier}, which is created by Hive and uses the " +
+            "following unsupported feature(s)\n" +
+            tableMetadata.unsupportedFeatures.map(" - " + _).mkString("\n")
+        )
+      }
+
+      if (tableMetadata.tableType == VIEW) {
+        throw new AnalysisException("Hive view isn't supported by SHOW CREATE TABLE AS SPARK")
+      }
+
+      // scalastyle:off caselocale
+      if (tableMetadata.properties.getOrElse("transactional", "false").toLowerCase.equals("true")) {
+        throw new AnalysisException(
+          "SHOW CREATE TABLE AS SPARK doesn't support transactional Hive table")
+      }
+      // scalastyle:on caselocale
+
+      showCreateDataSourceTable(convertTableMetadata(tableMetadata))
+    }
+
+    Seq(Row(stmt))
+  }
+
+  private def convertTableMetadata(tableMetadata: CatalogTable): CatalogTable = {
+    val hiveSerde = HiveSerDe(
+      serde = tableMetadata.storage.serde,
+      inputFormat = tableMetadata.storage.inputFormat,
+      outputFormat = tableMetadata.storage.outputFormat)
+
+    // Looking for a Spark data source that maps to the Hive serde.
+    // TODO: some Hive fileformat + row serde might be mapped to Spark data source, e.g. CSV.
+    val source = HiveSerDe.serdeToSource(hiveSerde)
+    if (source.isEmpty) {
+      val builder = StringBuilder.newBuilder
+      hiveSerde.serde.foreach { serde =>
+        builder ++= s" SERDE: $serde"
+      }
+      hiveSerde.inputFormat.foreach { format =>
+        builder ++= s" INPUTFORMAT: $format"
+      }
+      hiveSerde.outputFormat.foreach { format =>
+        builder ++= s" OUTPUTFORMAT: $format"
+      }
+      throw new AnalysisException(
+        "Failed to execute SHOW CREATE TABLE AS SPARK against table " +
+          s"${tableMetadata.identifier}, which is created by Hive and uses the " +
+          "following unsupported serde configuration\n" +
+          builder.toString()
+      )
+    } else {
+      // TODO: should we keep Hive serde properties?
+      val newStorage = tableMetadata.storage.copy(properties = Map.empty)
+      tableMetadata.copy(provider = source, storage = newStorage)
+    }
+  }
+}
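To make the new command concrete, a minimal usage sketch (assuming a Hive-enabled session; the table name, schema, and exact output are illustrative, not taken from this PR's tests):

// A Hive ORC serde table...
spark.sql("CREATE TABLE hive_orc (a INT, b STRING) STORED AS ORC")

// ...is mapped back to the `orc` data source and printed as Spark DDL.
// The result is a single row with one `sparktab_stmt` column.
spark.sql("SHOW CREATE TABLE hive_orc AS SPARK").show(truncate = false)
// CREATE TABLE `hive_orc` (`a` INT, `b` STRING)
// USING orc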
After rethinking it, let us make it more aggressive here. Instead of creating Spark native tables for the existing Hive serde tables, we can try to always show how to create Spark native tables if possible. This will further simplify the migration from Hive to Spark.
To the existing Spark users who prefer keeping Hive serde formats, we can introduce a new option `AS SERDE`, which will keep the behaviors of Spark 2.4 or prior.
+1. The new proposal makes more sense!
A bit confusing, so let me confirm: you mean let `SHOW CREATE TABLE` work as `AS SPARK` by default (so there is no need to add a new `AS SPARK` option), and only fall back to the current behavior (showing how to create a Hive serde table) when given `AS SERDE`?
Yes!
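Concretely, the semantics agreed on above would look roughly like this (a sketch of the proposal, not of what this revision implements; table name and output are illustrative):

// Default: show how to recreate the Hive-created table as a Spark native table.
spark.sql("SHOW CREATE TABLE hive_orc").show(truncate = false)
// CREATE TABLE `hive_orc` (`a` INT, `b` STRING)
// USING orc

// Opt-in: `AS SERDE` keeps the Spark 2.4-and-prior behavior of emitting Hive DDL.
spark.sql("SHOW CREATE TABLE hive_orc AS SERDE").show(truncate = false)
// CREATE TABLE `hive_orc` (`a` INT, `b` STRING)
// ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
// STORED AS
//   INPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
//   OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'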