[SPARK-25271][SQL] Hive ctas commands should use data source if it is convertible #22514
Changes from 5 commits
DataFrameFunctionsSuite.scala:

@@ -2648,7 +2648,7 @@ class DataFrameFunctionsSuite extends QueryTest with SharedSQLContext
         "transform_values(" +
           "z,(k, v) -> map_from_arrays(ARRAY(1, 2, 3), " +
           "ARRAY('one', 'two', 'three'))[k] || '_' || CAST(v AS String))"),
-      Seq(Row(Map(1 -> "one_1.0", 2 -> "two_1.4", 3 ->"three_1.7"))))
+      Seq(Row(Map(1 -> "one_1.0", 2 -> "two_1.4", 3 -> "three_1.7"))))

     checkAnswer(
       dfExample4.selectExpr("transform_values(z, (k, v) -> k-v)"),
CreateHiveTableAsSelectCommand.scala:

@@ -21,10 +21,11 @@ import scala.util.control.NonFatal

 import org.apache.spark.sql.{AnalysisException, Row, SaveMode, SparkSession}
 import org.apache.spark.sql.catalyst.catalog.CatalogTable
 import org.apache.spark.sql.catalyst.expressions.Attribute
 import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
 import org.apache.spark.sql.execution.SparkPlan
-import org.apache.spark.sql.execution.command.DataWritingCommand
+import org.apache.spark.sql.execution.command.{DataWritingCommand, DDLUtils}
 import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, InsertIntoHadoopFsRelationCommand, LogicalRelation}
 import org.apache.spark.sql.hive.{HiveMetastoreCatalog, HiveSessionCatalog}

 /**
@@ -45,6 +46,11 @@ case class CreateHiveTableAsSelectCommand(

   override def run(sparkSession: SparkSession, child: SparkPlan): Seq[Row] = {
Contributor:
Some more thoughts:

Member (Author):
I think the table metadata created by data source CTAS and Hive CTAS are different?

Contributor:
Then how about we create a special Hive CTAS command that follows the data source CTAS command but creates a Hive table?

Member (Author):
I also thought about that, but then we would have two Hive CTAS commands. Is that acceptable to you?

Contributor:
I'm OK with that, since we do have two different ways to do Hive CTAS.

Member (Author):
I created a Hive CTAS command that writes through the data source path.
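For readers following the thread, here is a rough sketch of the two-command split being discussed, as it might sit inside Spark's sql/hive module: a shared base drives the create-table-then-write flow, and each concrete CTAS command only decides how the query result is written. The trait and method names are illustrative, not the exact code this PR ends up with.

```scala
import org.apache.spark.sql.catalyst.catalog.{CatalogTable, SessionCatalog}
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.command.DataWritingCommand

// Illustrative sketch only: a shared base for both Hive CTAS flavors.
trait HiveCtasBaseSketch {
  def tableDesc: CatalogTable
  def query: LogicalPlan

  // The classic command would return an InsertIntoHiveTable here, while the converted
  // ("data source") variant would return an InsertIntoHadoopFsRelationCommand instead.
  def writingCommand(
      catalog: SessionCatalog,
      tableDesc: CatalogTable,
      tableExists: Boolean): DataWritingCommand
}
```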
     val catalog = sparkSession.sessionState.catalog
+    val metastoreCatalog = catalog.asInstanceOf[HiveSessionCatalog].metastoreCatalog
+
+    // Whether this table is convertible to data source relation.
+    val isConvertible = metastoreCatalog.isConvertible(tableDesc)
+
     if (catalog.tableExists(tableIdentifier)) {
       assert(mode != SaveMode.Overwrite,
         s"Expect the table $tableIdentifier has been dropped when the save mode is Overwrite")
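For context on `isConvertible`: `HiveMetastoreCatalog` decides convertibility from the table's serde and the session's conversion flags (`spark.sql.hive.convertMetastoreParquet`, `spark.sql.hive.convertMetastoreOrc`). The following is a standalone sketch of that kind of check; the object, method, and parameter names are illustrative, not the actual Spark implementation.

```scala
import java.util.Locale

// Rough sketch: a Hive table is treated as convertible when its serde is Parquet or ORC
// and the matching spark.sql.hive.convertMetastore* flag is enabled.
object ConvertibleSketch {
  def looksConvertible(
      serde: Option[String],
      convertParquet: Boolean,
      convertOrc: Boolean): Boolean = {
    val s = serde.getOrElse("").toLowerCase(Locale.ROOT)
    (s.contains("parquet") && convertParquet) || (s.contains("orc") && convertOrc)
  }
}
```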
@@ -57,13 +63,18 @@
         return Seq.empty
       }

-      InsertIntoHiveTable(
-        tableDesc,
-        Map.empty,
-        query,
-        overwrite = false,
-        ifPartitionNotExists = false,
-        outputColumnNames = outputColumnNames).run(sparkSession, child)
+      if (!isConvertible) {
+        InsertIntoHiveTable(
+          tableDesc,
+          Map.empty,
+          query,
+          overwrite = false,
+          ifPartitionNotExists = false,
+          outputColumnNames = outputColumnNames).run(sparkSession, child)
+      } else {
+        getHadoopFsRelationCommand(sparkSession, metastoreCatalog, tableDesc, mode)
+          .run(sparkSession, child)
+      }
     } else {
       // TODO ideally, we should get the output data ready first and then
       // add the relation into catalog, just in case of failure occurs while data
@@ -75,15 +86,20 @@
       try {
         // Read back the metadata of the table which was created just now.
         val createdTableMeta = catalog.getTableMetadata(tableDesc.identifier)
-        // For CTAS, there is no static partition values to insert.
-        val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
-        InsertIntoHiveTable(
-          createdTableMeta,
-          partition,
-          query,
-          overwrite = true,
-          ifPartitionNotExists = false,
-          outputColumnNames = outputColumnNames).run(sparkSession, child)
+        if (!isConvertible) {
+          // For CTAS, there is no static partition values to insert.
+          val partition = createdTableMeta.partitionColumnNames.map(_ -> None).toMap
+          InsertIntoHiveTable(
+            createdTableMeta,
+            partition,
+            query,
+            overwrite = true,
+            ifPartitionNotExists = false,
+            outputColumnNames = outputColumnNames).run(sparkSession, child)
+        } else {
+          getHadoopFsRelationCommand(sparkSession, metastoreCatalog, createdTableMeta,
+            SaveMode.Overwrite).run(sparkSession, child)
+        }
       } catch {
         case NonFatal(e) =>
           // drop the created table.
@@ -95,6 +111,34 @@
     Seq.empty[Row]
   }

+  // Converts Hive table to data source one and returns an `InsertIntoHadoopFsRelationCommand`
+  // used to write data into it.
+  private def getHadoopFsRelationCommand(
+      sparkSession: SparkSession,
+      metastoreCatalog: HiveMetastoreCatalog,
+      tableDesc: CatalogTable,
+      mode: SaveMode): InsertIntoHadoopFsRelationCommand = {
+    val hiveTable = DDLUtils.readHiveTable(tableDesc)
+    val hadoopRelation = metastoreCatalog.convert(hiveTable) match {
+      case LogicalRelation(t: HadoopFsRelation, _, _, _) => t
+      case _ => throw new AnalysisException(s"$tableIdentifier should be converted to " +
+        "HadoopFsRelation.")
+    }
+    InsertIntoHadoopFsRelationCommand(
+      hadoopRelation.location.rootPaths.head,
+      Map.empty, // We don't support to convert partitioned table.
+      false,
+      Seq.empty, // We don't support to convert partitioned table.
+      hadoopRelation.bucketSpec,
+      hadoopRelation.fileFormat,
+      hadoopRelation.options,
+      query,
+      mode,
+      Some(tableDesc),
+      Some(hadoopRelation.location),
+      query.output.map(_.name))
+  }
+
   override def argString: String = {
     s"[Database:${tableDesc.database}}, " +
     s"TableName: ${tableDesc.identifier.table}, " +
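To see the intended effect end to end, one can run a Hive-format Parquet CTAS with conversion enabled; after this change it should be written through `InsertIntoHadoopFsRelationCommand` rather than `InsertIntoHiveTable`. A minimal sketch, assuming a Hive-enabled SparkSession named `spark`; the table name `ctas_target` is made up.

```scala
// spark.sql.hive.convertMetastoreParquet defaults to true; set it explicitly for clarity.
spark.sql("SET spark.sql.hive.convertMetastoreParquet=true")

// A Hive-serde Parquet CTAS; with conversion enabled this should take the
// data source write path introduced by this change.
spark.sql("CREATE TABLE ctas_target STORED AS PARQUET AS SELECT 1 AS id, 'a' AS name")
spark.sql("SELECT * FROM ctas_target").show()
```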
empty_map.dat (new test data file, loaded as data/files/empty_map.dat by the HiveParquetSuite test below):

@@ -0,0 +1,2 @@
+0,1$abc2$pqr:3$xyz
+1,
HiveParquetSuite.scala:

@@ -92,4 +92,18 @@ class HiveParquetSuite extends QueryTest with ParquetTest with TestHiveSingleton
       }
     }
   }
+
+  test("SPARK-25271: write empty map into hive parquet table") {
Member:
Although this is the original reported test case in SPARK-25271

Contributor:
+1

Member (Author):
I agree. Now that we have two Hive CTAS commands, it is easier to test. Will add tests later.

Member (Author):
Added a new test for that.
+    val testData = hiveContext.getHiveFile("data/files/empty_map.dat").getCanonicalFile()
+    val sourceTable = "sourceTable"
+    val targetTable = "targetTable"
+    withTable(sourceTable, targetTable) {
+      sql(s"CREATE TABLE $sourceTable (i int,m map<int, string>) ROW FORMAT DELIMITED FIELDS " +
+        "TERMINATED BY ',' COLLECTION ITEMS TERMINATED BY ':' MAP KEYS TERMINATED BY '$'")
+      sql(s"LOAD DATA LOCAL INPATH '${testData.toURI}' INTO TABLE $sourceTable")
+
+      sql(s"CREATE TABLE $targetTable STORED AS PARQUET AS SELECT m FROM $sourceTable")
+      checkAnswer(sql(s"SELECT m FROM $targetTable"),
+        Row(Map(1 -> "abc2$pqr", 3 -> "xyz")) :: Row(Map.empty[Int, String]) :: Nil)
+    }
+  }
 }
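For reference, a small standalone sketch (plain Scala, not Spark) of how the delimiters declared in the test split a line of empty_map.dat: fields by `,`, map entries by `:`, and each key from its value at the first `$`. The second row, `1,`, has an empty map field, which is the empty-map case SPARK-25271 is about. The object name is made up.

```scala
object EmptyMapParseSketch {
  // Splits one map field the way the test's ROW FORMAT delimiters would:
  // entries are separated by ':', keys are separated from values by the first '$'.
  def parseMapField(field: String): Map[Int, String] =
    if (field.isEmpty) Map.empty
    else field.split(':').map { entry =>
      val Array(k, v) = entry.split("\\$", 2)
      k.toInt -> v
    }.toMap

  def main(args: Array[String]): Unit = {
    println(parseMapField("1$abc2$pqr:3$xyz")) // Map(1 -> abc2$pqr, 3 -> xyz)
    println(parseMapField(""))                 // Map(), the empty map from row "1,"
  }
}
```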
Review comment (on the whitespace-only DataFrameFunctionsSuite change above):
If possible, let's not touch this, because we didn't change anything in this file. That would be helpful for backporting: SPARK-25271 is reported as a regression in 2.3.x, so I assume we need to backport this to at least 2.4.1 and 2.3.3.