-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-27592][SQL] Set the bucketed data source table SerDe correctly #24486
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
6ce0a32
2fdc3a6
4e1dd5c
921bbb0
26c895a
87b302c
86e4394
842bd3e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -23,7 +23,7 @@ import org.apache.spark.sql.catalyst.catalog.CatalogTableType | |
| import org.apache.spark.sql.catalyst.parser.CatalystSqlParser | ||
| import org.apache.spark.sql.catalyst.plans.logical.SubqueryAlias | ||
| import org.apache.spark.sql.hive.test.TestHiveSingleton | ||
| import org.apache.spark.sql.internal.SQLConf | ||
| import org.apache.spark.sql.internal.{HiveSerDe, SQLConf} | ||
| import org.apache.spark.sql.test.{ExamplePointUDT, SQLTestUtils} | ||
| import org.apache.spark.sql.types._ | ||
|
|
||
|
|
@@ -284,4 +284,40 @@ class DataSourceWithHiveMetastoreCatalogSuite | |
| } | ||
|
|
||
| } | ||
|
|
||
| test("SPARK-27592 set the bucketed data source table SerDe correctly") { | ||
| val provider = "parquet" | ||
| withTable("t") { | ||
| spark.sql( | ||
| s""" | ||
| |CREATE TABLE t | ||
| |USING $provider | ||
| |CLUSTERED BY (c1) | ||
| |SORTED BY (c1) | ||
| |INTO 2 BUCKETS | ||
| |AS SELECT 1 AS c1, 2 AS c2 | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Only one row is hardly enough to prove Hive can read it correctly. Could you improve the tests? In addition, try to create a partitioned and bucketed table and see whether they are readable by Hive. You can create a separate test suite for it.
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| """.stripMargin) | ||
|
|
||
| val metadata = sessionState.catalog.getTableMetadata(TableIdentifier("t", Some("default"))) | ||
|
|
||
| val hiveSerDe = HiveSerDe.sourceToSerDe(provider).get | ||
| assert(metadata.storage.serde === hiveSerDe.serde) | ||
| assert(metadata.storage.inputFormat === hiveSerDe.inputFormat) | ||
| assert(metadata.storage.outputFormat === hiveSerDe.outputFormat) | ||
|
|
||
| // It's a bucketed table at Spark side | ||
| assert(sql("DESC FORMATTED t").collect().containsSlice( | ||
| Seq(Row("Num Buckets", "2", ""), Row("Bucket Columns", "[`c1`]", "")) | ||
| )) | ||
| checkAnswer(table("t"), Row(1, 2)) | ||
|
|
||
| // It's not a bucketed table at Hive side | ||
| val hiveSide = sparkSession.metadataHive.runSqlHive("DESC FORMATTED t") | ||
| assert(hiveSide.contains("Num Buckets: \t-1 \t ")) | ||
| assert(hiveSide.contains("Bucket Columns: \t[] \t ")) | ||
| assert(hiveSide.contains("\tspark.sql.sources.schema.numBuckets\t2 ")) | ||
| assert(hiveSide.contains("\tspark.sql.sources.schema.bucketCol.0\tc1 ")) | ||
| assert(sparkSession.metadataHive.runSqlHive("SELECT * FROM t") === Seq("1\t2")) | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we set
bucketSpec = None? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not necessary:
spark/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveClientImpl.scala
Lines 1009 to 1023 in fee695d