Skip to content

Commit e3c8160

Browse files
gatorsmilecloud-fan
authored and committed
[SPARK-20476][SQL] Block users to create a table that use commas in the column names
### What changes were proposed in this pull request? ```SQL hive> create table t1(`a,` string); OK Time taken: 1.399 seconds hive> create table t2(`a,` string, b string); FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.RuntimeException: MetaException(message:org.apache.hadoop.hive.serde2.SerDeException org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe: columns has 3 elements while columns.types has 2 elements!) hive> create table t2(`a,` string, b string) stored as parquet; FAILED: Execution Error, return code 1 from org.apache.hadoop.hive.ql.exec.DDLTask. java.lang.IllegalArgumentException: ParquetHiveSerde initialization failed. Number of column name and column type differs. columnNames = [a, , b], columnTypes = [string, string] ``` The failures above are caused by a bug in Hive metastore. When users do not provide an alias name in the SELECT query, we call `toPrettySQL` to generate the alias name. For example, the string `get_json_object(jstring, '$.f1')` will be the alias name for the function call in the statement ```SQL SELECT key, get_json_object(jstring, '$.f1') FROM tempView ``` The above is not an issue for SELECT query statements. However, for CTAS, we hit the issue due to the bug in Hive metastore. Hive metastore does not accept column names containing commas and returns a confusing error message, like: ``` 17/04/26 23:12:56 ERROR [hive.log(397) -- main]: error in initSerDe: org.apache.hadoop.hive.serde2.SerDeException org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe: columns has 2 elements while columns.types has 1 elements! org.apache.hadoop.hive.serde2.SerDeException: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe: columns has 2 elements while columns.types has 1 elements! ``` Thus, this PR blocks users from creating a table in Hive metastore when the table has a column whose name contains commas. ### How was this patch tested? 
Added a test case Author: Xiao Li <gatorsmile@gmail.com> Closes #17781 from gatorsmile/blockIllegalColumnNames.
1 parent 7fe8249 commit e3c8160

2 files changed

Lines changed: 42 additions & 0 deletions

File tree

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveExternalCatalog.scala

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -137,6 +137,22 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
137137
}
138138
}
139139

140+
/**
 * Validates the column names of a table before it is handed to the Hive metastore.
 *
 * Hive metastore disallows commas in data column names, so any such name is rejected up
 * front with a clear error. Partition columns are not subject to this restriction, and
 * neither are views, so only the data schema of non-view tables is inspected.
 *
 * @param table the table definition whose data column names are checked
 * @throws AnalysisException if a data column name of a non-view table contains a comma
 */
private def verifyColumnNames(table: CatalogTable): Unit = {
  val isView = table.tableType == VIEW
  if (!isView) {
    // Only the first offending column is reported, in schema order.
    table.dataSchema.map(_.name).find(_.contains(",")).foreach { badName =>
      throw new AnalysisException("Cannot create a table having a column whose name contains " +
        s"commas in Hive metastore. Table: ${table.identifier}; Column: $badName")
    }
  }
}
155+
140156
// --------------------------------------------------------------------------
141157
// Databases
142158
// --------------------------------------------------------------------------
@@ -202,6 +218,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
202218
val table = tableDefinition.identifier.table
203219
requireDbExists(db)
204220
verifyTableProperties(tableDefinition)
221+
verifyColumnNames(tableDefinition)
205222

206223
if (tableExists(db, table) && !ignoreIfExists) {
207224
throw new TableAlreadyExistsException(db = db, table = table)
@@ -614,6 +631,7 @@ private[spark] class HiveExternalCatalog(conf: SparkConf, hadoopConf: Configurat
614631
requireTableExists(db, table)
615632
val rawTable = getRawTable(db, table)
616633
val withNewSchema = rawTable.copy(schema = schema)
634+
verifyColumnNames(withNewSchema)
617635
// Add table metadata such as table schema, partition columns, etc. to table properties.
618636
val updatedTable = withNewSchema.copy(
619637
properties = withNewSchema.properties ++ tableMetaToTableProps(withNewSchema))

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1976,6 +1976,30 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
19761976
}
19771977
}
19781978

1979+
test("Auto alias construction of get_json_object") {
  val df = Seq(("1", """{"f1": "value1", "f5": 5.23}""")).toDF("key", "jstring")
  val expectedMsg = "Cannot create a table having a column whose name contains commas " +
    "in Hive metastore. Table: `default`.`t`; Column: get_json_object(jstring, $.f1)"

  // Runs `action` and checks that it fails with an AnalysisException whose message
  // contains the expected comma-in-column-name error.
  def assertRejected(action: => Any): Unit = {
    val message = intercept[AnalysisException](action).getMessage
    assert(message.contains(expectedMsg))
  }

  // DataFrameWriter path: the auto-generated alias contains a comma.
  withTable("t") {
    assertRejected {
      df.select($"key", functions.get_json_object($"jstring", "$.f1"))
        .write.format("hive").saveAsTable("t")
    }
  }

  // SQL CTAS path: same auto-generated alias, same expected failure.
  withTempView("tempView") {
    withTable("t") {
      df.createTempView("tempView")
      assertRejected {
        sql("CREATE TABLE t AS SELECT key, get_json_object(jstring, '$.f1') FROM tempView")
      }
    }
  }
}
2002+
19792003
test("SPARK-19912 String literals should be escaped for Hive metastore partition pruning") {
19802004
withTable("spark_19912") {
19812005
Seq(

0 commit comments

Comments
 (0)