Skip to content

Commit 78eae7e

Browse files
hvanhovell authored and cloud-fan committed
[SPARK-19459] Support for nested char/varchar fields in ORC
## What changes were proposed in this pull request?

This PR is a small follow-up on apache#16804. It also adds support for nested char/varchar fields in ORC.

## How was this patch tested?

I have added a regression test to the OrcSourceSuite.

Author: Herman van Hovell <hvanhovell@databricks.com>

Closes apache#17030 from hvanhovell/SPARK-19459-follow-up.
1 parent 93aa427 commit 78eae7e

3 files changed

Lines changed: 100 additions & 19 deletions

File tree

sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala

Lines changed: 19 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
7676
}
7777

7878
/**
 * Parse a standalone data type (the `singleDataType` grammar rule). The raw parsed type
 * may contain Hive CHAR/VARCHAR placeholders, so it is routed through
 * `visitSparkDataType`, which replaces them with [[StringType]] before the type is
 * exposed to the rest of the analyzer.
 */
override def visitSingleDataType(ctx: SingleDataTypeContext): DataType = withOrigin(ctx) {
  visitSparkDataType(ctx.dataType)
}
8181

8282
/* ********************************************************************************************
@@ -1006,7 +1006,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
10061006
* Create a [[Cast]] expression.
10071007
*/
10081008
/**
 * Create a [[Cast]] expression. The target type is resolved via `visitSparkDataType`
 * so that Hive CHAR/VARCHAR types are normalized to [[StringType]] — a cast target must
 * never be a parse-only Hive string type.
 */
override def visitCast(ctx: CastContext): Expression = withOrigin(ctx) {
  Cast(expression(ctx.expression), visitSparkDataType(ctx.dataType))
}
10111011

10121012
/**
@@ -1424,6 +1424,13 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
14241424
/* ********************************************************************************************
14251425
* DataType parsing
14261426
* ******************************************************************************************** */
1427+
/**
 * Create a Spark DataType from a parsed type tree.
 *
 * The parser materializes Hive CHAR/VARCHAR as `HiveStringType` placeholders; this
 * helper recursively replaces them (including inside arrays, maps, and structs) with
 * [[StringType]], which is the only string type the analyzer understands.
 */
private def visitSparkDataType(ctx: DataTypeContext): DataType = {
  HiveStringType.replaceCharType(typedVisit(ctx))
}
1433+
14271434
/**
14281435
* Resolve/create a primitive type.
14291436
*/
@@ -1438,8 +1445,9 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
14381445
case ("double", Nil) => DoubleType
14391446
case ("date", Nil) => DateType
14401447
case ("timestamp", Nil) => TimestampType
1441-
case ("char" | "varchar" | "string", Nil) => StringType
1442-
case ("char" | "varchar", _ :: Nil) => StringType
1448+
case ("string", Nil) => StringType
1449+
case ("char", length :: Nil) => CharType(length.getText.toInt)
1450+
case ("varchar", length :: Nil) => VarcharType(length.getText.toInt)
14431451
case ("binary", Nil) => BinaryType
14441452
case ("decimal", Nil) => DecimalType.USER_DEFAULT
14451453
case ("decimal", precision :: Nil) => DecimalType(precision.getText.toInt, 0)
@@ -1461,7 +1469,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
14611469
case SqlBaseParser.MAP =>
14621470
MapType(typedVisit(ctx.dataType(0)), typedVisit(ctx.dataType(1)))
14631471
case SqlBaseParser.STRUCT =>
1464-
createStructType(ctx.complexColTypeList())
1472+
StructType(Option(ctx.complexColTypeList).toSeq.flatMap(visitComplexColTypeList))
14651473
}
14661474
}
14671475

@@ -1480,7 +1488,7 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
14801488
}
14811489

14821490
/**
1483-
* Create a [[StructField]] from a column definition.
1491+
* Create a top level [[StructField]] from a column definition.
14841492
*/
14851493
override def visitColType(ctx: ColTypeContext): StructField = withOrigin(ctx) {
14861494
import ctx._
@@ -1491,19 +1499,15 @@ class AstBuilder extends SqlBaseBaseVisitor[AnyRef] with Logging {
14911499
builder.putString("comment", string(STRING))
14921500
}
14931501
// Add Hive type string to metadata.
1494-
dataType match {
1495-
case p: PrimitiveDataTypeContext =>
1496-
p.identifier.getText.toLowerCase match {
1497-
case "varchar" | "char" =>
1498-
builder.putString(HIVE_TYPE_STRING, dataType.getText.toLowerCase)
1499-
case _ =>
1500-
}
1501-
case _ =>
1502+
val rawDataType = typedVisit[DataType](ctx.dataType)
1503+
val cleanedDataType = HiveStringType.replaceCharType(rawDataType)
1504+
if (rawDataType != cleanedDataType) {
1505+
builder.putString(HIVE_TYPE_STRING, rawDataType.catalogString)
15021506
}
15031507

15041508
StructField(
15051509
identifier.getText,
1506-
typedVisit(dataType),
1510+
cleanedDataType,
15071511
nullable = true,
15081512
builder.build())
15091513
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.spark.sql.types
18+
19+
import scala.math.Ordering
20+
import scala.reflect.runtime.universe.typeTag
21+
22+
import org.apache.spark.sql.catalyst.ScalaReflectionLock
23+
import org.apache.spark.unsafe.types.UTF8String
24+
25+
/**
26+
* A hive string type for compatibility. These datatypes should only used for parsing,
27+
* and should NOT be used anywhere else. Any instance of these data types should be
28+
* replaced by a [[StringType]] before analysis.
29+
*/
30+
/**
 * A Hive string type retained only for parser compatibility. These data types should only
 * be used during parsing and should NOT appear anywhere else; every instance must be
 * replaced by a [[StringType]] before analysis (see [[HiveStringType.replaceCharType]]).
 */
sealed abstract class HiveStringType extends AtomicType {
  private[sql] type InternalType = UTF8String

  // Declared length of the char/varchar type, as written in the DDL.
  def length: Int

  // The default size equals the declared length of the type.
  override def defaultSize: Int = length

  private[sql] val ordering = implicitly[Ordering[InternalType]]

  @transient private[sql] lazy val tag = ScalaReflectionLock.synchronized {
    typeTag[InternalType]
  }

  // A Hive string type carries no nullability of its own; it is its own nullable form.
  private[spark] override def asNullable: HiveStringType = this
}
45+
46+
object HiveStringType {
  /**
   * Recursively rewrite a data type, substituting [[StringType]] for every
   * [[HiveStringType]] it contains. Container types (arrays, maps, structs) are
   * rebuilt with their element/key/value/field types rewritten; all other types
   * pass through unchanged.
   */
  def replaceCharType(dataType: DataType): DataType = dataType match {
    case _: HiveStringType =>
      StringType
    case ArrayType(elementType, containsNull) =>
      ArrayType(replaceCharType(elementType), containsNull)
    case MapType(keyType, valueType, valueContainsNull) =>
      MapType(
        replaceCharType(keyType),
        replaceCharType(valueType),
        valueContainsNull)
    case StructType(fields) =>
      val rewritten = fields.map(f => f.copy(dataType = replaceCharType(f.dataType)))
      StructType(rewritten)
    case other =>
      other
  }
}
60+
61+
/**
62+
* Hive char type.
63+
*/
64+
/**
 * Hive CHAR type (fixed length). Parse-time placeholder only; replaced by
 * [[StringType]] before analysis.
 */
case class CharType(length: Int) extends HiveStringType {
  override def simpleString: String = "char(" + length + ")"
}
67+
68+
/**
69+
* Hive varchar type.
70+
*/
71+
/**
 * Hive VARCHAR type (bounded length). Parse-time placeholder only; replaced by
 * [[StringType]] before analysis.
 */
case class VarcharType(length: Int) extends HiveStringType {
  override def simpleString: String = "varchar(" + length + ")"
}

sql/hive/src/test/scala/org/apache/spark/sql/hive/orc/OrcSourceSuite.scala

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -162,13 +162,16 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA
162162
|CREATE EXTERNAL TABLE hive_orc(
163163
| a STRING,
164164
| b CHAR(10),
165-
| c VARCHAR(10))
165+
| c VARCHAR(10),
166+
| d ARRAY<CHAR(3)>)
166167
|STORED AS orc""".stripMargin)
167168
// Hive throws an exception if I assign the location in the create table statement.
168169
hiveClient.runSqlHive(
169170
s"ALTER TABLE hive_orc SET LOCATION '$uri'")
170171
hiveClient.runSqlHive(
171-
"INSERT INTO TABLE hive_orc SELECT 'a', 'b', 'c' FROM (SELECT 1) t")
172+
"""INSERT INTO TABLE hive_orc
173+
|SELECT 'a', 'b', 'c', ARRAY(CAST('d' AS CHAR(3)))
174+
|FROM (SELECT 1) t""".stripMargin)
172175

173176
// We create a different table in Spark using the same schema which points to
174177
// the same location.
@@ -177,10 +180,11 @@ abstract class OrcSuite extends QueryTest with TestHiveSingleton with BeforeAndA
177180
|CREATE EXTERNAL TABLE spark_orc(
178181
| a STRING,
179182
| b CHAR(10),
180-
| c VARCHAR(10))
183+
| c VARCHAR(10),
184+
| d ARRAY<CHAR(3)>)
181185
|STORED AS orc
182186
|LOCATION '$uri'""".stripMargin)
183-
val result = Row("a", "b ", "c")
187+
val result = Row("a", "b ", "c", Seq("d "))
184188
checkAnswer(spark.table("hive_orc"), result)
185189
checkAnswer(spark.table("spark_orc"), result)
186190
} finally {

0 commit comments

Comments
 (0)