@@ -34,12 +34,12 @@ import org.apache.hadoop.mapreduce._
import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}

import org.apache.spark.TaskContext
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.hive.{HiveInspectors, HiveShim}
import org.apache.spark.sql.sources.{Filter, _}
import org.apache.spark.sql.sources._
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.SerializableConfiguration

@@ -83,6 +83,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
classOf[MapRedOutputFormat[_, _]])
}

dataSchema.map(_.name).foreach(checkFieldName)

new OutputWriterFactory {
override def newInstance(
path: String,
@@ -169,6 +171,16 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
}
}
}

private def checkFieldName(name: String): Unit = {
// ,;{}()\n\t= and space are special characters in ORC schema
@tejasapatil (Contributor) commented on Sep 4, 2017:
Is this an exhaustive list? E.g., it looks like '?' is not allowed either. Given that the underlying lib (ORC) can evolve to support / not support certain chars, it's safer to rely on some method rather than coming up with a blacklist. Can you simply call TypeInfoUtils.getTypeInfoFromTypeString or any related method which would do this check?

Caused by: java.lang.IllegalArgumentException: Error: : expected at the position 8 of 'struct<i?:int,j:int,k:string>' but '?' is found.
  at org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils$TypeInfoParser.expect(TypeInfoUtils.java:360)
  at org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils$TypeInfoParser.expect(TypeInfoUtils.java:331)
  at org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils$TypeInfoParser.parseType(TypeInfoUtils.java:483)
  at org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils$TypeInfoParser.parseTypeInfos(TypeInfoUtils.java:305)
  at org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils.getTypeInfoFromTypeString(TypeInfoUtils.java:770)
  at org.apache.spark.sql.hive.orc.OrcSerializer.<init>(OrcFileFormat.scala:194)
  at org.apache.spark.sql.hive.orc.OrcOutputWriter.<init>(OrcFileFormat.scala:231)
  at org.apache.spark.sql.hive.orc.OrcFileFormat$$anon$1.newInstance(OrcFileFormat.scala:91)
...

The PR author (Member) replied:
Thank you for the review, @tejasapatil!
That's a good idea. Right, it's not an exhaustive list. I'll update the PR.
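
As an illustration only, here is a minimal sketch of what delegating the check to Hive's type-string parser could look like, reusing the TypeInfoUtils.getTypeInfoFromTypeString call that appears in the stack trace above. The method name checkFieldName and the error wording are carried over from the current diff; the actual follow-up commit may differ.

import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoUtils
import org.apache.spark.sql.AnalysisException

// Sketch: instead of a hand-maintained character blacklist, ask Hive's own
// type-string parser whether it accepts the field name.
private def checkFieldName(name: String): Unit = {
  try {
    // Embed the name in a dummy struct type string; names the ORC/Hive parser
    // rejects (e.g. "i?" or "a b") make this call throw IllegalArgumentException,
    // as shown in the stack trace above.
    TypeInfoUtils.getTypeInfoFromTypeString(s"struct<$name:int>")
  } catch {
    case _: IllegalArgumentException =>
      throw new AnalysisException(
        s"""Attribute name "$name" contains invalid character(s). Please use alias to rename it.""")
  }
}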

if (name.matches(".*[ ,;{}()\n\t=].*")) {
throw new AnalysisException(
s"""Attribute name "$name" contains invalid character(s) among " ,;{}()\\n\\t=".
|Please use alias to rename it.
""".stripMargin.split("\n").mkString(" ").trim)
}
}
}

private[orc] class OrcSerializer(dataSchema: StructType, conf: Configuration)
@@ -2000,4 +2000,13 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
assert(setOfPath.size() == pathSizeToDeleteOnExit)
}
}

test("SPARK-21912 Creating ORC datasource table should check invalid column names") {
withTable("orc1") {
val m = intercept[AnalysisException] {
sql("CREATE TABLE orc1 USING ORC AS SELECT 1 `a b`")
}.getMessage
assert(m.contains("""Attribute name "a b" contains invalid character(s)"""))
}
}
}
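
For reference, a hypothetical spark-shell repro of the behavior this test exercises; the column name, output path, and the exact exception text (which follows the message added in this diff) are illustrative only:

// Hypothetical repro on a build that includes this patch.
// Writing ORC data whose schema has a column name containing a space should now
// fail fast with an AnalysisException instead of a low-level Hive type-parser error.
spark.range(1).selectExpr("id AS `a b`")
  .write.format("orc").save("/tmp/orc_invalid_name")
// org.apache.spark.sql.AnalysisException: Attribute name "a b" contains invalid
//   character(s) among " ,;{}()\n\t=". Please use alias to rename it.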