@@ -32,14 +32,15 @@ import org.apache.hadoop.io.{NullWritable, Writable}
 import org.apache.hadoop.mapred.{JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter}
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
+import org.apache.orc.TypeDescription

 import org.apache.spark.TaskContext
-import org.apache.spark.sql.{Row, SparkSession}
+import org.apache.spark.sql._
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.execution.datasources._
 import org.apache.spark.sql.hive.{HiveInspectors, HiveShim}
-import org.apache.spark.sql.sources.{Filter, _}
+import org.apache.spark.sql.sources._
 import org.apache.spark.sql.types.StructType
 import org.apache.spark.util.SerializableConfiguration
@@ -83,6 +84,8 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
           classOf[MapRedOutputFormat[_, _]])
     }

+    dataSchema.map(_.name).foreach(checkFieldName)
+
     new OutputWriterFactory {
       override def newInstance(
           path: String,
@@ -169,6 +172,18 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
       }
     }
   }
+
+  private def checkFieldName(name: String): Unit = {
+    try {
+      TypeDescription.fromString(s"struct<$name:int>")
+    } catch {
+      case _: IllegalArgumentException =>
+        throw new AnalysisException(
+          s"""Attribute name "$name" contains invalid character(s).
+             |Please use alias to rename it.
+           """.stripMargin.split("\n").mkString(" ").trim)
+    }
+  }
 }

 private[orc] class OrcSerializer(dataSchema: StructType, conf: Configuration)

Review thread on the new error message:

Member: Nit: Attribute -> Column

Member Author: Thank you for review. Sure.
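Side note on the technique: rather than hard-coding a banned-character list the way Parquet does, the patch probes ORC's own schema parser, so validity stays in sync with whatever ORC actually accepts. A minimal sketch of that probe (the helper name isValidOrcFieldName is ours, not part of the patch):

    import org.apache.orc.TypeDescription

    // Illustrative helper mirroring checkFieldName above: ORC's type parser
    // throws IllegalArgumentException when a field name cannot appear
    // verbatim inside a struct type string.
    def isValidOrcFieldName(name: String): Boolean =
      try {
        TypeDescription.fromString(s"struct<$name:int>")
        true
      } catch {
        case _: IllegalArgumentException => false
      }

    isValidOrcFieldName("col1")  // true
    isValidOrcFieldName("a b")   // false: the space breaks the type string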
@@ -2000,4 +2000,15 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       assert(setOfPath.size() == pathSizeToDeleteOnExit)
     }
   }
+
+  test("SPARK-21912 Creating ORC datasource table should check invalid column names") {
+    withTable("orc1") {
+      Seq(" ", "?", ",", ";", "{", "}", "(", ")", "\n", "\t", "=").foreach { name =>
+        val m = intercept[AnalysisException] {
+          sql(s"CREATE TABLE orc1 USING ORC AS SELECT 1 `column$name`")
+        }.getMessage
+        assert(m.contains(s"contains invalid character(s)"))
+      }
+    }
+  }
 }

Review thread on the CTAS statement:

Member: This is CTAS. How about CREATE TABLE?

Member Author (@dongjoon-hyun, Sep 5, 2017): Yep. I'll check the code path, too.

Member Author: It seems to be the same situation as with Parquet: CREATE TABLE passes, but SELECT raises an exception.

    scala> sql("CREATE TABLE parquet1(`a b` int) using parquet")
    res1: org.apache.spark.sql.DataFrame = []

    scala> sql("select * from parquet1").show
    org.apache.spark.sql.AnalysisException: Attribute name "a b" contains invalid character(s) among " ,;{}()\n\t=". Please use alias to rename it.;

Member Author: Do we need to add a datasource-specific check in createDataSourceTables for Parquet and ORC?

Member Author (@dongjoon-hyun, Sep 5, 2017): @gatorsmile, I tried the following in CreateDataSourceTableCommand. We can add a check for ParquetFileFormat, but not for OrcFileFormat. Should I change the PR title and scope instead?

    table.provider.get.toLowerCase match {
      case "parquet" =>
        dataSource.schema.map(_.name).foreach(
          org.apache.spark.sql.execution.datasources.parquet.ParquetSchemaConverter.checkFieldName)
      case "orc" =>
        dataSource.schema.map(_.name).foreach(
          org.apache.spark.sql.hive.OrcRelation.checkFieldName)
      case _ => // other providers: no column-name restriction to enforce
    }
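(Aside: the Parquet-side check referenced in this snippet is a character blacklist rather than a parser probe. A rough sketch, reconstructed from the error message in the REPL output above; the helper name is ours, not Spark's:)

    // Sketch only: mirrors the banned set " ,;{}()\n\t=" quoted in the
    // AnalysisException above; the real logic lives in
    // ParquetSchemaConverter.checkFieldName.
    def isValidParquetFieldName(name: String): Boolean =
      !name.matches(".*[ ,;{}()\n\t=].*")

    isValidParquetFieldName("a b")  // false: space is in the banned set
    isValidParquetFieldName("a_b")  // true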

Member Author: I'll try in another way.
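To make the CTAS-vs-CREATE TABLE question concrete, a plain (non-CTAS) variant of the test might look like the sketch below. Whether it actually throws at creation time depends on where the check ends up wired in, which is exactly what the thread is still settling. The table name orc2 and the expected failure are assumptions, not part of the diff.

    // Hypothetical non-CTAS variant (not in this diff); assumes the
    // column-name check also fires on the plain CREATE TABLE path. Per the
    // thread above, Parquet currently lets CREATE TABLE pass and only fails
    // at SELECT time, so this expectation may not hold yet.
    test("SPARK-21912 CREATE TABLE with an invalid column name") {
      withTable("orc2") {
        val m = intercept[AnalysisException] {
          sql("CREATE TABLE orc2(`a b` INT) USING ORC")
        }.getMessage
        assert(m.contains("contains invalid character(s)"))
      }
    }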