
Commit 8d212f0

[SPARK-15474][SQL] Write and read back non-empty schema with empty dataframe
1 parent 5433be4 commit 8d212f0

3 files changed

Lines changed: 63 additions & 8 deletions


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/orc/OrcFileFormat.scala

Lines changed: 37 additions & 3 deletions
@@ -17,12 +17,18 @@
 
 package org.apache.spark.sql.execution.datasources.orc
 
-import org.apache.orc.TypeDescription
+import java.io._
 
-import org.apache.spark.sql.AnalysisException
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.fs.{FileStatus, FileSystem, Path}
+import org.apache.orc.{OrcFile, TypeDescription}
+
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.{AnalysisException, SparkSession}
+import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
 import org.apache.spark.sql.types.StructType
 
-private[sql] object OrcFileFormat {
+private[sql] object OrcFileFormat extends Logging {
   private def checkFieldName(name: String): Unit = {
     try {
       TypeDescription.fromString(s"struct<$name:int>")
@@ -39,4 +45,32 @@ private[sql] object OrcFileFormat {
     schema.fieldNames.foreach(checkFieldName)
     schema
   }
+
+  def getSchemaString(schema: StructType): String = {
+    schema.fields.map(f => s"${f.name}:${f.dataType.catalogString}").mkString("struct<", ",", ">")
+  }
+
+  private def readSchema(file: Path, conf: Configuration, fs: FileSystem) = {
+    try {
+      val readerOptions = OrcFile.readerOptions(conf).filesystem(fs)
+      val reader = OrcFile.createReader(file, readerOptions)
+      val schema = reader.getSchema
+      if (schema.getFieldNames.size == 0) {
+        None
+      } else {
+        Some(schema)
+      }
+    } catch {
+      case _: IOException => None
+    }
+  }
+
+  def readSchema(sparkSession: SparkSession, files: Seq[FileStatus]): Option[StructType] = {
+    val conf = sparkSession.sparkContext.hadoopConfiguration
+    val fs = FileSystem.get(conf)
+    files.map(_.getPath).flatMap(readSchema(_, conf, fs)).headOption.map { schema =>
+      logDebug(s"Reading schema from file $files, got Hive schema string: $schema")
+      CatalystSqlParser.parseDataType(schema.toString).asInstanceOf[StructType]
+    }
+  }
 }
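
For reference, the round trip this relies on can be sketched as follows (not part of the commit; field names are illustrative, assuming a Spark 2.x classpath): getSchemaString renders a Catalyst StructType in the same struct<name:type,...> form that ORC's TypeDescription.toString produces, so CatalystSqlParser can parse either string back into a StructType.

    import org.apache.spark.sql.catalyst.parser.CatalystSqlParser
    import org.apache.spark.sql.types._

    val schema = StructType(Seq(
      StructField("b", BooleanType),
      StructField("i", IntegerType),
      StructField("s", StringType)))

    // Mirrors getSchemaString: "name:catalogString" pairs wrapped in struct<...>.
    val schemaString = schema.fields
      .map(f => s"${f.name}:${f.dataType.catalogString}")
      .mkString("struct<", ",", ">")
    // schemaString == "struct<b:boolean,i:int,s:string>"

    // Mirrors the parsing step in readSchema, where TypeDescription.toString
    // yields the same struct<...> form.
    val parsed = CatalystSqlParser.parseDataType(schemaString).asInstanceOf[StructType]
    assert(parsed.sameType(schema))

Note also that the public readSchema returns the schema of the first file whose footer lists at least one field (flatMap plus headOption), so files written with an empty schema by older Spark versions are simply skipped.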

sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFileFormat.scala

Lines changed: 12 additions & 5 deletions
@@ -32,7 +32,7 @@ import org.apache.hadoop.io.{NullWritable, Writable}
 import org.apache.hadoop.mapred.{JobConf, OutputFormat => MapRedOutputFormat, RecordWriter, Reporter}
 import org.apache.hadoop.mapreduce._
 import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat, FileSplit}
-import org.apache.orc.OrcConf.COMPRESS
+import org.apache.orc.OrcConf.{COMPRESS, MAPRED_OUTPUT_SCHEMA}
 
 import org.apache.spark.TaskContext
 import org.apache.spark.sql.SparkSession
@@ -58,10 +58,7 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
       sparkSession: SparkSession,
       options: Map[String, String],
       files: Seq[FileStatus]): Option[StructType] = {
-    OrcFileOperator.readSchema(
-      files.map(_.getPath.toString),
-      Some(sparkSession.sessionState.newHadoopConf())
-    )
+    org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.readSchema(sparkSession, files)
   }
 
   override def prepareWrite(
@@ -73,6 +70,10 @@ class OrcFileFormat extends FileFormat with DataSourceRegister with Serializable
 
     val configuration = job.getConfiguration
 
+    configuration.set(
+      MAPRED_OUTPUT_SCHEMA.getAttribute,
+      org.apache.spark.sql.execution.datasources.orc.OrcFileFormat.getSchemaString(dataSchema))
+
     configuration.set(COMPRESS.getAttribute, orcOptions.compressionCodec)
     configuration match {
       case conf: JobConf =>
@@ -252,6 +253,12 @@ private[orc] class OrcOutputWriter(
   override def close(): Unit = {
     if (recordWriterInstantiated) {
       recordWriter.close(Reporter.NULL)
+    } else {
+      // SPARK-15474 Write empty orc file with correct schema
+      val conf = context.getConfiguration()
+      val writer = org.apache.orc.OrcFile.createWriter(
+        new Path(path), org.apache.orc.mapred.OrcOutputFormat.buildOptions(conf))
+      new org.apache.orc.mapreduce.OrcMapreduceRecordWriter(writer).close(context)
     }
   }
 }
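
Two pieces cooperate here: prepareWrite stores the data schema under MAPRED_OUTPUT_SCHEMA, and OrcOutputFormat.buildOptions picks that attribute back up from the configuration when close() creates the fallback writer, so a task that saw no rows still emits an ORC file whose footer carries the real schema. A minimal standalone sketch of that fallback, using only orc-core (illustrative path and schema, not code from the patch):

    import org.apache.hadoop.conf.Configuration
    import org.apache.hadoop.fs.Path
    import org.apache.orc.{OrcFile, TypeDescription}

    val conf = new Configuration()
    val schema = TypeDescription.fromString("struct<b:boolean,i:int,s:string>")

    // Opening and immediately closing a writer produces a valid zero-row ORC
    // file whose footer still records the schema, which is all readSchema needs.
    val writer = OrcFile.createWriter(
      new Path("/tmp/empty.orc"),
      OrcFile.writerOptions(conf).setSchema(schema))
    writer.close()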

sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/SQLQuerySuite.scala

Lines changed: 14 additions & 0 deletions
@@ -2153,4 +2153,18 @@ class SQLQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleton {
       }
     }
   }
+
+  Seq("orc", "parquet").foreach { format =>
+    test(s"SPARK-15474 Write and read back non-empty schema with empty dataframe - $format") {
+      withTempPath { file =>
+        val path = file.getCanonicalPath
+        val emptyDf = Seq((true, 1, "str")).toDF.limit(0)
+        emptyDf.write.format(format).save(path)
+
+        val df = spark.read.format(format).load(path)
+        assert(df.schema.sameType(emptyDf.schema))
+        checkAnswer(df, emptyDf)
+      }
+    }
+  }
 }
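
The test compares schemas with sameType, which ignores nullability, since field nullability is not guaranteed to survive the write/read round trip exactly. An illustrative spark-shell session for the same round trip (hypothetical output path, not part of the commit):

    import spark.implicits._

    val emptyDf = Seq((true, 1, "str")).toDF("b", "i", "s").limit(0)
    emptyDf.write.format("orc").save("/tmp/spark-15474")

    // Before this patch, the ORC files carried no schema, so the read failed;
    // now the schema round-trips even with zero rows.
    val df = spark.read.format("orc").load("/tmp/spark-15474")
    df.printSchema()   // b: boolean, i: int, s: string
    assert(df.count() == 0)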
