
Commit 5b628f8

HeartSaVioR authored and gengliangwang committed
### What changes were proposed in this pull request?

This reverts commit 31c4fab (#23052) to make sure the partition calling `ManifestFileCommitProtocol.newTaskTempFile` creates an actual file. It also reverts part of commit 0d3d46d (#26639), since that commit fixes the issue raised by 31c4fab and we are reverting back. The reason for the partial revert is that the UT is worth keeping as it is to prevent regression, given that it can detect the issue of an empty partition producing no actual file. This PR makes one more change to the UT: it is intentionally moved so that both DSv1 and DSv2 are tested.

### Why are the changes needed?

After the changes in SPARK-26081 (commit 31c4fab / #23052), CSV/JSON/TEXT don't create an actual file if the partition is empty. This optimization causes a problem in `ManifestFileCommitProtocol`: the API `newTaskTempFile` is called without actual file creation, and `fs.getFileStatus` then throws `FileNotFoundException` because the file was never created.

SPARK-29999 (commit 0d3d46d / #26639) fixes the problem, but it is too costly to check file existence on each task commit. We should simply restore the behavior before SPARK-26081.

### Does this PR introduce any user-facing change?

No.

### How was this patch tested?

Jenkins build will follow.

Closes #26671 from HeartSaVioR/revert-SPARK-26081-SPARK-29999.

Authored-by: Jungtaek Lim (HeartSaVioR) <[email protected]>
Signed-off-by: Gengliang Wang <[email protected]>
1 parent bdf0c60 commit 5b628f8

8 files changed

Lines changed: 73 additions & 131 deletions
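
The failure chain described in the commit message is worth spelling out: with SPARK-26081 in place, `ManifestFileCommitProtocol.newTaskTempFile` hands out a path that the CSV/JSON/TEXT writer never materializes for an empty partition, and task commit then calls `fs.getFileStatus` on a path that does not exist. A minimal, hypothetical sketch of that failure mode, using plain Hadoop `FileSystem` calls against a local path (the object name and placeholder path are illustrative, not Spark's actual code):

import java.io.FileNotFoundException

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path

object EmptyPartitionFailureSketch {
  def main(args: Array[String]): Unit = {
    // Path handed out by newTaskTempFile for a task; under SPARK-26081 the
    // CSV/JSON/TEXT writer never creates it when the partition is empty.
    val taskTempFile = new Path("/tmp/sink-output/part-00007-attempt-0.txt")
    val fs = taskTempFile.getFileSystem(new Configuration())

    try {
      // What commitTask does after this revert: it assumes every added file exists.
      val status = fs.getFileStatus(taskTempFile)
      println(s"committed ${status.getPath} (${status.getLen} bytes)")
    } catch {
      case e: FileNotFoundException =>
        // The exception reported before this revert, when no actual file was written.
        println(s"commitTask would fail: ${e.getMessage}")
    }
  }
}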


sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/csv/CsvOutputWriter.scala

Lines changed: 7 additions & 15 deletions
@@ -33,25 +33,17 @@ class CsvOutputWriter(
     context: TaskAttemptContext,
     params: CSVOptions) extends OutputWriter with Logging {
 
-  private var univocityGenerator: Option[UnivocityGenerator] = None
+  private val charset = Charset.forName(params.charset)
+
+  private val writer = CodecStreams.createOutputStreamWriter(context, new Path(path), charset)
+
+  private val gen = new UnivocityGenerator(dataSchema, writer, params)
 
   if (params.headerFlag) {
-    val gen = getGen()
     gen.writeHeaders()
   }
 
-  private def getGen(): UnivocityGenerator = univocityGenerator.getOrElse {
-    val charset = Charset.forName(params.charset)
-    val os = CodecStreams.createOutputStreamWriter(context, new Path(path), charset)
-    val newGen = new UnivocityGenerator(dataSchema, os, params)
-    univocityGenerator = Some(newGen)
-    newGen
-  }
-
-  override def write(row: InternalRow): Unit = {
-    val gen = getGen()
-    gen.write(row)
-  }
+  override def write(row: InternalRow): Unit = gen.write(row)
 
-  override def close(): Unit = univocityGenerator.foreach(_.close())
+  override def close(): Unit = gen.close()
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/json/JsonOutputWriter.scala

Lines changed: 8 additions & 10 deletions
@@ -44,20 +44,18 @@ class JsonOutputWriter(
         " which can be read back by Spark only if multiLine is enabled.")
   }
 
-  private var jacksonGenerator: Option[JacksonGenerator] = None
+  private val writer = CodecStreams.createOutputStreamWriter(context, new Path(path), encoding)
 
-  override def write(row: InternalRow): Unit = {
-    val gen = jacksonGenerator.getOrElse {
-      val os = CodecStreams.createOutputStreamWriter(context, new Path(path), encoding)
-      // create the Generator without separator inserted between 2 records
-      val newGen = new JacksonGenerator(dataSchema, os, options)
-      jacksonGenerator = Some(newGen)
-      newGen
-    }
+  // create the Generator without separator inserted between 2 records
+  private[this] val gen = new JacksonGenerator(dataSchema, writer, options)
 
+  override def write(row: InternalRow): Unit = {
     gen.write(row)
     gen.writeLineEnding()
   }
 
-  override def close(): Unit = jacksonGenerator.foreach(_.close())
+  override def close(): Unit = {
+    gen.close()
+    writer.close()
+  }
 }

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/text/TextOutputWriter.scala

Lines changed: 4 additions & 12 deletions
@@ -16,8 +16,6 @@
  */
 package org.apache.spark.sql.execution.datasources.text
 
-import java.io.OutputStream
-
 import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.TaskAttemptContext
 
@@ -32,23 +30,17 @@ class TextOutputWriter(
     context: TaskAttemptContext)
   extends OutputWriter {
 
-  private var outputStream: Option[OutputStream] = None
+  private val writer = CodecStreams.createOutputStream(context, new Path(path))
 
   override def write(row: InternalRow): Unit = {
-    val os = outputStream.getOrElse {
-      val newStream = CodecStreams.createOutputStream(context, new Path(path))
-      outputStream = Some(newStream)
-      newStream
-    }
-
     if (!row.isNullAt(0)) {
       val utf8string = row.getUTF8String(0)
-      utf8string.writeTo(os)
+      utf8string.writeTo(writer)
     }
-    os.write(lineSeparator)
+    writer.write(lineSeparator)
   }
 
   override def close(): Unit = {
-    outputStream.foreach(_.close())
+    writer.close()
  }
 }
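
The CSV, JSON, and TEXT writers above all move from lazily opening the output stream on the first write back to opening it eagerly in the constructor, which is what guarantees that even a task whose partition is empty leaves a file behind for the commit protocol to pick up. A self-contained sketch of the two strategies, assuming hypothetical LazyWriter/EagerWriter classes over plain java.io rather than Spark's OutputWriter API:

import java.io.{File, FileOutputStream, OutputStream}

// SPARK-26081 style: the stream is created on the first write, so a writer
// that is opened and closed without writing any row produces no file at all.
class LazyWriter(path: String) {
  private var out: Option[OutputStream] = None
  private def stream: OutputStream = out.getOrElse {
    val s = new FileOutputStream(path)
    out = Some(s)
    s
  }
  def write(line: String): Unit = stream.write((line + "\n").getBytes("UTF-8"))
  def close(): Unit = out.foreach(_.close())
}

// Behavior restored by this commit: the stream is opened in the constructor,
// so even a task that writes zero rows leaves an (empty) file behind.
class EagerWriter(path: String) {
  private val out = new FileOutputStream(path)
  def write(line: String): Unit = out.write((line + "\n").getBytes("UTF-8"))
  def close(): Unit = out.close()
}

object WriterSketch {
  def main(args: Array[String]): Unit = {
    val lazyWriter = new LazyWriter("lazy-part.txt")
    lazyWriter.close()
    val eagerWriter = new EagerWriter("eager-part.txt")
    eagerWriter.close()
    println(s"lazy file exists:  ${new File("lazy-part.txt").exists()}")  // false
    println(s"eager file exists: ${new File("eager-part.txt").exists()}") // true
  }
}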

sql/core/src/main/scala/org/apache/spark/sql/execution/streaming/ManifestFileCommitProtocol.scala

Lines changed: 6 additions & 17 deletions
@@ -22,7 +22,7 @@ import java.util.UUID
 
 import scala.collection.mutable.ArrayBuffer
 
-import org.apache.hadoop.fs.{FileSystem, Path}
+import org.apache.hadoop.fs.Path
 import org.apache.hadoop.mapreduce.{JobContext, TaskAttemptContext}
 
 import org.apache.spark.internal.Logging
@@ -89,7 +89,9 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
       try {
         val fs = path.getFileSystem(jobContext.getConfiguration)
         // this is to make sure the file can be seen from driver as well
-        deleteIfExists(fs, path)
+        if (fs.exists(path)) {
+          fs.delete(path, false)
+        }
       } catch {
         case e: IOException =>
           logWarning(s"Fail to remove temporary file $path, continue removing next.", e)
@@ -137,14 +139,7 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     if (addedFiles.nonEmpty) {
      val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration)
       val statuses: Seq[SinkFileStatus] =
-        addedFiles.flatMap { f =>
-          val path = new Path(f)
-          if (fs.exists(path)) {
-            Some(SinkFileStatus(fs.getFileStatus(path)))
-          } else {
-            None
-          }
-        }
+        addedFiles.map(f => SinkFileStatus(fs.getFileStatus(new Path(f))))
       new TaskCommitMessage(statuses)
     } else {
       new TaskCommitMessage(Seq.empty[SinkFileStatus])
@@ -155,13 +150,7 @@ class ManifestFileCommitProtocol(jobId: String, path: String)
     // best effort cleanup of incomplete files
     if (addedFiles.nonEmpty) {
       val fs = new Path(addedFiles.head).getFileSystem(taskContext.getConfiguration)
-      addedFiles.foreach { file => deleteIfExists(fs, new Path(file)) }
-    }
-  }
-
-  private def deleteIfExists(fs: FileSystem, path: Path, recursive: Boolean = false): Unit = {
-    if (fs.exists(path)) {
-      fs.delete(path, recursive)
+      addedFiles.foreach { file => fs.delete(new Path(file), false) }
     }
   }
 }
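
The commit message calls the SPARK-29999 workaround too costly: in commitTask, every added file got an fs.exists check before fs.getFileStatus, which on a distributed file system typically means an extra round trip per file on every task commit. A rough sketch of that difference using a hypothetical stand-in CountingFs (not a real Hadoop FileSystem):

import java.io.FileNotFoundException

final case class FileStatusStub(path: String, len: Long)

// Stand-in that just counts calls; a real FileSystem would pay a namenode or
// object-store round trip for each of them.
class CountingFs(existing: Set[String]) {
  var existsCalls = 0
  var getStatusCalls = 0
  def exists(path: String): Boolean = { existsCalls += 1; existing.contains(path) }
  def getFileStatus(path: String): FileStatusStub = {
    getStatusCalls += 1
    if (!existing.contains(path)) throw new FileNotFoundException(path)
    FileStatusStub(path, 0L)
  }
}

object CommitCostSketch {
  def main(args: Array[String]): Unit = {
    val addedFiles = (0 until 200).map(i => f"part-$i%05d")

    // SPARK-29999 style: check existence before every status lookup.
    val fs1 = new CountingFs(addedFiles.toSet)
    val statuses1 = addedFiles.flatMap { f => if (fs1.exists(f)) Some(fs1.getFileStatus(f)) else None }
    println(s"with exists check: exists=${fs1.existsCalls}, getFileStatus=${fs1.getStatusCalls}")

    // Behavior restored by this commit: assume the writer created every file.
    val fs2 = new CountingFs(addedFiles.toSet)
    val statuses2 = addedFiles.map(f => fs2.getFileStatus(f))
    println(s"after the revert:  exists=${fs2.existsCalls}, getFileStatus=${fs2.getStatusCalls}")
  }
}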

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVSuite.scala

Lines changed: 0 additions & 9 deletions
@@ -2073,15 +2073,6 @@ class CSVSuite extends QueryTest with SharedSparkSession with TestCsvData {
     }
   }
 
-  test("do not produce empty files for empty partitions") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-      spark.emptyDataset[String].write.csv(path)
-      val files = new File(path).listFiles()
-      assert(!files.exists(_.getName.endsWith("csv")))
-    }
-  }
-
   test("Do not reuse last good value for bad input field") {
     val schema = StructType(
       StructField("col1", StringType) ::

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala

Lines changed: 0 additions & 9 deletions
@@ -2471,15 +2471,6 @@ class JsonSuite extends QueryTest with SharedSparkSession with TestJsonData {
     emptyString(BinaryType, "".getBytes(StandardCharsets.UTF_8))
   }
 
-  test("do not produce empty files for empty partitions") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-      spark.emptyDataset[String].write.json(path)
-      val files = new File(path).listFiles()
-      assert(!files.exists(_.getName.endsWith("json")))
-    }
-  }
-
   test("return partial result for bad records") {
     val schema = "a double, b array<int>, c string, _corrupt_record string"
     val badRecords = Seq(

sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/text/TextSuite.scala

Lines changed: 0 additions & 9 deletions
@@ -233,13 +233,4 @@ class TextSuite extends QueryTest with SharedSparkSession {
     assert(data(3) == Row("\"doh\""))
     assert(data.length == 4)
   }
-
-  test("do not produce empty files for empty partitions") {
-    withTempPath { dir =>
-      val path = dir.getCanonicalPath
-      spark.emptyDataset[String].write.text(path)
-      val files = new File(path).listFiles()
-      assert(!files.exists(_.getName.endsWith("txt")))
-    }
-  }
 }

sql/core/src/test/scala/org/apache/spark/sql/streaming/FileStreamSinkSuite.scala

Lines changed: 48 additions & 50 deletions
@@ -525,6 +525,54 @@ abstract class FileStreamSinkSuite extends StreamTest {
       }
     }
   }
+
+  test("Handle FileStreamSink metadata correctly for empty partition") {
+    Seq("parquet", "orc", "text", "json").foreach { format =>
+      val inputData = MemoryStream[String]
+      val df = inputData.toDF()
+
+      withTempDir { outputDir =>
+        withTempDir { checkpointDir =>
+          var query: StreamingQuery = null
+          try {
+            // repartition to more than the input to leave empty partitions
+            query =
+              df.repartition(10)
+                .writeStream
+                .option("checkpointLocation", checkpointDir.getCanonicalPath)
+                .format(format)
+                .start(outputDir.getCanonicalPath)
+
+            inputData.addData("1", "2", "3")
+            inputData.addData("4", "5")
+
+            failAfter(streamingTimeout) {
+              query.processAllAvailable()
+            }
+          } finally {
+            if (query != null) {
+              query.stop()
+            }
+          }
+
+          val fs = new Path(outputDir.getCanonicalPath).getFileSystem(
+            spark.sessionState.newHadoopConf())
+          val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark,
+            outputDir.getCanonicalPath)
+
+          val allFiles = sinkLog.allFiles()
+          // only files from non-empty partition should be logged
+          assert(allFiles.length < 10)
+          assert(allFiles.forall(file => fs.exists(new Path(file.path))))
+
+          // the query should be able to read all rows correctly with metadata log
+          val outputDf = spark.read.format(format).load(outputDir.getCanonicalPath)
+            .selectExpr("CAST(value AS INT)").as[Int]
+          checkDatasetUnorderly(outputDf, 1, 2, 3, 4, 5)
+        }
+      }
+    }
+  }
 }
 
 object PendingCommitFilesTrackingManifestFileCommitProtocol {
@@ -600,61 +648,11 @@ class FileStreamSinkV1Suite extends FileStreamSinkSuite {
 }
 
 class FileStreamSinkV2Suite extends FileStreamSinkSuite {
-  import testImplicits._
-
   override protected def sparkConf: SparkConf =
     super
       .sparkConf
       .set(SQLConf.USE_V1_SOURCE_LIST, "")
 
-  test("SPARK-29999 Handle FileStreamSink metadata correctly for empty partition") {
-    Seq("parquet", "orc", "text", "json").foreach { format =>
-      val inputData = MemoryStream[String]
-      val df = inputData.toDF()
-
-      withTempDir { outputDir =>
-        withTempDir { checkpointDir =>
-          var query: StreamingQuery = null
-          try {
-            // repartition to more than the input to leave empty partitions
-            query =
-              df.repartition(10)
-                .writeStream
-                .option("checkpointLocation", checkpointDir.getCanonicalPath)
-                .format(format)
-                .start(outputDir.getCanonicalPath)
-
-            inputData.addData("1", "2", "3")
-            inputData.addData("4", "5")
-
-            failAfter(streamingTimeout) {
-              query.processAllAvailable()
-            }
-          } finally {
-            if (query != null) {
-              query.stop()
-            }
-          }
-
-          val fs = new Path(outputDir.getCanonicalPath).getFileSystem(
-            spark.sessionState.newHadoopConf())
-          val sinkLog = new FileStreamSinkLog(FileStreamSinkLog.VERSION, spark,
-            outputDir.getCanonicalPath)
-
-          val allFiles = sinkLog.allFiles()
-          // only files from non-empty partition should be logged
-          assert(allFiles.length < 10)
-          assert(allFiles.forall(file => fs.exists(new Path(file.path))))
-
-          // the query should be able to read all rows correctly with metadata log
-          val outputDf = spark.read.format(format).load(outputDir.getCanonicalPath)
-            .selectExpr("CAST(value AS INT)").as[Int]
-          checkDatasetUnorderly(outputDf, 1, 2, 3, 4, 5)
-        }
-      }
-    }
-  }
-
   override def checkQueryExecution(df: DataFrame): Unit = {
     // Verify that MetadataLogFileIndex is being used and the correct partitioning schema has
     // been inferred
