[SPARK-22233] [core] Allow user to filter out empty split in HadoopRDD #19464
Changes from 2 commits
@@ -270,6 +270,15 @@ package object config {
     .longConf
     .createWithDefault(4 * 1024 * 1024)
 
+  private [spark] val FILTER_OUT_EMPTY_SPLIT = ConfigBuilder("spark.files.filterOutEmptySplit")
+    .doc("If set to true, HadoopRDD/NewHadoopRDD will not handle the split which its length is 0." +
+      "Maybe you will read an empty hive table but has many empty files. If set to false, Spark " +
+      "generates many tasks to handle these empty files. Sometimes, users maybe want to use " +
+      "SparkContext#textFile to handle a file stored in hadoop, and they don't want to generate " +
+      "any task when this file is empty, they can set this configuration to true.")
+    .booleanConf
+    .createWithDefault(false)
+
   private[spark] val SECRET_REDACTION_PATTERN =
     ConfigBuilder("spark.redaction.regex")
       .doc("Regex to decide which Spark configuration properties and environment variables in " +
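For context, here is a minimal sketch (not part of the PR) of how Spark-internal code could set and read the new typed entry. Note that the `ConfigEntry`-based `set`/`get` overloads on `SparkConf` are `private[spark]`, so this only compiles from inside Spark's own packages; the HadoopRDD change below reads the entry the same way via `sparkContext.getConf.get(FILTER_OUT_EMPTY_SPLIT)`.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.internal.config.FILTER_OUT_EMPTY_SPLIT

// Hypothetical Spark-internal snippet: set the typed entry and read it back.
val conf = new SparkConf()
  .set(FILTER_OUT_EMPTY_SPLIT, true)

// Falls back to the declared default (false) when the key is unset.
val filterEmpty: Boolean = conf.get(FILTER_OUT_EMPTY_SPLIT)
```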
@@ -35,7 +35,7 @@ import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.broadcast.Broadcast
 import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
-import org.apache.spark.internal.config.IGNORE_CORRUPT_FILES
+import org.apache.spark.internal.config.{FILTER_OUT_EMPTY_SPLIT, IGNORE_CORRUPT_FILES}
 import org.apache.spark.rdd.HadoopRDD.HadoopMapPartitionsWithSplitRDD
 import org.apache.spark.scheduler.{HDFSCacheTaskLocation, HostTaskLocation}
 import org.apache.spark.storage.StorageLevel
@@ -196,7 +196,11 @@ class HadoopRDD[K, V](
     // add the credentials here as this can be called before SparkContext initialized
     SparkHadoopUtil.get.addCredentials(jobConf)
     val inputFormat = getInputFormat(jobConf)
-    val inputSplits = inputFormat.getSplits(jobConf, minPartitions)
+    val inputSplits = if (sparkContext.getConf.get(FILTER_OUT_EMPTY_SPLIT)) {
+      inputFormat.getSplits(jobConf, minPartitions).filter(_.getLength > 0)
+    } else {
+      inputFormat.getSplits(jobConf, minPartitions)
+    }
     val array = new Array[Partition](inputSplits.size)
     for (i <- 0 until inputSplits.size) {
       array(i) = new HadoopPartition(id, i, inputSplits(i))
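To illustrate the predicate the patch applies, here is a small standalone sketch using the old `org.apache.hadoop.mapred` API (the paths and split sizes are made up): zero-length splits are dropped before any `HadoopPartition` would be created for them.

```scala
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapred.{FileSplit, InputSplit}

// Stand-ins for what inputFormat.getSplits(jobConf, minPartitions) might return:
// one split backed by an empty file, one backed by a 128-byte file.
val splits: Array[InputSplit] = Array(
  new FileSplit(new Path("/tmp/data/part-00000"), 0L, 0L, Array.empty[String]),
  new FileSplit(new Path("/tmp/data/part-00001"), 0L, 128L, Array.empty[String]))

// The same predicate used when spark.files.filterOutEmptySplit is true:
// keep only splits whose length is greater than zero.
val nonEmpty = splits.filter(_.getLength > 0)
assert(nonEmpty.length == 1)
```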
@@ -31,7 +31,7 @@ import org.apache.hadoop.mapreduce.Job
 import org.apache.hadoop.mapreduce.lib.input.{FileSplit => NewFileSplit, TextInputFormat => NewTextInputFormat}
 import org.apache.hadoop.mapreduce.lib.output.{TextOutputFormat => NewTextOutputFormat}
 
-import org.apache.spark.internal.config.IGNORE_CORRUPT_FILES
+import org.apache.spark.internal.config.{FILTER_OUT_EMPTY_SPLIT, IGNORE_CORRUPT_FILES}
 import org.apache.spark.rdd.{HadoopRDD, NewHadoopRDD}
 import org.apache.spark.storage.StorageLevel
 import org.apache.spark.util.Utils
@@ -347,7 +347,7 @@ class FileSuite extends SparkFunSuite with LocalSparkContext {
     }
   }
 
-  test ("allow user to disable the output directory existence checking (old Hadoop API") {
+  test ("allow user to disable the output directory existence checking (old Hadoop API)") {
     val sf = new SparkConf()
     sf.setAppName("test").setMaster("local").set("spark.hadoop.validateOutputSpecs", "false")
     sc = new SparkContext(sf)
@@ -510,4 +510,65 @@ class FileSuite extends SparkFunSuite with LocalSparkContext {
     }
   }
 
+  test("allow user to filter out empty split (old Hadoop API)") {
+    val sf = new SparkConf()
+    sf.setAppName("test").setMaster("local").set(FILTER_OUT_EMPTY_SPLIT, true)
+    sc = new SparkContext(sf)
+
+    // Ensure that if all of the splits are empty, we remove the splits correctly
+    val emptyRDD = sc.parallelize(Array.empty[Tuple2[String, String]], 1)
+    emptyRDD.saveAsHadoopFile[TextOutputFormat[String, String]](tempDir.getPath + "/output")
+    assert(new File(tempDir.getPath + "/output/part-00000").exists() === true)
+    val hadoopRDD = sc.textFile(tempDir.getPath + "/output/part-*")
+    assert(hadoopRDD.partitions.length === 0)
+
+    // Ensure that if no split is empty, we don't lose any splits
+    val randomRDD = sc.parallelize(
+      Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 2)
+    randomRDD.saveAsHadoopFile[TextOutputFormat[String, String]](tempDir.getPath + "/output1")
+    assert(new File(tempDir.getPath + "/output1/part-00001").exists() === true)
+    val hadoopRDD1 = sc.textFile(tempDir.getPath + "/output1/part-*")
+    assert(hadoopRDD1.partitions.length === 2)
+
+    // Ensure that if part of the splits are empty, we remove the splits correctly
+    val randomRDD2 = sc.parallelize(
+      Array(("key1", "a"), ("key2", "a")), 5)
+    randomRDD2.saveAsHadoopFile[TextOutputFormat[String, String]](
+      tempDir.getPath + "/output2")
+    assert(new File(tempDir.getPath + "/output2/part-00004").exists() === true)
+    val hadoopRDD2 = sc.textFile(tempDir.getPath + "/output2/part-*")
+    assert(hadoopRDD2.partitions.length === 2)
+  }
+
+  test("allow user to filter out empty split (new Hadoop API)") {
+    val sf = new SparkConf()
+    sf.setAppName("test").setMaster("local").set(FILTER_OUT_EMPTY_SPLIT, true)
+    sc = new SparkContext(sf)
+
+    // Ensure that if all of the splits are empty, we remove the splits correctly
+    val emptyRDD = sc.parallelize(Array.empty[Tuple2[String, String]], 1)
+    emptyRDD.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]](
+      tempDir.getPath + "/output")
+    assert(new File(tempDir.getPath + "/output/part-r-00000").exists() === true)
+    val hadoopRDD = sc.textFile(tempDir.getPath + "/output/part-r-*")
+    assert(hadoopRDD.partitions.length === 0)
+
+    // Ensure that if no split is empty, we don't lose any splits
+    val randomRDD1 = sc.parallelize(
+      Array(("key1", "a"), ("key2", "a"), ("key3", "b"), ("key4", "c")), 2)
+    randomRDD1.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]](
+      tempDir.getPath + "/output1")
+    assert(new File(tempDir.getPath + "/output1/part-r-00001").exists() === true)
+    val hadoopRDD1 = sc.textFile(tempDir.getPath + "/output1/part-r-*")
+    assert(hadoopRDD1.partitions.length === 2)
+
+    // Ensure that if part of the splits are empty, we remove the splits correctly
+    val randomRDD2 = sc.parallelize(
+      Array(("key1", "a"), ("key2", "a")), 5)
+    randomRDD2.saveAsNewAPIHadoopFile[NewTextOutputFormat[String, String]](
+      tempDir.getPath + "/output2")
+    assert(new File(tempDir.getPath + "/output2/part-r-00004").exists() === true)
+    val hadoopRDD2 = sc.textFile(tempDir.getPath + "/output2/part-r-*")
+    assert(hadoopRDD2.partitions.length === 2)
+  }
 }
@@ -1191,6 +1191,14 @@ Apart from these, the following properties are also available, and may be useful
   then the partitions with small files will be faster than partitions with bigger files.
   </td>
 </tr>
+<tr>
+  <td><code>spark.files.filterOutEmptySplit</code></td>
+  <td>false</td>
+  <td>If set to true, HadoopRDD/NewHadoopRDD will not handle the split which its length is 0. Maybe you will read an empty
+  hive table but has many empty files. If set to false, Spark generates many tasks to handle these empty files.
+  Sometimes, users maybe want to use SparkContext#textFile to handle a file stored in hadoop, and they don't
+  want to generate any task when this file is empty, they can set this configuration to true.</td>
+</tr>
 <tr>
   <td><code>spark.hadoop.cloneConf</code></td>
   <td>false</td>
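As a rough end-user sketch of the documented property (the application name and input path below are illustrative, not from the PR), the flag is set by its string key like any other Spark configuration:

```scala
import org.apache.spark.{SparkConf, SparkContext}

val conf = new SparkConf()
  .setAppName("filter-empty-splits-demo")  // illustrative name
  .setMaster("local[2]")
  .set("spark.files.filterOutEmptySplit", "true")
val sc = new SparkContext(conf)

// If the glob matches only empty part files, the resulting RDD has zero
// partitions, so Spark schedules no tasks for it.
val lines = sc.textFile("/tmp/warehouse/empty_table/part-*")
println(lines.partitions.length)
```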
Nit: no space after private
This doc is much too verbose for a flag. Just say, "If true, methods that use HadoopRDD and NewHadoopRDD, such as SparkContext.textFile, will not create a partition for input splits that are empty."