
Commit d9bcb5a

fenzhu authored and GitHub Enterprise committed
[CARMEL-7358][REFACTOR] Improve limit only query (apache#142)
1 parent de30549 commit d9bcb5a
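The commit title indicates the refactor targets queries that apply only a LIMIT; the two diffs shown in this excerpt add a single-partition Hadoop RDD and a bounded file-listing path that such queries can presumably use (the planner-side wiring is in the changed files not shown here). For orientation, the shape of query this is aimed at is sketched below; the SparkSession setup and table name are illustrative assumptions, not taken from the commit.

import org.apache.spark.sql.SparkSession

// Illustrative only: a limit-only query, the kind of workload this commit targets.
// The table name is hypothetical.
val spark = SparkSession.builder().appName("limit-only-example").getOrCreate()
val firstRows = spark.sql("SELECT * FROM web_logs LIMIT 100").collect()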

File tree: 14 files changed (+1042 lines, -56 lines)
New file: org.apache.spark.rdd.SinglePartitionHadoopRDD

Lines changed: 253 additions & 0 deletions
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.rdd

import java.io.IOException
import java.text.SimpleDateFormat
import java.util.{Date, Locale}

import org.apache.hadoop.mapred._
import org.apache.hadoop.mapred.lib.CombineFileSplit

import org.apache.spark.{InterruptibleIterator, Partition, SparkContext, TaskContext}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.errors.SparkCoreErrors
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config.{IGNORE_CORRUPT_FILES, IGNORE_MISSING_FILES}
import org.apache.spark.util.{NextIterator, SerializableConfiguration, ShutdownHookManager, TaskCompletionListener}

/**
 * :: DeveloperApi ::
 * A Hadoop RDD that reads all the files in a single partition.
 *
 * @param sc The SparkContext to associate the RDD with.
 * @param broadcastedConf A general Hadoop Configuration, or a subclass of it. If the enclosed
 *   variable references an instance of JobConf, then that JobConf will be used for the Hadoop job.
 *   Otherwise, a new JobConf will be created on each slave using the enclosed Configuration.
 * @param initLocalJobConfFuncOpt Optional closure used to initialize any JobConf that HadoopRDD
 *   creates.
 * @param inputFormatClass Storage format of the data to be read.
 * @param keyClass Class of the key associated with the inputFormatClass.
 * @param valueClass Class of the value associated with the inputFormatClass.
 */
@DeveloperApi
class SinglePartitionHadoopRDD[K, V](
    sc: SparkContext,
    broadcastedConf: Broadcast[SerializableConfiguration],
    initLocalJobConfFuncOpt: Option[JobConf => Unit],
    inputFormatClass: Class[_ <: InputFormat[K, V]],
    keyClass: Class[K],
    valueClass: Class[V]) extends HadoopRDD[K, V](
    sc,
    broadcastedConf,
    initLocalJobConfFuncOpt,
    inputFormatClass,
    keyClass,
    valueClass,
    1) {

  private val ignoreCorruptFiles = sparkContext.conf.get(IGNORE_CORRUPT_FILES)

  private val ignoreMissingFiles = sparkContext.conf.get(IGNORE_MISSING_FILES)

  override def getPartitions: Array[Partition] = {
    val jobConf = getJobConf()
    // add the credentials here as this can be called before SparkContext initialized
    SparkHadoopUtil.get.addCredentials(jobConf)
    jobConf.setLong(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.SPLIT_MINSIZE,
      java.lang.Long.MAX_VALUE)
    try {
      val inputSplits = getInputFormat(jobConf).getSplits(jobConf, 1)

      val allNonEmptySplits = inputSplits.filter(_.getLength > 0).map(_.asInstanceOf[FileSplit])
      val (files, lengths) = allNonEmptySplits.map(f => {
        (f.getPath, f.getLength)
      }).unzip

      // Use CombineFileSplit to represent a single partition split.
      // Need to convert back to normal FileSplit in execute side.
      val array = new Array[Partition](1)
      array(0) = new HadoopPartition(id, 0, new CombineFileSplit(jobConf, files, lengths))
      array
    } catch {
      case e: InvalidInputException if ignoreMissingFiles =>
        val inputDir = jobConf.get(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR)
        logWarning(s"$inputDir doesn't exist and no partitions returned from this path.", e)
        Array.empty[Partition]
      case e: IOException if e.getMessage.startsWith("Not a file:") =>
        val path = e.getMessage.split(":").map(_.trim).apply(2)
        throw SparkCoreErrors.pathNotSupportedError(path)
    }
  }

  override def compute(theSplit: Partition,
      context: TaskContext): InterruptibleIterator[(K, V)] = {
    HadoopRDD.addLocalConfiguration(
      new SimpleDateFormat("yyyyMMddHHmmss", Locale.US).format(new Date()),
      context.stageId, theSplit.index, context.attemptNumber, getJobConf())

    val combineSplit = theSplit.asInstanceOf[HadoopPartition]
      .inputSplit.value.asInstanceOf[CombineFileSplit]
    new InterruptibleIterator[(K, V)](context,
      new CombineFileSplitScanIterator(context, combineSplit, ignoreCorruptFiles))
  }

  class CombineFileSplitScanIterator(
      context: TaskContext,
      split: CombineFileSplit,
      ignoreCorruptFiles: Boolean)
    extends Iterator[(K, V)] with AutoCloseable with Logging {
    val jobConf = getJobConf()
    val inputMetrics = context.taskMetrics().inputMetrics
    val existingBytesRead = inputMetrics.bytesRead
    val paths = split.getPaths
    val lengths = split.getLengths
    val fileSplits = paths.zip(lengths).map(f =>
      new FileSplit(f._1, 0, f._2, split.getJob)).toIterator

    var currentFile: FileSplit = null
    var currentIterator: Iterator[(K, V)] = null

    override def hasNext: Boolean = {
      hasNextRecord()
    }

    override def next(): (K, V) = {
      currentIterator.next()
    }

    override def close(): Unit = {}

    protected def hasNextRecord(): Boolean = {
      (currentIterator != null && currentIterator.hasNext) || nextIterator()
    }

    protected def nextIterator(): Boolean = {
      if (fileSplits.hasNext) {
        currentFile = fileSplits.next()
        logInfo(s"Reading File ${currentFile.getPath}")
        currentIterator = readCurrentFile()
        hasNextRecord()
      } else {
        currentFile = null
        InputFileBlockHolder.unset()
        false
      }
    }

    protected def readCurrentFile() = {
      currentIterator = new NextIterator[(K, V)] {

        // Sets InputFileBlockHolder for the file block's information
        InputFileBlockHolder.set(currentFile.getPath.toString,
          currentFile.getStart, currentFile.getLength)

        // Find a function that will return the FileSystem bytes read by this thread.
        // Do this before creating RecordReader, because RecordReader's constructor
        // might read some bytes
        private val getBytesReadCallback: Option[() => Long] = {
          Some(SparkHadoopUtil.get.getFSBytesReadOnThreadCallback())
        }

        // We get our input bytes from thread-local Hadoop FileSystem statistics.
        // If we do a coalesce, however, we are likely to compute multiple partitions in the same
        // task and in the same thread, in which case we need to avoid override values written by
        // previous partitions (SPARK-13071).
        private def updateBytesRead(): Unit = {
          getBytesReadCallback.foreach { getBytesRead =>
            inputMetrics.setBytesRead(existingBytesRead + getBytesRead())
          }
        }

        private val inputFormat = getInputFormat(jobConf)
        private var reader =
          try {
            inputFormat.getRecordReader(currentFile, jobConf, Reporter.NULL)
          } catch {
            case e: IOException if ignoreCorruptFiles =>
              logWarning(s"Skipped the rest content in " +
                s"the corrupted file: ${currentFile.getPath}", e)
              finished = true
              null
          }
        // Register an on-task-completion callback to close the input stream.
        context.addTaskCompletionListener(new TaskCompletionListener {
          override def onTaskCompletion(context: TaskContext): Unit = {
            // Update the bytes read before closing is to make sure lingering bytesRead statistics
            // in this thread get correctly added.
            updateBytesRead()
            closeIfNeeded()
          }
        })

        private val key: K = if (reader == null) null.asInstanceOf[K] else reader.createKey()
        private val value: V = if (reader == null) null.asInstanceOf[V] else reader.createValue()

        override def getNext(): (K, V) = {
          try {
            finished = !reader.next(key, value)
          } catch {
            case e: IOException if ignoreCorruptFiles =>
              logWarning(s"Skipped the rest content in " +
                s"the corrupted file: ${currentFile.getPath}", e)
              finished = true
          }
          if (!finished) {
            inputMetrics.incRecordsRead(1)
          }
          if (inputMetrics.recordsRead % SparkHadoopUtil
            .UPDATE_INPUT_METRICS_INTERVAL_RECORDS == 0) {
            updateBytesRead()
          }
          (key, value)
        }

        override def close(): Unit = {
          if (reader != null) {
            InputFileBlockHolder.unset()
            try {
              reader.close()
            } catch {
              case e: Exception =>
                if (!ShutdownHookManager.inShutdown()) {
                  logWarning("Exception in RecordReader.close()", e)
                }
            } finally {
              reader = null
            }
            if (getBytesReadCallback.isDefined) {
              updateBytesRead()
            } else {
              // If we can't get the bytes read from the FS stats, fall back to the split size,
              // which may be inaccurate.
              try {
                inputMetrics.incBytesRead(currentFile.getLength)
              } catch {
                case e: IOException =>
                  logWarning("Unable to get input size to set InputMetrics for task", e)
              }
            }
          }
        }
      }
      currentIterator
    }
  }
}
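For context on how this class is meant to be driven: it reuses HadoopRDD's construction pattern but forces a single partition and packs every non-empty split into one CombineFileSplit, which compute then unwraps back into per-file FileSplits. A minimal caller sketch follows; the helper name and the choice of TextInputFormat are assumptions for illustration, not part of this commit, and because SerializableConfiguration is private[spark], code like this can only live inside the org.apache.spark namespace.

import org.apache.hadoop.io.{LongWritable, Text}
import org.apache.hadoop.mapred.{FileInputFormat, JobConf, TextInputFormat}
import org.apache.spark.SparkContext
import org.apache.spark.rdd.SinglePartitionHadoopRDD
import org.apache.spark.util.SerializableConfiguration

// Hypothetical helper (not part of the commit): read every text file under
// `inputDir` through a single task.
def singlePartitionTextRDD(
    sc: SparkContext,
    inputDir: String): SinglePartitionHadoopRDD[LongWritable, Text] = {
  // Broadcast the Hadoop configuration once, as HadoopRDD expects.
  val confBroadcast = sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))
  // Point each executor-side JobConf at the input directory.
  val initJobConf = (jobConf: JobConf) => FileInputFormat.setInputPaths(jobConf, inputDir)
  new SinglePartitionHadoopRDD[LongWritable, Text](
    sc,
    confBroadcast,
    Some(initJobConf),
    classOf[TextInputFormat],
    classOf[LongWritable],
    classOf[Text])
}

Note the design choice visible in getPartitions: setting FileInputFormat.SPLIT_MINSIZE to Long.MaxValue keeps each file as one split, so the single task simply streams the files one after another.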

core/src/main/scala/org/apache/spark/util/HadoopFSUtils.scala

Lines changed: 61 additions & 22 deletions
@@ -64,9 +64,10 @@ private[spark] object HadoopFSUtils extends Logging {
       ignoreMissingFiles: Boolean,
       ignoreLocality: Boolean,
       parallelismThreshold: Int,
-      parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = {
+      parallelismMax: Int,
+      maxLeafFiles: Int = -1): Seq[(Path, Seq[FileStatus])] = {
     parallelListLeafFilesInternal(sc, paths, hadoopConf, filter, isRootLevel = true,
-      ignoreMissingFiles, ignoreLocality, parallelismThreshold, parallelismMax)
+      ignoreMissingFiles, ignoreLocality, parallelismThreshold, parallelismMax, maxLeafFiles)
   }
 
   private def parallelListLeafFilesInternal(
@@ -78,27 +79,57 @@ private[spark] object HadoopFSUtils extends Logging {
       ignoreMissingFiles: Boolean,
       ignoreLocality: Boolean,
       parallelismThreshold: Int,
-      parallelismMax: Int): Seq[(Path, Seq[FileStatus])] = {
+      parallelismMax: Int,
+      maxLeafFiles: Int): Seq[(Path, Seq[FileStatus])] = {
 
     // Short-circuits parallel listing when serial listing is likely to be faster.
-    if (paths.size <= parallelismThreshold) {
+    if ((maxLeafFiles > 0 && maxLeafFiles < paths.size) || paths.size <= parallelismThreshold) {
+      var totalLimit = maxLeafFiles
       return paths.map { path =>
-        val leafFiles = listLeafFiles(
-          path,
-          hadoopConf,
-          filter,
-          Some(sc),
-          ignoreMissingFiles = ignoreMissingFiles,
-          ignoreLocality = ignoreLocality,
-          isRootPath = isRootLevel,
-          parallelismThreshold = parallelismThreshold,
-          parallelismMax = parallelismMax)
-        (path, leafFiles)
+        if (maxLeafFiles > 0) {
+          if (totalLimit > 0) {
+            val filesUnderPath = listLeafFiles(
+              path,
+              hadoopConf,
+              filter,
+              Some(sc),
+              ignoreMissingFiles = ignoreMissingFiles,
+              ignoreLocality = ignoreLocality,
+              isRootPath = isRootLevel,
+              parallelismThreshold = parallelismThreshold,
+              parallelismMax = parallelismMax,
+              maxLeafFiles = totalLimit)
+            totalLimit -= filesUnderPath.length
+            (path, filesUnderPath)
+          } else {
+            (path, Seq.empty)
+          }
+
+        } else {
+          val leafFiles = listLeafFiles(
+            path,
+            hadoopConf,
+            filter,
+            Some(sc),
+            ignoreMissingFiles = ignoreMissingFiles,
+            ignoreLocality = ignoreLocality,
+            isRootPath = isRootLevel,
+            parallelismThreshold = parallelismThreshold,
+            parallelismMax = parallelismMax,
+            maxLeafFiles = maxLeafFiles)
+          (path, leafFiles)
+        }
       }
     }
 
-    logInfo(s"Listing leaf files and directories in parallel under ${paths.length} paths." +
-      s" The first several paths are: ${paths.take(10).mkString(", ")}.")
+    val logMsg = if (paths.size <= 5) {
+      s"Listing leaf files and directories in parallel under: " +
+        s"${paths.mkString(", ")}"
+    } else {
+      s"Listing leaf files and directories in parallel under: " +
+        s"${paths.take(5).mkString(", ")}, ..."
+    }
+    logInfo(logMsg)
     HiveCatalogMetrics.incrementParallelListingJobCount(1)
 
     val serializableConfiguration = new SerializableConfiguration(hadoopConf)
@@ -131,7 +162,8 @@ private[spark] object HadoopFSUtils extends Logging {
            ignoreLocality = ignoreLocality,
            isRootPath = isRootLevel,
            parallelismThreshold = Int.MaxValue,
-           parallelismMax = 0)
+           parallelismMax = 0,
+           maxLeafFiles = maxLeafFiles)
          (path, leafFiles)
        }
      }.collect()
@@ -158,7 +190,8 @@ private[spark] object HadoopFSUtils extends Logging {
       ignoreLocality: Boolean,
       isRootPath: Boolean,
       parallelismThreshold: Int,
-      parallelismMax: Int): Seq[FileStatus] = {
+      parallelismMax: Int,
+      maxLeafFiles: Int): Seq[FileStatus] = {
 
     logTrace(s"Listing $path")
     val fs = path.getFileSystem(hadoopConf)
@@ -204,8 +237,12 @@ private[spark] object HadoopFSUtils extends Logging {
         Array.empty[FileStatus]
     }
 
-    val filteredStatuses =
+    val filteredStatuses = if (maxLeafFiles > 0) {
+      statuses.filterNot(status => shouldFilterOutPathName(status.getPath.getName))
+        .take(maxLeafFiles)
+    } else {
       statuses.filterNot(status => shouldFilterOutPathName(status.getPath.getName))
+    }
 
     val allLeafStatuses = {
       val (dirs, topLevelFiles) = filteredStatuses.partition(_.isDirectory)
@@ -220,7 +257,8 @@ private[spark] object HadoopFSUtils extends Logging {
            ignoreMissingFiles = ignoreMissingFiles,
            ignoreLocality = ignoreLocality,
            parallelismThreshold = parallelismThreshold,
-           parallelismMax = parallelismMax
+           parallelismMax = parallelismMax,
+           maxLeafFiles = maxLeafFiles
          ).flatMap(_._2)
        case _ =>
          dirs.flatMap { dir =>
@@ -233,7 +271,8 @@ private[spark] object HadoopFSUtils extends Logging {
              ignoreLocality = ignoreLocality,
              isRootPath = false,
              parallelismThreshold = parallelismThreshold,
-             parallelismMax = parallelismMax)
+             parallelismMax = parallelismMax,
+             maxLeafFiles = maxLeafFiles)
          }
      }
      val filteredTopLevelFiles = if (filter != null) {