[SPARK-8029][core] shuffle output per attempt #6648
Changes from 86 commits
UnsafeShuffleWriter.java

@@ -75,6 +75,7 @@ public class UnsafeShuffleWriter<K, V> extends ShuffleWriter<K, V> {
   private final ShuffleWriteMetrics writeMetrics;
   private final int shuffleId;
   private final int mapId;
+  private final int stageAttemptId;
   private final TaskContext taskContext;
   private final SparkConf sparkConf;
   private final boolean transferToEnabled;
@@ -91,6 +92,11 @@ private static final class MyByteArrayOutputStream extends ByteArrayOutputStream

   private MyByteArrayOutputStream serBuffer;
   private SerializationStream serOutputStream;
+  /**
+   * This is just to allow tests to explore more code paths, without requiring too much complexity
+   * in the test cases. In normal usage, it will be true.
+   */
+  private final boolean allowSpillMove;

   /**
    * Are we in the process of stopping? Because map tasks can call stop() with success = true
@@ -106,6 +112,7 @@ public UnsafeShuffleWriter(
       ShuffleMemoryManager shuffleMemoryManager,
       UnsafeShuffleHandle<K, V> handle,
       int mapId,
+      int stageAttemptId,
       TaskContext taskContext,
       SparkConf sparkConf) throws IOException {
     final int numPartitions = handle.dependency().partitioner().numPartitions();
@@ -119,6 +126,7 @@ public UnsafeShuffleWriter(
     this.memoryManager = memoryManager;
     this.shuffleMemoryManager = shuffleMemoryManager;
     this.mapId = mapId;
+    this.stageAttemptId = stageAttemptId;
     final ShuffleDependency<K, V, V> dep = handle.dependency();
     this.shuffleId = dep.shuffleId();
     this.serializer = Serializer.getSerializer(dep.serializer()).newInstance();
@@ -127,6 +135,7 @@ public UnsafeShuffleWriter(
     taskContext.taskMetrics().shuffleWriteMetrics_$eq(Option.apply(writeMetrics));
     this.taskContext = taskContext;
     this.sparkConf = sparkConf;
+    this.allowSpillMove = sparkConf.getBoolean("spark.shuffle.unsafe.testing.allowSpillMove", true);
     this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true);
     open();
   }
@@ -226,8 +235,9 @@ void closeAndWriteOutput() throws IOException {
         }
       }
     }
-    shuffleBlockResolver.writeIndexFile(shuffleId, mapId, partitionLengths);
-    mapStatus = MapStatus$.MODULE$.apply(blockManager.shuffleServerId(), partitionLengths);
+    shuffleBlockResolver.writeIndexFile(shuffleId, mapId, stageAttemptId, partitionLengths);
+    mapStatus = MapStatus$.MODULE$.apply(
+      blockManager.shuffleServerId(), stageAttemptId, partitionLengths);
   }

   @VisibleForTesting
@@ -260,7 +270,7 @@ void forceSorterToSpill() throws IOException {
    * @return the partition lengths in the merged file.
    */
   private long[] mergeSpills(SpillInfo[] spills) throws IOException {
-    final File outputFile = shuffleBlockResolver.getDataFile(shuffleId, mapId);
+    final File outputFile = shuffleBlockResolver.getDataFile(shuffleId, mapId, stageAttemptId);
     final boolean compressionEnabled = sparkConf.getBoolean("spark.shuffle.compress", true);
     final CompressionCodec compressionCodec = CompressionCodec$.MODULE$.createCodec(sparkConf);
     final boolean fastMergeEnabled =
@@ -271,7 +281,7 @@ private long[] mergeSpills(SpillInfo[] spills) throws IOException {
     if (spills.length == 0) {
       new FileOutputStream(outputFile).close(); // Create an empty file
       return new long[partitioner.numPartitions()];
-    } else if (spills.length == 1) {
+    } else if (spills.length == 1 && allowSpillMove) {
      // Here, we don't need to perform any metrics updates because the bytes written to this
      // output file would have already been counted as shuffle bytes written.
       Files.move(spills[0].file, outputFile);
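The allowSpillMove flag gating this branch is read from spark.shuffle.unsafe.testing.allowSpillMove and defaults to true, so production behavior is unchanged. A minimal sketch of how a test might turn it off to push even a single spill through the general merge paths (the object name and the println below are illustrative only, not part of this patch):

    import org.apache.spark.SparkConf

    object AllowSpillMoveSketch {
      def main(args: Array[String]): Unit = {
        // With the flag off, a lone spill is merged via mergeSpillsWithFileStream /
        // mergeSpillsWithTransferTo instead of being renamed with Files.move.
        val conf = new SparkConf()
          .set("spark.shuffle.unsafe.testing.allowSpillMove", "false") // test-only switch added in this patch
          .set("spark.file.transferTo", "false") // also steer merging toward the file-stream path
        println(conf.getBoolean("spark.shuffle.unsafe.testing.allowSpillMove", defaultValue = true))
      }
    }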
@@ -336,7 +346,7 @@ private long[] mergeSpillsWithFileStream(
       SpillInfo[] spills,
       File outputFile,
       @Nullable CompressionCodec compressionCodec) throws IOException {
-    assert (spills.length >= 2);
+    assert (spills.length >= 2 || !allowSpillMove);

     final int numPartitions = partitioner.numPartitions();
     final long[] partitionLengths = new long[numPartitions];
     final InputStream[] spillInputStreams = new FileInputStream[spills.length];
@@ -390,7 +400,7 @@ private long[] mergeSpillsWithFileStream(
    * @return the partition lengths in the merged file.
    */
   private long[] mergeSpillsWithTransferTo(SpillInfo[] spills, File outputFile) throws IOException {
-    assert (spills.length >= 2);
+    assert (spills.length >= 2 || !allowSpillMove);
     final int numPartitions = partitioner.numPartitions();
     final long[] partitionLengths = new long[numPartitions];
     final FileChannel[] spillInputChannels = new FileChannel[spills.length];
@@ -474,7 +484,7 @@ public Option<MapStatus> stop(boolean success) {
       return Option.apply(mapStatus);
     } else {
       // The map task failed, so delete our output data.
-      shuffleBlockResolver.removeDataByMap(shuffleId, mapId);
+      shuffleBlockResolver.removeDataByMap(shuffleId, mapId, stageAttemptId);
       return Option.apply(null);
     }
   }
MapStatus.scala
@@ -32,6 +32,9 @@ private[spark] sealed trait MapStatus {
   /** Location where this task was run. */
   def location: BlockManagerId

+  /** stage attempt for the ShuffleMapTask */
+  def stageAttemptId: Int
+
   /**
    * Estimated size for the reduce block, in bytes.
    *

Contributor (Author), on the new stageAttemptId member: I needed to update the serialization of …
@@ -44,11 +47,11 @@ private[spark] sealed trait MapStatus {

 private[spark] object MapStatus {

-  def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): MapStatus = {
+  def apply(loc: BlockManagerId, stageAttemptId: Int, uncompressedSizes: Array[Long]): MapStatus = {
     if (uncompressedSizes.length > 2000) {
-      HighlyCompressedMapStatus(loc, uncompressedSizes)
+      HighlyCompressedMapStatus(loc, stageAttemptId, uncompressedSizes)
     } else {
-      new CompressedMapStatus(loc, uncompressedSizes)
+      new CompressedMapStatus(loc, stageAttemptId, uncompressedSizes)
     }
   }
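MapStatus.apply now threads the stage attempt id through to whichever concrete status it builds, still switching to HighlyCompressedMapStatus above 2000 partitions. A rough sketch of the new call shape (values are made up, and these classes are private[spark], so this only compiles from code inside the org.apache.spark package):

    import org.apache.spark.scheduler.MapStatus
    import org.apache.spark.storage.BlockManagerId

    val loc = BlockManagerId("exec-1", "host-a", 7337)

    // 3 partitions: backed by CompressedMapStatus.
    val small = MapStatus(loc, stageAttemptId = 0, uncompressedSizes = Array(10L, 0L, 42L))

    // 5000 partitions: backed by HighlyCompressedMapStatus.
    val large = MapStatus(loc, stageAttemptId = 1, uncompressedSizes = Array.fill(5000)(100L))

    assert(small.stageAttemptId == 0 && large.stageAttemptId == 1)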
@@ -91,29 +94,34 @@ private[spark] object MapStatus {
  */
 private[spark] class CompressedMapStatus(
     private[this] var loc: BlockManagerId,
+    private[this] var _stageAttemptId: Int,
     private[this] var compressedSizes: Array[Byte])
   extends MapStatus with Externalizable {

-  protected def this() = this(null, null.asInstanceOf[Array[Byte]]) // For deserialization only
+  protected def this() = this(null, 0, null.asInstanceOf[Array[Byte]]) // For deserialization only

-  def this(loc: BlockManagerId, uncompressedSizes: Array[Long]) {
-    this(loc, uncompressedSizes.map(MapStatus.compressSize))
+  def this(loc: BlockManagerId, stageAttemptId: Int, uncompressedSizes: Array[Long]) {
+    this(loc, stageAttemptId, uncompressedSizes.map(MapStatus.compressSize))
   }

   override def location: BlockManagerId = loc

+  override def stageAttemptId: Int = _stageAttemptId
+
   override def getSizeForBlock(reduceId: Int): Long = {
     MapStatus.decompressSize(compressedSizes(reduceId))
   }

   override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
     loc.writeExternal(out)
+    out.writeInt(_stageAttemptId)
     out.writeInt(compressedSizes.length)
     out.write(compressedSizes)
   }

   override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
     loc = BlockManagerId(in)
+    _stageAttemptId = in.readInt()
     val len = in.readInt()
     compressedSizes = new Array[Byte](len)
     in.readFully(compressedSizes)
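Because MapStatus objects travel from executors back to the driver, the new field also has to survive the Externalizable round trip, which is why writeExternal and readExternal gained a matching writeInt/readInt. A rough sketch of the kind of round-trip check a test might perform (illustrative only; it has to live inside the org.apache.spark package since the class is private[spark]):

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}
    import org.apache.spark.scheduler.CompressedMapStatus
    import org.apache.spark.storage.BlockManagerId

    // A status for attempt 3 of some map task (values are made up).
    val original = new CompressedMapStatus(BlockManagerId("exec-1", "host-a", 7337), 3, Array(10L, 0L, 42L))

    val buffer = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(buffer)
    original.writeExternal(out) // writes loc, then the new stage attempt id, then the sizes
    out.flush()

    // Start from a throwaway instance and let readExternal overwrite its fields.
    val restored = new CompressedMapStatus(BlockManagerId("x", "y", 1), 0, Array.empty[Long])
    restored.readExternal(new ObjectInputStream(new ByteArrayInputStream(buffer.toByteArray)))
    assert(restored.stageAttemptId == 3)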
@@ -132,7 +140,8 @@ private[spark] class CompressedMapStatus(
  */
 private[spark] class HighlyCompressedMapStatus private (
     private[this] var loc: BlockManagerId,
-    private[this] var numNonEmptyBlocks: Int,
+    private[this] var _stageAttemptId: Int,
+    numNonEmptyBlocks: Int,
     private[this] var emptyBlocks: RoaringBitmap,
     private[this] var avgSize: Long)
   extends MapStatus with Externalizable {
@@ -141,10 +150,12 @@ private[spark] class HighlyCompressedMapStatus private (
   require(loc == null || avgSize > 0 || numNonEmptyBlocks == 0,
     "Average size can only be zero for map stages that produced no output")

-  protected def this() = this(null, -1, null, -1) // For deserialization only
+  protected def this() = this(null, 0, -1, null, -1) // For deserialization only

   override def location: BlockManagerId = loc

+  override def stageAttemptId: Int = _stageAttemptId
+
   override def getSizeForBlock(reduceId: Int): Long = {
     if (emptyBlocks.contains(reduceId)) {
       0
@@ -155,20 +166,25 @@ private[spark] class HighlyCompressedMapStatus private (

   override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException {
     loc.writeExternal(out)
+    out.writeInt(_stageAttemptId)
     emptyBlocks.writeExternal(out)
     out.writeLong(avgSize)
   }

   override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException {
     loc = BlockManagerId(in)
+    _stageAttemptId = in.readInt()
     emptyBlocks = new RoaringBitmap()
     emptyBlocks.readExternal(in)
     avgSize = in.readLong()
   }
 }

 private[spark] object HighlyCompressedMapStatus {
-  def apply(loc: BlockManagerId, uncompressedSizes: Array[Long]): HighlyCompressedMapStatus = {
+  def apply(
+      loc: BlockManagerId,
+      stageAttemptId: Int,
+      uncompressedSizes: Array[Long]): HighlyCompressedMapStatus = {
     // We must keep track of which blocks are empty so that we don't report a zero-sized
     // block as being non-empty (or vice-versa) when using the average block size.
     var i = 0
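For stages with many partitions, HighlyCompressedMapStatus keeps only a bitmap of empty blocks plus an average block size, and it now records the stage attempt id alongside them. A small sketch of what that summarization means for callers (illustrative values; same private[spark] caveat as above):

    import org.apache.spark.scheduler.HighlyCompressedMapStatus
    import org.apache.spark.storage.BlockManagerId

    // 3000 partitions, alternating empty and 100-byte outputs.
    val sizes = Array.tabulate(3000)(i => if (i % 2 == 0) 0L else 100L)
    val status = HighlyCompressedMapStatus(
      BlockManagerId("exec-2", "host-b", 7337), stageAttemptId = 2, uncompressedSizes = sizes)

    assert(status.stageAttemptId == 2)
    assert(status.getSizeForBlock(0) == 0L)   // tracked as an empty block
    assert(status.getSizeForBlock(1) == 100L) // non-empty blocks all report the average size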
@@ -194,6 +210,6 @@ private[spark] object HighlyCompressedMapStatus {
     } else {
       0
     }
-    new HighlyCompressedMapStatus(loc, numNonEmptyBlocks, emptyBlocks, avgSize)
+    new HighlyCompressedMapStatus(loc, stageAttemptId, numNonEmptyBlocks, emptyBlocks, avgSize)
   }
 }
Comment (PR author): this is kinda ugly, it's just so I can test the different paths through this code without requiring a really complicated example. Open to other suggestions.

Comment (reviewer): Can you add a comment in the code explaining it?

Comment (PR author): I think at the same time you were adding this comment, I put in a comment, but on the variable declaration: https://github.com/apache/spark/pull/6648/files#diff-642ce9f439435408382c3ac3b5c5e0a0R99