10 changes: 7 additions & 3 deletions core/src/main/scala/org/apache/spark/MapOutputTracker.scala
@@ -165,9 +165,11 @@ private class ShuffleStatus(

/**
* Register a map output. If there is already a registered location for the map output then it
* will be replaced by the new location.
* will be replaced by the new location. Returns true if the checksum in the new MapStatus is
* different from a previously registered MapStatus. Otherwise, returns false.
*/
def addMapOutput(mapIndex: Int, status: MapStatus): Unit = withWriteLock {
def addMapOutput(mapIndex: Int, status: MapStatus): Boolean = withWriteLock {
var isChecksumMismatch: Boolean = false
val currentMapStatus = mapStatuses(mapIndex)
if (currentMapStatus == null) {
_numAvailableMapOutputs += 1
@@ -183,9 +185,11 @@ private class ShuffleStatus(
logInfo(s"Checksum of map output changes from ${preStatus.checksumValue} to " +
s"${status.checksumValue} for task ${status.mapId}.")
checksumMismatchIndices.add(mapIndex)
isChecksumMismatch = true
}
mapStatuses(mapIndex) = status
mapIdToMapIndex(status.mapId) = mapIndex
isChecksumMismatch
}

/**
@@ -853,7 +857,7 @@ private[spark] class MapOutputTrackerMaster(
}
}

def registerMapOutput(shuffleId: Int, mapIndex: Int, status: MapStatus): Unit = {
def registerMapOutput(shuffleId: Int, mapIndex: Int, status: MapStatus): Boolean = {
shuffleStatuses(shuffleId).addMapOutput(mapIndex, status)
}

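The Boolean returned by addMapOutput is the signal the scheduler acts on later in this diff. A self-contained sketch of the detection pattern, using a simplified status type in place of MapStatus (all names here are illustrative, not Spark's internals):

final case class StatusModel(mapId: Long, checksumValue: Long)

final class ShuffleStatusModel(numPartitions: Int) {
  private val statuses = new Array[StatusModel](numPartitions)

  // Returns true when a previously registered status carried a different checksum,
  // i.e. a retried task produced different output data for the same partition.
  def addMapOutput(mapIndex: Int, status: StatusModel): Boolean = {
    val previous = statuses(mapIndex)
    val mismatch = previous != null && previous.checksumValue != status.checksumValue
    statuses(mapIndex) = status
    mismatch
  }
}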
core/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -1679,6 +1679,15 @@ package object config {
.checkValues(Set("ADLER32", "CRC32", "CRC32C"))
.createWithDefault("ADLER32")

private[spark] val SCHEDULER_CHECKSUM_MISMATCH_FULL_RETRY_ENABLED =
ConfigBuilder("spark.scheduler.checksumMismatchFullRetry.enabled")
.doc("Whether to retry all tasks of a consumer stage when we detect checksum mismatches " +
"with its producer stages. The checksum computation is controlled by another config " +
"called SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED.")
Contributor:

Does it make sense to use SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED without SCHEDULER_CHECKSUM_MISMATCH_FULL_RETRY_ENABLED? Or vice versa?

What about removing SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED (since the version where it is introduced is also 4.1.0, we can do that) and computing the checksum when SCHEDULER_CHECKSUM_MISMATCH_FULL_RETRY_ENABLED is true? That way there is only one config for the feature.

Contributor:

This is a good point. Why do we separate the checksum computation and stage retry with two flags? Do we have logging for checksum mismatch without retry?

Contributor Author:

Yes, a log is written when a checksum mismatch happens. If SCHEDULER_CHECKSUM_MISMATCH_FULL_RETRY_ENABLED is false, there will be no full retry of succeeding stages, only logs for the checksum mismatch.

And when we want to enable SCHEDULER_CHECKSUM_MISMATCH_FULL_RETRY_ENABLED, we need to make sure SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED is also true if we keep these two configs.

One config would be easier to use. If that makes sense to you all, I'll remove SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED.

Contributor:

I think it makes sense to have the log-only mode, so that Spark users can do impact analysis before turning on the retry.

Contributor:

We can improve it a bit more: we will compute the checksum when either SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED or SCHEDULER_CHECKSUM_MISMATCH_FULL_RETRY_ENABLED is enabled.

Contributor Author:

Thanks, updated.

.version("4.1.0")
.booleanConf
.createWithDefault(false)

private[spark] val SHUFFLE_COMPRESS =
ConfigBuilder("spark.shuffle.compress")
.doc("Whether to compress shuffle output. Compression will use " +
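Following the thread above, the intent is to compute the order-independent checksum when either flag is set. A hedged sketch of that gating, assuming a SparkConf-backed `conf` in scope (the val name is hypothetical):

// Hypothetical gating: compute the checksum if either the standalone checksum
// flag or the full-retry flag is enabled.
val computeOrderIndependentChecksum: Boolean =
  conf.get(config.SHUFFLE_ORDER_INDEPENDENT_CHECKSUM_ENABLED) ||
    conf.get(config.SCHEDULER_CHECKSUM_MISMATCH_FULL_RETRY_ENABLED)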
2 changes: 1 addition & 1 deletion core/src/main/scala/org/apache/spark/rdd/RDD.scala
@@ -1773,7 +1773,7 @@ abstract class RDD[T: ClassTag](
/**
* Return whether this RDD is reliably checkpointed and materialized.
*/
private[rdd] def isReliablyCheckpointed: Boolean = {
private[spark] def isReliablyCheckpointed: Boolean = {
checkpointData match {
case Some(reliable: ReliableRDDCheckpointData[_]) if reliable.isCheckpointed => true
case _ => false
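The widened visibility lets the scheduler consult reliable checkpointing when classifying a stage (see Stage.isStageIndeterminate later in this diff). A small usage sketch, assuming a local SparkContext and a writable checkpoint directory:

import org.apache.spark.{SparkConf, SparkContext}

object ReliableCheckpointSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("checkpoint-sketch"))
    sc.setCheckpointDir("/tmp/spark-checkpoints") // reliable, filesystem-backed location
    val rdd = sc.parallelize(1 to 100).map(_ * 2)
    rdd.checkpoint() // mark for reliable checkpointing
    rdd.count()      // an action materializes the checkpoint
    // isReliablyCheckpointed is private[spark] and not callable from user code;
    // conceptually it is now true, so the scheduler can treat this RDD's output
    // as stable across stage retries.
    sc.stop()
  }
}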
98 changes: 70 additions & 28 deletions core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala
@@ -307,6 +307,9 @@ private[spark] class DAGScheduler(

private val shuffleFinalizeRpcThreads = sc.conf.get(config.PUSH_SHUFFLE_FINALIZE_RPC_THREADS)

private val checksumMismatchFullRetryEnabled =
sc.getConf.get(config.SCHEDULER_CHECKSUM_MISMATCH_FULL_RETRY_ENABLED)

// Since SparkEnv gets initialized after DAGScheduler, externalShuffleClient needs to be
// initialized lazily
private lazy val externalShuffleClient: Option[BlockStoreClient] =
@@ -1551,29 +1554,41 @@ private[spark] class DAGScheduler(
// The operation here can make sure for the partially completed intermediate stage,
// `findMissingPartitions()` returns all partitions every time.
stage match {
case sms: ShuffleMapStage if stage.isIndeterminate && !sms.isAvailable =>
// already executed at least once
if (sms.getNextAttemptId > 0) {
// While we previously validated possible rollbacks during the handling of a FetchFailure,
// where we were fetching from an indeterminate source map stages, this later check
// covers additional cases like recalculating an indeterminate stage after an executor
// loss. Moreover, because this check occurs later in the process, if a result stage task
// has successfully completed, we can detect this and abort the job, as rolling back a
// result stage is not possible.
val stagesToRollback = collectSucceedingStages(sms)
abortStageWithInvalidRollBack(stagesToRollback)
// stages which cannot be rolled back were aborted, which leads to removing the
// dependent job(s) from the active jobs set
val numActiveJobsWithStageAfterRollback =
activeJobs.count(job => stagesToRollback.contains(job.finalStage))
if (numActiveJobsWithStageAfterRollback == 0) {
logInfo(log"All jobs depending on the indeterminate stage " +
log"(${MDC(STAGE_ID, stage.id)}) were aborted so this stage is not needed anymore.")
return
case sms: ShuffleMapStage if !sms.isAvailable =>
if (checksumMismatchFullRetryEnabled) {
// When the parents of this stage are indeterminate (e.g., some parents are not
// checkpointed and checksum mismatches are detected), the output data of the parents
// may have changed due to task retries. For correctness reasons, we need to
// retry all tasks of the current stage. The legacy way of using the current stage's
// deterministic level to trigger a full stage retry is not accurate.
if (stage.isParentIndeterminate) {
mapOutputTracker.unregisterAllMapAndMergeOutput(sms.shuffleDep.shuffleId)
sms.shuffleDep.newShuffleMergeState()
}
} else if (stage.isIndeterminate) {
// already executed at least once
if (sms.getNextAttemptId > 0) {
// While we previously validated possible rollbacks during the handling of a FetchFailure,
// where we were fetching from an indeterminate source map stages, this later check
// covers additional cases like recalculating an indeterminate stage after an executor
// loss. Moreover, because this check occurs later in the process, if a result stage task
// has successfully completed, we can detect this and abort the job, as rolling back a
// result stage is not possible.
val stagesToRollback = collectSucceedingStages(sms)
abortStageWithInvalidRollBack(stagesToRollback)
// stages which cannot be rolled back were aborted, which leads to removing the
// dependent job(s) from the active jobs set
val numActiveJobsWithStageAfterRollback =
activeJobs.count(job => stagesToRollback.contains(job.finalStage))
if (numActiveJobsWithStageAfterRollback == 0) {
logInfo(log"All jobs depending on the indeterminate stage " +
log"(${MDC(STAGE_ID, stage.id)}) were aborted so this stage is not needed anymore.")
return
}
}
mapOutputTracker.unregisterAllMapAndMergeOutput(sms.shuffleDep.shuffleId)
sms.shuffleDep.newShuffleMergeState()
}
mapOutputTracker.unregisterAllMapAndMergeOutput(sms.shuffleDep.shuffleId)
sms.shuffleDep.newShuffleMergeState()
case _ =>
}
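A condensed model of the branch above, in plain Scala rather than the DAGScheduler's own types: with the flag on, the previous shuffle output is discarded when the stage's parents are indeterminate; with it off, the legacy rule keyed on the stage's own determinism applies.

// Simplified stand-in for a ShuffleMapStage's relevant state (illustrative only).
final case class StageModel(
    isAvailable: Boolean,         // all map outputs already registered
    isIndeterminate: Boolean,     // legacy check: the stage's own output is indeterminate
    parentIndeterminate: Boolean) // new check: some parent's output may have changed

def discardAllShuffleOutput(stage: StageModel, fullRetryEnabled: Boolean): Boolean =
  !stage.isAvailable &&
    (if (fullRetryEnabled) stage.parentIndeterminate else stage.isIndeterminate)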

@@ -1886,6 +1901,20 @@ private[spark] class DAGScheduler(
}
}

/**
* If a map stage is non-deterministic, its map tasks may return different results when
* retried. To ensure data correctness, we need to retry all the tasks of its succeeding
* stages, as their input data may have changed after the map tasks are retried. For stages
* where rolling back and retrying all tasks is not possible, we need to abort those stages.
*/
private[scheduler] def abortUnrollbackableStages(mapStage: ShuffleMapStage): Unit = {
val stagesToRollback = collectSucceedingStages(mapStage)
val rollingBackStages = abortStageWithInvalidRollBack(stagesToRollback)
logInfo(log"The shuffle map stage ${MDC(SHUFFLE_ID, mapStage)} with indeterminate output " +
log"was failed, we will roll back and rerun below stages which include itself and all its " +
log"indeterminate child stages: ${MDC(STAGES, rollingBackStages)}")
}

/**
* Responds to a task finishing. This is called inside the event loop so it assumes that it can
* modify the scheduler's internal state. Use taskEnded() to post a task end event from outside.
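The new helper leans on collectSucceedingStages and abortStageWithInvalidRollBack, which are outside this hunk. As a rough, self-contained illustration of the "collect succeeding stages" idea only, a traversal over a hypothetical child-stage map:

// Illustrative only: gather the starting stage and everything that transitively
// consumes its shuffle output, given a childStages adjacency map keyed by stage id.
def collectSucceeding(start: Int, childStages: Map[Int, Seq[Int]]): Set[Int] = {
  @scala.annotation.tailrec
  def loop(frontier: List[Int], seen: Set[Int]): Set[Int] = frontier match {
    case Nil => seen
    case s :: rest =>
      val discovered = childStages.getOrElse(s, Nil).filterNot(seen)
      loop(rest ++ discovered.toList, seen ++ discovered)
  }
  loop(List(start), Set(start))
}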
@@ -2022,8 +2051,25 @@ private[spark] class DAGScheduler(
// The epoch of the task is acceptable (i.e., the task was launched after the most
// recent failure we're aware of for the executor), so mark the task's output as
// available.
mapOutputTracker.registerMapOutput(
val isChecksumMismatched = mapOutputTracker.registerMapOutput(
shuffleStage.shuffleDep.shuffleId, smt.partitionId, status)
if (isChecksumMismatched) {
shuffleStage.isChecksumMismatched = isChecksumMismatched
Contributor (@mridulm, Dec 6, 2025):

This is never reset back to false when the stage attempt is retried and succeeds - what am I missing?
This would mean the app will always fail, right?

Not sure what I am missing here.
+CC @ivoson, @cloud-fan, @attilapiros

Contributor Author:

Hi @mridulm, this is not set back to false. We would expect all the succeeding stages to do a full retry once a checksum mismatch happens for the stage, as we don't know which version of the shuffle output the successful tasks consumed.

This won't fail the app; the impact is that the succeeding stages would do a full retry.

The code logic has changed a little bit in PR: #53274

Please take a look once you get a chance. Thanks.

Contributor:

On retry, when we throw away the entire mapper output and recompute it, at which point can we go back to setting it to false?

Contributor Author:

Currently, it's not set back to false. We only recompute when a new shuffle checksum mismatch is detected. Maybe we can remove the flag to avoid the confusion here.

// There could be multiple checksum mismatches detected for a single stage attempt.
// We check for stage abortion once and only once, when we first detect a checksum
// mismatch for each stage attempt. For example, assume that we have
// stage1 -> stage2, and we encounter a checksum mismatch during the retry of stage1.
// In this case, we need to call abortUnrollbackableStages() for the succeeding
// stages. Assume that when stage2 is retried, some tasks finish and some tasks
// fail again with FetchFailed. If we encounter a checksum mismatch again
// during the retry of stage1, we need to call abortUnrollbackableStages() again.
if (shuffleStage.maxChecksumMismatchedId < smt.stageAttemptId) {
shuffleStage.maxChecksumMismatchedId = smt.stageAttemptId
if (checksumMismatchFullRetryEnabled && shuffleStage.isStageIndeterminate) {
abortUnrollbackableStages(shuffleStage)
}
}
}
}
} else {
logInfo(log"Ignoring ${MDC(TASK_NAME, smt)} completion from an older attempt of indeterminate stage")
@@ -2148,12 +2194,8 @@ private[spark] class DAGScheduler(
// Note that, if map stage is UNORDERED, we are fine. The shuffle partitioner is
// guaranteed to be determinate, so the input data of the reducers will not change
// even if the map tasks are re-tried.
if (mapStage.isIndeterminate) {
val stagesToRollback = collectSucceedingStages(mapStage)
val rollingBackStages = abortStageWithInvalidRollBack(stagesToRollback)
logInfo(log"The shuffle map stage ${MDC(SHUFFLE_ID, mapStage)} with indeterminate output was failed, " +
log"we will roll back and rerun below stages which include itself and all its " +
log"indeterminate child stages: ${MDC(STAGES, rollingBackStages)}")
if (mapStage.isIndeterminate && !checksumMismatchFullRetryEnabled) {
abortUnrollbackableStages(mapStage)
}

// We expect one executor failure to trigger many FetchFailures in rapid succession,
22 changes: 22 additions & 0 deletions core/src/main/scala/org/apache/spark/scheduler/Stage.scala
@@ -72,6 +72,18 @@ private[scheduler] abstract class Stage(
private var nextAttemptId: Int = 0
private[scheduler] def getNextAttemptId: Int = nextAttemptId

/**
* Whether checksum mismatches have been detected across different attempts of the stage. A
* checksum mismatch typically indicates that different stage attempts have produced different
* data.
*/
private[scheduler] var isChecksumMismatched: Boolean = false

/**
* The maximum stage attempt id for which a checksum mismatch has been detected.
*/
private[scheduler] var maxChecksumMismatchedId: Int = nextAttemptId

val name: String = callSite.shortForm
val details: String = callSite.longForm

@@ -131,4 +143,14 @@ private[scheduler] abstract class Stage(
def isIndeterminate: Boolean = {
rdd.outputDeterministicLevel == DeterministicLevel.INDETERMINATE
}

// Returns true if any parents of this stage are indeterminate.
def isParentIndeterminate: Boolean = {
parents.exists(_.isStageIndeterminate)
}

// Returns true if the stage itself is indeterminate.
def isStageIndeterminate: Boolean = {
!rdd.isReliablyCheckpointed && isChecksumMismatched
}
}
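A small self-contained model of how the two new predicates compose; the real checks read the RDD's checkpoint state and the mismatch flag set by the DAGScheduler, so this is an assumed simplification:

final case class StageNode(
    reliablyCheckpointed: Boolean, // output pinned by a reliable checkpoint
    checksumMismatched: Boolean,   // differing output observed across attempts
    parents: Seq[StageNode]) {

  // Indeterminate only when not reliably checkpointed AND a mismatch was actually seen.
  def isStageIndeterminate: Boolean = !reliablyCheckpointed && checksumMismatched

  // The stage must be fully retried when any parent is indeterminate in this sense.
  def isParentIndeterminate: Boolean = parents.exists(_.isStageIndeterminate)
}

Under this model, a reliably checkpointed producer never counts as indeterminate, so its consumers are shielded from full retries.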