-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-8103][core] DAGScheduler should not submit multiple concurrent attempts for a stage #6750
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 33 commits
a9bf31f
28d70aa
c443def
06a0af6
6e14683
883fe49
7021d28
55f4a94
b6bc248
ecb4e7d
9601b47
89a59b6
8c29707
cb245da
46bc26a
d8eb202
ada7726
6542b42
b2faef5
517b6e5
227b40d
a5f7c8c
19685bb
baf46e1
f025154
c0d4d90
109900e
906d626
a21c8b5
d7f1ef2
88b61cc
c04707e
4470fa1
6bc23af
e43ac25
584acd4
e01b7aa
fb3acfc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -853,7 +853,6 @@ class DAGScheduler( | |
| // Get our pending tasks and remember them in our pendingTasks entry | ||
| stage.pendingTasks.clear() | ||
|
|
||
|
|
||
| // First figure out the indexes of partition ids to compute. | ||
| val partitionsToCompute: Seq[Int] = { | ||
| stage match { | ||
|
|
@@ -914,7 +913,7 @@ class DAGScheduler( | |
| partitionsToCompute.map { id => | ||
| val locs = getPreferredLocs(stage.rdd, id) | ||
| val part = stage.rdd.partitions(id) | ||
| new ShuffleMapTask(stage.id, taskBinary, part, locs) | ||
| new ShuffleMapTask(stage.id, stage.latestInfo.attemptId, taskBinary, part, locs) | ||
| } | ||
|
|
||
| case stage: ResultStage => | ||
|
|
@@ -923,7 +922,7 @@ class DAGScheduler( | |
| val p: Int = job.partitions(id) | ||
| val part = stage.rdd.partitions(p) | ||
| val locs = getPreferredLocs(stage.rdd, p) | ||
| new ResultTask(stage.id, taskBinary, part, locs, id) | ||
| new ResultTask(stage.id, stage.latestInfo.attemptId, taskBinary, part, locs, id) | ||
| } | ||
| } | ||
| } catch { | ||
|
|
@@ -1065,10 +1064,11 @@ class DAGScheduler( | |
| val execId = status.location.executorId | ||
| logDebug("ShuffleMapTask finished on " + execId) | ||
| if (failedEpoch.contains(execId) && smt.epoch <= failedEpoch(execId)) { | ||
| logInfo("Ignoring possibly bogus ShuffleMapTask completion from " + execId) | ||
| logInfo(s"Ignoring possibly bogus $smt completion from executor $execId") | ||
| } else { | ||
| shuffleStage.addOutputLoc(smt.partitionId, status) | ||
| } | ||
|
|
||
| if (runningStages.contains(shuffleStage) && shuffleStage.pendingTasks.isEmpty) { | ||
| markStageAsFinished(shuffleStage) | ||
| logInfo("looking for newly runnable stages") | ||
|
|
@@ -1128,38 +1128,47 @@ class DAGScheduler( | |
| val failedStage = stageIdToStage(task.stageId) | ||
| val mapStage = shuffleToMapStage(shuffleId) | ||
|
|
||
| // It is likely that we receive multiple FetchFailed for a single stage (because we have | ||
| // multiple tasks running concurrently on different executors). In that case, it is possible | ||
| // the fetch failure has already been handled by the scheduler. | ||
| if (runningStages.contains(failedStage)) { | ||
| logInfo(s"Marking $failedStage (${failedStage.name}) as failed " + | ||
| s"due to a fetch failure from $mapStage (${mapStage.name})") | ||
| markStageAsFinished(failedStage, Some(failureMessage)) | ||
| } | ||
| if (failedStage.latestInfo.attemptId != task.stageAttemptId) { | ||
| logInfo(s"Ignoring fetch failure from $task as it's from $failedStage attempt" + | ||
| s" ${task.stageAttemptId}, which has already failed") | ||
| } else { | ||
|
|
||
| if (disallowStageRetryForTest) { | ||
| abortStage(failedStage, "Fetch failure will not retry stage due to testing config") | ||
| } else if (failedStages.isEmpty) { | ||
| // Don't schedule an event to resubmit failed stages if failed isn't empty, because | ||
| // in that case the event will already have been scheduled. | ||
| // TODO: Cancel running tasks in the stage | ||
| logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " + | ||
| s"$failedStage (${failedStage.name}) due to fetch failure") | ||
| messageScheduler.schedule(new Runnable { | ||
| override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages) | ||
| }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) | ||
| } | ||
| failedStages += failedStage | ||
| failedStages += mapStage | ||
| // Mark the map whose fetch failed as broken in the map stage | ||
| if (mapId != -1) { | ||
| mapStage.removeOutputLoc(mapId, bmAddress) | ||
| mapOutputTracker.unregisterMapOutput(shuffleId, mapId, bmAddress) | ||
| } | ||
| // It is likely that we receive multiple FetchFailed for a single stage (because we have | ||
| // multiple tasks running concurrently on different executors). In that case, it is | ||
| // possible the fetch failure has already been handled by the scheduler. | ||
| if (runningStages.contains(failedStage)) { | ||
| logInfo(s"Marking $failedStage (${failedStage.name}) as failed " + | ||
| s"due to a fetch failure from $mapStage (${mapStage.name})") | ||
| markStageAsFinished(failedStage, Some(failureMessage)) | ||
| } else { | ||
| logInfo(s"Ignoring fetch failure from $task as it's from $failedStage, " + | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seems like, even when the failed stage is no longer running, we don't fully ignore the failure, because we remove the map id from the set of output locations (on line 1164)? As a result, I'm not sure this log message makes sense.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes, good point ... perhaps we should just get rid of this msg, I'm not sure if there is anything we could say here that is succint, accurate, and still useful. I think I added this msg during some early debugging. Do you think its worth logging "Received fetch failure from $task, but its from $failedStage which is no longer running"? Maybe a
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Doing a logDebug seems fine if you think it will be useful! (I never turn on debug level logging just because Spark's debug logging is sooo verbose, but I can imagine that others probably do turn it on!)
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It looks like this log statement is still lingering?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. woops, sorry I forgot about this one! thanks, changed to the logDebug |
||
| s"which is no longer running") | ||
| } | ||
|
|
||
| if (disallowStageRetryForTest) { | ||
| abortStage(failedStage, "Fetch failure will not retry stage due to testing config") | ||
| } else if (failedStages.isEmpty) { | ||
| // Don't schedule an event to resubmit failed stages if failed isn't empty, because | ||
| // in that case the event will already have been scheduled. | ||
| // TODO: Cancel running tasks in the stage | ||
| logInfo(s"Resubmitting $mapStage (${mapStage.name}) and " + | ||
| s"$failedStage (${failedStage.name}) due to fetch failure") | ||
| messageScheduler.schedule(new Runnable { | ||
| override def run(): Unit = eventProcessLoop.post(ResubmitFailedStages) | ||
| }, DAGScheduler.RESUBMIT_TIMEOUT, TimeUnit.MILLISECONDS) | ||
| } | ||
| failedStages += failedStage | ||
| failedStages += mapStage | ||
| // Mark the map whose fetch failed as broken in the map stage | ||
| if (mapId != -1) { | ||
| mapStage.removeOutputLoc(mapId, bmAddress) | ||
| mapOutputTracker.unregisterMapOutput(shuffleId, mapId, bmAddress) | ||
| } | ||
|
|
||
| // TODO: mark the executor as failed only if there were lots of fetch failures on it | ||
| if (bmAddress != null) { | ||
| handleExecutorLost(bmAddress.executorId, fetchFailed = true, Some(task.epoch)) | ||
| // TODO: mark the executor as failed only if there were lots of fetch failures on it | ||
| if (bmAddress != null) { | ||
| handleExecutorLost(bmAddress.executorId, fetchFailed = true, Some(task.epoch)) | ||
| } | ||
| } | ||
|
|
||
| case commitDenied: TaskCommitDenied => | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -75,9 +75,9 @@ private[spark] class TaskSchedulerImpl( | |
|
|
||
| // TaskSetManagers are not thread safe, so any access to one should be synchronized | ||
| // on this class. | ||
| val activeTaskSets = new HashMap[String, TaskSetManager] | ||
| val taskSetsByStageIdAndAttempt = new HashMap[Int, HashMap[Int, TaskSetManager]] | ||
|
|
||
| val taskIdToTaskSetId = new HashMap[Long, String] | ||
| val taskIdToStageIdAndAttempt = new HashMap[Long, (Int, Int)] | ||
| val taskIdToExecutorId = new HashMap[Long, String] | ||
|
|
||
| @volatile private var hasReceivedTask = false | ||
|
|
@@ -162,7 +162,17 @@ private[spark] class TaskSchedulerImpl( | |
| logInfo("Adding task set " + taskSet.id + " with " + tasks.length + " tasks") | ||
| this.synchronized { | ||
| val manager = createTaskSetManager(taskSet, maxTaskFailures) | ||
| activeTaskSets(taskSet.id) = manager | ||
| val stage = taskSet.stageId | ||
| val stageTaskSets = | ||
| taskSetsByStageIdAndAttempt.getOrElseUpdate(stage, new HashMap[Int, TaskSetManager]) | ||
| stageTaskSets(taskSet.stageAttemptId) = manager | ||
| val conflictingTaskSet = stageTaskSets.exists { case (_, ts) => | ||
| ts.taskSet != taskSet && !ts.isZombie | ||
| } | ||
| if (conflictingTaskSet) { | ||
| throw new IllegalStateException(s"more than one active taskSet for stage $stage:" + | ||
| s" ${stageTaskSets.toSeq.map{_._2.taskSet.id}.mkString(",")}") | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @kayousterhout How much of a concern should the extra overhead be here? Just wondering whether this (let's hope rare) condition might better be handled only in a non-production environment and behind an
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps the code could just look for an existing task set that matches the stage ID of the task set being added? That should be a little better than the filter / groupBy. Something like:
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. restoring the comments from the old diff b/c they are still relevant: from mark:
from marcelo:
good point, there isn't any need to do the I'd really rather leave the check in place. In fact I think this fail-fast behavior is especially important in a production environment -- that's much better than an infinite loop of failures.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I totally agree that it is valuable to catch the failure in production. I'm only suggesting that at some point the check becomes a big enough performance hit that it makes sense to compromise on the fail-fast desiderata in order to maintain production performance while trying to ensure in development that the failure can never occur. I doubt that this check is that costly, but my expectation is that Kay has a better sense of how much more we can afford to do within this synchronized block.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if you change activeTaskSets to be keyed on the stageId? That seems pretty easy to do and would make this check O(1) rather than O(N) |
||
| schedulableBuilder.addTaskSetManager(manager, manager.taskSet.properties) | ||
|
|
||
| if (!isLocal && !hasReceivedTask) { | ||
|
|
@@ -192,19 +202,21 @@ private[spark] class TaskSchedulerImpl( | |
|
|
||
| override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = synchronized { | ||
| logInfo("Cancelling stage " + stageId) | ||
| activeTaskSets.find(_._2.stageId == stageId).foreach { case (_, tsm) => | ||
| // There are two possible cases here: | ||
| // 1. The task set manager has been created and some tasks have been scheduled. | ||
| // In this case, send a kill signal to the executors to kill the task and then abort | ||
| // the stage. | ||
| // 2. The task set manager has been created but no tasks has been scheduled. In this case, | ||
| // simply abort the stage. | ||
| tsm.runningTasksSet.foreach { tid => | ||
| val execId = taskIdToExecutorId(tid) | ||
| backend.killTask(tid, execId, interruptThread) | ||
| taskSetsByStageIdAndAttempt.get(stageId).foreach { attempts => | ||
| attempts.foreach { case (_, tsm) => | ||
| // There are two possible cases here: | ||
| // 1. The task set manager has been created and some tasks have been scheduled. | ||
| // In this case, send a kill signal to the executors to kill the task and then abort | ||
| // the stage. | ||
| // 2. The task set manager has been created but no tasks has been scheduled. In this case, | ||
| // simply abort the stage. | ||
| tsm.runningTasksSet.foreach { tid => | ||
| val execId = taskIdToExecutorId(tid) | ||
| backend.killTask(tid, execId, interruptThread) | ||
| } | ||
| tsm.abort("Stage %s cancelled".format(stageId)) | ||
| logInfo("Stage %d was cancelled".format(stageId)) | ||
| } | ||
| tsm.abort("Stage %s cancelled".format(stageId)) | ||
| logInfo("Stage %d was cancelled".format(stageId)) | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -214,7 +226,12 @@ private[spark] class TaskSchedulerImpl( | |
| * cleaned up. | ||
| */ | ||
| def taskSetFinished(manager: TaskSetManager): Unit = synchronized { | ||
| activeTaskSets -= manager.taskSet.id | ||
| taskSetsByStageIdAndAttempt.get(manager.taskSet.stageId).foreach { taskSetsForStage => | ||
| taskSetsForStage -= manager.taskSet.stageAttemptId | ||
| if (taskSetsForStage.isEmpty) { | ||
| taskSetsByStageIdAndAttempt -= manager.taskSet.stageId | ||
| } | ||
| } | ||
| manager.parent.removeSchedulable(manager) | ||
| logInfo("Removed TaskSet %s, whose tasks have all completed, from pool %s" | ||
| .format(manager.taskSet.id, manager.parent.name)) | ||
|
|
@@ -235,7 +252,8 @@ private[spark] class TaskSchedulerImpl( | |
| for (task <- taskSet.resourceOffer(execId, host, maxLocality)) { | ||
| tasks(i) += task | ||
| val tid = task.taskId | ||
| taskIdToTaskSetId(tid) = taskSet.taskSet.id | ||
| taskIdToStageIdAndAttempt(tid) = | ||
| (taskSet.taskSet.stageId, taskSet.taskSet.stageAttemptId) | ||
| taskIdToExecutorId(tid) = execId | ||
| executorsByHost(host) += execId | ||
| availableCpus(i) -= CPUS_PER_TASK | ||
|
|
@@ -319,26 +337,24 @@ private[spark] class TaskSchedulerImpl( | |
| failedExecutor = Some(execId) | ||
| } | ||
| } | ||
| taskIdToTaskSetId.get(tid) match { | ||
| case Some(taskSetId) => | ||
| taskSetManagerForTask(tid) match { | ||
| case Some(taskSet) => | ||
| if (TaskState.isFinished(state)) { | ||
| taskIdToTaskSetId.remove(tid) | ||
| taskIdToStageIdAndAttempt.remove(tid) | ||
| taskIdToExecutorId.remove(tid) | ||
| } | ||
| activeTaskSets.get(taskSetId).foreach { taskSet => | ||
| if (state == TaskState.FINISHED) { | ||
| taskSet.removeRunningTask(tid) | ||
| taskResultGetter.enqueueSuccessfulTask(taskSet, tid, serializedData) | ||
| } else if (Set(TaskState.FAILED, TaskState.KILLED, TaskState.LOST).contains(state)) { | ||
| taskSet.removeRunningTask(tid) | ||
| taskResultGetter.enqueueFailedTask(taskSet, tid, state, serializedData) | ||
| } | ||
| if (state == TaskState.FINISHED) { | ||
| taskSet.removeRunningTask(tid) | ||
| taskResultGetter.enqueueSuccessfulTask(taskSet, tid, serializedData) | ||
| } else if (Set(TaskState.FAILED, TaskState.KILLED, TaskState.LOST).contains(state)) { | ||
| taskSet.removeRunningTask(tid) | ||
| taskResultGetter.enqueueFailedTask(taskSet, tid, state, serializedData) | ||
| } | ||
| case None => | ||
| logError( | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. there is a small change in logic here. Before, we'd hit this case only if This wasn't exactly intentional, it just fell out from an attempt at refactoring a helper method -- but it does seems to more accurately correspond to what is in the error msg. Still I'm not certain about it. Perhaps if nobody else has high confidence in this change I should just go back to the old behavior.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is fine! |
||
| ("Ignoring update with state %s for TID %s because its task set is gone (this is " + | ||
| "likely the result of receiving duplicate task finished status updates)") | ||
| .format(state, tid)) | ||
| "likely the result of receiving duplicate task finished status updates)") | ||
| .format(state, tid)) | ||
| } | ||
| } catch { | ||
| case e: Exception => logError("Exception in statusUpdate", e) | ||
|
|
@@ -363,9 +379,13 @@ private[spark] class TaskSchedulerImpl( | |
|
|
||
| val metricsWithStageIds: Array[(Long, Int, Int, TaskMetrics)] = synchronized { | ||
| taskMetrics.flatMap { case (id, metrics) => | ||
| taskIdToTaskSetId.get(id) | ||
| .flatMap(activeTaskSets.get) | ||
| .map(taskSetMgr => (id, taskSetMgr.stageId, taskSetMgr.taskSet.attempt, metrics)) | ||
| for { | ||
| (stageId, stageAttemptId) <- taskIdToStageIdAndAttempt.get(id) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why not use the taskSetManagerForTask(id) method here? |
||
| attempts <- taskSetsByStageIdAndAttempt.get(stageId) | ||
| taskSetMgr <- attempts.get(stageAttemptId) | ||
| } yield { | ||
| (id, taskSetMgr.stageId, taskSetMgr.taskSet.stageAttemptId, metrics) | ||
| } | ||
| } | ||
| } | ||
| dagScheduler.executorHeartbeatReceived(execId, metricsWithStageIds, blockManagerId) | ||
|
|
@@ -397,9 +417,12 @@ private[spark] class TaskSchedulerImpl( | |
|
|
||
| def error(message: String) { | ||
| synchronized { | ||
| if (activeTaskSets.nonEmpty) { | ||
| if (taskSetsByStageIdAndAttempt.nonEmpty) { | ||
| // Have each task set throw a SparkException with the error | ||
| for ((taskSetId, manager) <- activeTaskSets) { | ||
| for { | ||
| attempts <- taskSetsByStageIdAndAttempt.values | ||
| manager <- attempts.values | ||
| } { | ||
| try { | ||
| manager.abort(message) | ||
| } catch { | ||
|
|
@@ -520,6 +543,23 @@ private[spark] class TaskSchedulerImpl( | |
|
|
||
| override def applicationAttemptId(): Option[String] = backend.applicationAttemptId() | ||
|
|
||
| private[scheduler] def taskSetManagerForTask(taskId: Long): Option[TaskSetManager] = { | ||
| taskIdToStageIdAndAttempt.get(taskId).flatMap{ case (stageId, stageAttemptId) => | ||
| taskSetManagerForAttempt(stageId, stageAttemptId) | ||
| } | ||
| } | ||
|
|
||
| private[scheduler] def taskSetManagerForAttempt( | ||
| stageId: Int, | ||
| stageAttemptId: Int): Option[TaskSetManager] = { | ||
| for { | ||
| attempts <- taskSetsByStageIdAndAttempt.get(stageId) | ||
| manager <- attempts.get(stageAttemptId) | ||
| } yield { | ||
| manager | ||
| } | ||
| } | ||
|
|
||
| } | ||
|
|
||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you change this to s" ${task.stageAttemptId}, and there is a more recent attempt for that stage (attempt ID ${failedStage.latestInfo.attemptId}) running"? It's a little misleading as-is, because we actually do still do something with the fetch failure if the stage has already been marked as failed, as long as there's not already a newer version of the stage running.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
yes, good idea. done