Skip to content

Commit 6b5980d

Browse files
committed
Set a limited number of retries in standalone deploy mode.
1 parent 9a449e0 commit 6b5980d

3 files changed

Lines changed: 34 additions & 14 deletions

File tree

core/src/main/scala/spark/deploy/master/JobInfo.scala

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,4 +31,13 @@ class JobInfo(val id: String, val desc: JobDescription, val submitDate: Date, va
3131
}
3232

3333
// Difference between desc.cores and coresGranted.
// NOTE(review): presumably the cores this job is still waiting to be granted — confirm against the scheduler.
def coresLeft: Int = desc.cores - coresGranted
34+
35+
// Backing counter for retryCount; starts at 0 and, in the lines visible here, is modified only by incrementRetryCount.
private var _retryCount = 0
36+
37+
// Read-only accessor for the current value of the retry counter.
def retryCount = _retryCount
38+
39+
// Adds 1 to the retry counter and returns the updated value.
// Master calls this after removing a finished executor and compares the result with
// JobState.MAX_NUM_RETRY to decide whether to call schedule() again.
def incrementRetryCount = {
40+
_retryCount += 1
41+
_retryCount
42+
}
3443
}

core/src/main/scala/spark/deploy/master/JobState.scala

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,6 @@ object JobState extends Enumeration("WAITING", "RUNNING", "FINISHED", "FAILED")
44
type JobState = Value
55

66
val WAITING, RUNNING, FINISHED, FAILED = Value
7+
8+
// Upper bound that Master compares against JobInfo.incrementRetryCount: while the incremented
// count is <= this value the master calls schedule() again; beyond it a SparkException is thrown.
val MAX_NUM_RETRY = 10
79
}

core/src/main/scala/spark/deploy/master/Master.scala

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,18 @@
11
package spark.deploy.master
22

3-
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
4-
53
import akka.actor._
6-
import spark.{Logging, Utils}
7-
import spark.util.AkkaUtils
4+
import akka.actor.Terminated
5+
import akka.remote.{RemoteClientLifeCycleEvent, RemoteClientDisconnected, RemoteClientShutdown}
6+
87
import java.text.SimpleDateFormat
98
import java.util.Date
10-
import akka.remote.RemoteClientLifeCycleEvent
9+
10+
import scala.collection.mutable.{ArrayBuffer, HashMap, HashSet}
11+
1112
import spark.deploy._
12-
import akka.remote.RemoteClientShutdown
13-
import akka.remote.RemoteClientDisconnected
14-
import spark.deploy.RegisterWorker
15-
import spark.deploy.RegisterWorkerFailed
16-
import akka.actor.Terminated
13+
import spark.{Logging, SparkException, Utils}
14+
import spark.util.AkkaUtils
15+
1716

1817
class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging {
1918
val DATE_FORMAT = new SimpleDateFormat("yyyyMMddHHmmss") // For job IDs
@@ -81,12 +80,22 @@ class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging {
8180
exec.state = state
8281
exec.job.actor ! ExecutorUpdated(execId, state, message)
8382
if (ExecutorState.isFinished(state)) {
83+
val jobInfo = idToJob(jobId)
8484
// Remove this executor from the worker and job
8585
logInfo("Removing executor " + exec.fullId + " because it is " + state)
86-
idToJob(jobId).removeExecutor(exec)
86+
jobInfo.removeExecutor(exec)
8787
exec.worker.removeExecutor(exec)
88-
// TODO: the worker would probably want to restart the executor a few times
89-
schedule()
88+
89+
// Only retry a certain number of times so we don't go into an infinite loop.
90+
if (jobInfo.incrementRetryCount <= JobState.MAX_NUM_RETRY) {
91+
schedule()
92+
} else {
93+
val e = new SparkException("Job %s wth ID %s failed %d times.".format(
94+
jobInfo.desc.name, jobInfo.id, jobInfo.retryCount))
95+
logError(e.getMessage, e)
96+
throw e
97+
//System.exit(1)
98+
}
9099
}
91100
}
92101
case None =>
@@ -112,7 +121,7 @@ class Master(ip: String, port: Int, webUiPort: Int) extends Actor with Logging {
112121
addressToWorker.get(address).foreach(removeWorker)
113122
addressToJob.get(address).foreach(removeJob)
114123
}
115-
124+
116125
case RequestMasterState => {
117126
sender ! MasterState(ip + ":" + port, workers.toList, jobs.toList, completedJobs.toList)
118127
}

0 commit comments

Comments
 (0)