@@ -63,8 +63,6 @@ object CoarseCookSchedulerBackend {
6363 }
6464}
6565
66-
67-
6866/**
6967 * A SchedulerBackend that runs tasks using Cook, using "coarse-grained" tasks, where it holds
7068 * onto Cook instances for the duration of the Spark job instead of relinquishing cores whenever
@@ -79,27 +77,11 @@ class CoarseCookSchedulerBackend(
7977 cookPort: Int)
8078 extends CoarseGrainedSchedulerBackend(scheduler, sc.env.rpcEnv) with Logging {
8179
82- val maxCores = conf.getInt("spark.cores.max", 0)
83- val maxCoresPerJob = conf.getInt("spark.executor.cores", 1)
84- val priority = conf.getInt("spark.cook.priority", 75)
85- val jobNamePrefix = conf.get("spark.cook.job.name.prefix", "sparkjob")
86- val maxFailures = conf.getInt("spark.executor.failures", 5)
87- val dynamicAllocationEnabled = conf.getBoolean("spark.dynamicAllocation.enabled", false)
88-
89- if (conf.contains("spark.cores.max") && dynamicAllocationEnabled) {
90- logWarning("spark.cores.max is ignored when dynamic allocation is enabled. Use spark.dynamicAllocation.maxExecutors instead")
91- }
92-
93- def currentInstancesToRequest: Int = (executorsToRequest - totalInstancesRequested)
94- var executorsToRequest: Int = if (dynamicAllocationEnabled) {
95- conf.getInt("spark.dynamicAllocation.minExecutors", 0)
96- } else {
97- maxCores / maxCoresPerJob
98- }
99- var totalInstancesRequested = 0
100- var totalFailures = 0
101- val jobIds = mutable.Set[UUID]()
102- val abortedJobIds = mutable.Set[UUID]()
80+ private[this] val schedulerConf = CookSchedulerConfiguration.conf(conf)
81+ private[this] var executorsRequested = 0
82+ private[this] var totalFailures = 0
83+ private[this] val jobIds = mutable.Set[UUID]()
84+ private[this] val abortedJobIds = mutable.Set[UUID]()
10385
10486 private[this] val jobClient = new JobClient.Builder()
10587 .setHost(cookHost)
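Note: the `CookSchedulerConfiguration` wrapper that replaces the inline `conf.get*` lookups above is defined outside these hunks. Below is a minimal sketch of the shape implied by the accessors used in this file, carrying over the removed defaults; the reconciliation arithmetic and the way `setMaximumCores` feeds the executor target are assumptions, not the actual implementation:

    import org.apache.spark.SparkConf

    // Hypothetical sketch; the real class lives elsewhere in this change.
    class CookSchedulerConfiguration(conf: SparkConf) {
      private[this] var maximumCores = conf.getInt("spark.cores.max", 0)
      private[this] val dynamicAllocationEnabled =
        conf.getBoolean("spark.dynamicAllocation.enabled", false)

      val getCoresPerCookJob: Int = conf.getInt("spark.executor.cores", 1)
      val getPriorityPerCookJob: Int = conf.getInt("spark.cook.priority", 75)
      val getPrefixOfCookJobName: String = conf.get("spark.cook.job.name.prefix", "sparkjob")
      val getMaximumExecutorFailures: Int = conf.getInt("spark.executor.failures", 5)

      // Assumed: records the new core budget (under dynamic allocation the real
      // class presumably folds this into the executor target as well).
      def setMaximumCores(cores: Int): Unit = { maximumCores = cores }

      private def targetExecutors: Int =
        if (dynamicAllocationEnabled) conf.getInt("spark.dynamicAllocation.minExecutors", 0)
        else maximumCores / getCoresPerCookJob

      // Deltas against what has already been requested; at most one is positive.
      def getExecutorsToRequest(requested: Int): Int = math.max(targetExecutors - requested, 0)
      def getExecutorsToKill(requested: Int): Int = math.max(requested - targetExecutors, 0)
    }

    object CookSchedulerConfiguration {
      def conf(conf: SparkConf): CookSchedulerConfiguration = new CookSchedulerConfiguration(conf)
    }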
@@ -117,7 +99,7 @@ class CoarseCookSchedulerBackend(
11799 val isAborted = abortedJobIds.contains(job.getUUID)
118100
119101 if (isCompleted) {
120- totalInstancesRequested -= 1
102+ executorsRequested -= 1
121103 abortedJobIds -= job.getUUID
122104 jobIds -= job.getUUID
123105
@@ -127,18 +109,21 @@ class CoarseCookSchedulerBackend(
127109
128110 if (!job.isSuccess && !isAborted) {
129111 totalFailures += 1
130- logWarning(s"Job ${job.getUUID} has died. Failure ($totalFailures/$maxFailures)")
112+ logWarning(s"Job ${job.getUUID} has died. " +
113+ s"Failure ($totalFailures/${schedulerConf.getMaximumExecutorFailures})")
131114 jobIds -= job.getUUID
132- if (totalFailures >= maxFailures) {
115+ if (totalFailures >= schedulerConf.getMaximumExecutorFailures) {
133116 // TODO should we abort the outstanding tasks now?
134- logError(s"We have exceeded our maximum failures ($maxFailures) " +
117+ logError(s"We have exceeded our maximum failures " +
118+ s"(${schedulerConf.getMaximumExecutorFailures}) " +
135119 "and will not relaunch any more tasks")
136120 }
137121 }
138122 }
139123 }
140124 }
141- def executorUUIDWriter: UUID => Unit =
125+
126+ private def executorUUIDWriter: UUID => Unit =
142127 conf.getOption("spark.cook.executoruuid.log").fold { _: UUID => () } { _file =>
143128 def file(ct: Int) = s"${_file}.$ct"
144129 def path(ct: Int) = Paths.get(file(ct))
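The `Option.fold` above is what makes UUID logging optional: with `spark.cook.executoruuid.log` unset, `executorUUIDWriter` degenerates to a no-op `UUID => Unit`. A standalone sketch of the same idiom (the rotation across `${_file}.$ct` suffixes in the real code is omitted here):

    import java.nio.file.{Files, Paths, StandardOpenOption}
    import java.util.UUID

    def uuidWriter(logFile: Option[String]): UUID => Unit =
      logFile.fold[UUID => Unit] { _ => () } { file => uuid =>
        // Append one UUID per line; the empty Option yields the no-op branch.
        Files.write(Paths.get(file), s"$uuid\n".getBytes("UTF-8"),
          StandardOpenOption.CREATE, StandardOpenOption.APPEND)
      }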
@@ -167,13 +152,13 @@ class CoarseCookSchedulerBackend(
167152 }
168153 }
169154
170- val sparkMesosScheduler =
155+ private[this] val sparkMesosScheduler =
171156 new CoarseMesosSchedulerBackend(scheduler, sc, "", sc.env.securityManager)
172157
173158 override def applicationId(): String = conf.get("spark.cook.applicationId", super.applicationId())
174159 override def applicationAttemptId(): Option[String] = Some(applicationId())
175160
176- def createJob(numCores: Double): Job = {
161+ private def createJob(numCores: Double): Job = {
177162 import CoarseCookSchedulerBackend.fetchUri
178163
179164 val jobId = UUID.randomUUID()
@@ -272,11 +257,11 @@ class CoarseCookSchedulerBackend(
272257
273258 val builder = new Job.Builder()
274259 .setUUID(jobId)
275- .setName(jobNamePrefix)
260+ .setName(schedulerConf.getPrefixOfCookJobName)
276261 .setCommand(cmds.mkString("; "))
277262 .setMemory(sparkMesosScheduler.calculateTotalMemory(sc).toDouble)
278263 .setCpus(numCores)
279- .setPriority(priority)
264+ .setPriority(schedulerConf.getPriorityPerCookJob)
280265
281266 val container = conf.get("spark.executor.cook.container", null)
282267 if (container != null) {
@@ -288,7 +273,8 @@ class CoarseCookSchedulerBackend(
288273 builder.build()
289274 }
290275
291- private[this] val minExecutorsNecessary = currentInstancesToRequest * minRegisteredRatio
276+ private[this] val minExecutorsNecessary =
277+ schedulerConf.getExecutorsToRequest(0) * minRegisteredRatio
292278
293279 override def sufficientResourcesRegistered(): Boolean =
294280 totalRegisteredExecutors.get >= minExecutorsNecessary
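`minRegisteredRatio` comes from the parent `CoarseGrainedSchedulerBackend` and is driven by `spark.scheduler.minRegisteredResourcesRatio`, so the backend only waits for a fraction of the initial request before scheduling starts. A worked example with assumed values:

    val initialExecutors = 8       // schedulerConf.getExecutorsToRequest(0)
    val minRegisteredRatio = 0.8   // spark.scheduler.minRegisteredResourcesRatio
    val minExecutorsNecessary = initialExecutors * minRegisteredRatio // 6.4
    // With 6 registered executors 6 >= 6.4 is false; the 7th makes the check pass.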
@@ -307,7 +293,7 @@ class CoarseCookSchedulerBackend(
307293 ret
308294 }
309295
310- // In our fake offer mesos adds some autoincrementing ID per job but
296+ // In our fake offer Mesos adds some auto-incrementing ID per job, but
311297 // this sticks around in the executorId so we strip it out to get the actual executor ID
312298 private def instanceIdFromExecutorId(executorId: String): UUID = {
313299 UUID.fromString(executorId.split('/')(0))
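For reference, the executor IDs parsed here look like `<instance-uuid>/<counter>`; only the UUID half identifies the Cook instance. A small example with a hypothetical ID:

    // Hypothetical executor ID of the shape described in the comment above.
    val executorId = "1c6b1b52-9a34-4a40-9b29-0a1c2c3d4e5f/0"
    val instanceId = java.util.UUID.fromString(executorId.split('/')(0))
    // instanceId is the UUID 1c6b1b52-9a34-4a40-9b29-0a1c2c3d4e5f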
@@ -369,8 +355,8 @@ class CoarseCookSchedulerBackend(
369355
370356 override def doRequestTotalExecutors(requestedTotal: Int): Boolean = {
371357 logInfo(s"Setting total number of executors to request to $requestedTotal")
372- executorsToRequest = requestedTotal
373- requestRemainingInstances()
358+ schedulerConf.setMaximumCores(requestedTotal)
359+ requestExecutorsIfNecessary()
374360 true
375361 }
376362
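`doRequestTotalExecutors` is the hook Spark's dynamic allocation machinery invokes with a new executor target; the backend records it and reconciles immediately rather than waiting for the next timer tick. A hedged trigger sketch via the developer API:

    // With spark.dynamicAllocation.enabled=true the ExecutorAllocationManager
    // issues equivalent calls automatically; this is the manual route.
    sc.requestExecutors(2) // eventually lands in doRequestTotalExecutors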
@@ -384,21 +370,39 @@ class CoarseCookSchedulerBackend(
384370 @annotation.tailrec
385371 def loop(instancesRemaining: Double, jobs: List[Job]): List[Job] =
386372 if (instancesRemaining <= 0) jobs
387- else loop(instancesRemaining - 1, createJob(maxCoresPerJob) :: jobs)
388- loop(currentInstancesToRequest, Nil).reverse
373+ else loop(instancesRemaining - 1, createJob(schedulerConf.getCoresPerCookJob) :: jobs)
374+
375+ loop(schedulerConf.getExecutorsToRequest(executorsRequested), Nil).reverse
389376 }
390377
391378 /**
392- * Request cores from Cook via Cook jobs.
379+ * Kill the extra executors if necessary.
393380 */
394- private[this] def requestRemainingInstances(): Unit = {
381+ private[this] def killExecutorsIfNecessary(): Unit = {
382+ val executorsToKill = schedulerConf.getExecutorsToKill(executorsRequested)
383+ if (executorsToKill > 0) {
384+ val jobIdsToKill = jobIds.take(executorsToKill)
385+ Try[Unit](jobClient.abort(jobIdsToKill.asJava)) match {
386+ case Failure(e) =>
387+ logWarning("Failed to abort redundant jobs", e)
388+ case Success(_) =>
389+ logInfo(s"Successfully aborted $executorsToKill jobs.")
390+ jobIdsToKill.foreach(abortedJobIds.add)
391+ }
392+ }
393+ }
394+
395+ /**
396+ * Request more executors from Cook via Cook jobs if necessary.
397+ */
398+ private[this] def requestExecutorsIfNecessary(): Unit = {
395399 val jobs = createRemainingJobs()
396400 if (jobs.nonEmpty) {
397401 Try[Unit](jobClient.submit(jobs.asJava, jobListener)) match {
398402 case Failure(e) => logWarning("Can't request more instances", e)
399403 case Success(_) => {
400404 logInfo(s"Successfully requested ${jobs.size} instances")
401- totalInstancesRequested += jobs.size
405+ executorsRequested += jobs.size
402406 jobs.map(_.getUUID).foreach(jobIds.add)
403407 }
404408 }
@@ -421,9 +425,12 @@ class CoarseCookSchedulerBackend(
421425 override def start(): Unit = {
422426 super.start()
423427
424- requestRemainingInstances()
428+ requestExecutorsIfNecessary()
425429 resourceManagerService.scheduleAtFixedRate(new Runnable() {
426- override def run(): Unit = requestRemainingInstances()
430+ override def run(): Unit = {
431+ requestExecutorsIfNecessary()
432+ killExecutorsIfNecessary()
433+ }
427434 }, 10, 10, TimeUnit.SECONDS)
428435 }
429436
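`resourceManagerService` itself is not constructed in these hunks. A minimal sketch of a fit for this 10-second reconcile loop, assuming a single daemon thread suffices (not the actual field definition):

    import java.util.concurrent.{Executors, ScheduledExecutorService, ThreadFactory}

    // Hypothetical construction; the real field is defined outside this diff.
    private[this] val resourceManagerService: ScheduledExecutorService =
      Executors.newSingleThreadScheduledExecutor(new ThreadFactory {
        override def newThread(r: Runnable): Thread = {
          val t = new Thread(r, "cook-resource-manager")
          t.setDaemon(true) // don't keep the JVM alive just for the reconcile loop
          t
        }
      })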