[SPARK-4899][MESOS] Enable mesos checkpointing #26
@@ -516,6 +516,26 @@ See the [configuration page](configuration.html) for information on Spark config
  Fetcher Cache</a>
  </td>
</tr>
<tr>
  <td><code>spark.mesos.checkpoint</code></td>
  <td>false</td>
  <td>
    If set, agents running tasks started by this framework will write the framework pid, executor pids and status updates to disk.
    If the agent exits (e.g., due to a crash or as part of upgrading Mesos), this checkpointed data allows the restarted agent to
    reconnect to executors that were started by the old instance of the agent. Enabling checkpointing improves fault tolerance,
    at the cost of a (usually small) increase in disk I/O.
  </td>
</tr>
<tr>
  <td><code>spark.mesos.failoverTimeout</code></td>
  <td>0.0</td>
  <td>
    The amount of time (in seconds) that the master will wait for the scheduler to failover before it tears down the framework
    by killing all its tasks/executors. This should be non-zero if a framework expects to reconnect after a failure and not lose
    its tasks/executors.
    NOTE: To avoid accidental destruction of tasks, production frameworks typically set this to a large value (e.g., 1 week).
  </td>
</tr>
</table>
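For reference, a minimal sketch of turning both settings on from application code; the master URL and app name here are placeholders, and the same properties can equally be passed with `--conf` on `spark-submit`:

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Sketch only: enable agent checkpointing and give the master a one-week
// failover window before it tears down this framework's tasks/executors.
val conf = new SparkConf()
  .setAppName("checkpointed-app")                // placeholder app name
  .setMaster("mesos://zk://host:2181/mesos")     // placeholder master URL
  .set("spark.mesos.checkpoint", "true")         // agents persist pids and status updates
  .set("spark.mesos.failoverTimeout", "604800")  // in seconds; 604800 = 1 week
val sc = new SparkContext(conf)
```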
# Troubleshooting and Debugging
@@ -139,6 +139,42 @@ class MesosCoarseGrainedSchedulerBackendSuite extends SparkFunSuite
    assert(cpus == offerCores)
  }
| test("mesos supports checkpointing") { | ||
|
|
||
| val checkpoint = true | ||
| val failoverTimeout = 10 | ||
| setBackend(Map("spark.mesos.checkpoint" -> checkpoint.toString, | ||
| "spark.mesos.failoverTimeout" -> failoverTimeout.toString)) | ||
|
|
||
| val taskScheduler = mock[TaskSchedulerImpl] | ||
| when(taskScheduler.sc).thenReturn(sc) | ||
| val driver = mock[SchedulerDriver] | ||
| when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING) | ||
| val securityManager = mock[SecurityManager] | ||
|
|
||
| val backend = new MesosCoarseGrainedSchedulerBackend( | ||
| taskScheduler, sc, "master", securityManager) { | ||
| override protected def createSchedulerDriver( | ||
| masterUrl: String, | ||
| scheduler: Scheduler, | ||
| sparkUser: String, | ||
| appName: String, | ||
| conf: SparkConf, | ||
| webuiUrl: Option[String] = None, | ||
| checkpoint: Option[Boolean] = None, | ||
| failoverTimeout: Option[Double] = None, | ||
| frameworkId: Option[String] = None): SchedulerDriver = { | ||
| markRegistered() | ||
| assert(checkpoint.equals(true)) | ||
| assert(failoverTimeout.equals(10)) | ||
|
||
| driver | ||
| } | ||
| } | ||
|
|
||
| backend.start() | ||
|
|
||
| } | ||
|
|
||
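The overridden `createSchedulerDriver` above receives the new settings as `Option` values. As a hedged sketch of what the backend plausibly does with them: the Mesos protobuf API exposes `setCheckpoint` and `setFailoverTimeout` on the `FrameworkInfo` builder, though the helper below and its name are illustrative, not code from this PR:

```scala
import org.apache.mesos.Protos.FrameworkInfo

// Hypothetical helper: copy the optional settings onto the FrameworkInfo
// builder the driver is created with. Unset options leave the protobuf
// defaults (checkpoint = false, failover_timeout = 0.0) in place, matching
// the documented defaults above.
def applyCheckpointSettings(
    builder: FrameworkInfo.Builder,
    checkpoint: Option[Boolean],
    failoverTimeout: Option[Double]): FrameworkInfo.Builder = {
  checkpoint.foreach(builder.setCheckpoint)
  failoverTimeout.foreach(builder.setFailoverTimeout)
  builder
}
```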
| test("mesos does not acquire more than spark.cores.max") { | ||
| val maxCores = 10 | ||
| setBackend(Map("spark.cores.max" -> maxCores.toString)) | ||
|
|
||
@@ -79,6 +79,42 @@ class MesosFineGrainedSchedulerBackendSuite
    backend.start()
  }
| test("mesos supports checkpointing") { | ||
| val conf = new SparkConf | ||
| conf.set("spark.mesos.checkpoint", "true") | ||
| conf.set("spark.mesos.failoverTimeout", "10") | ||
|
|
||
| val sc = mock[SparkContext] | ||
| when(sc.conf).thenReturn(conf) | ||
| when(sc.sparkUser).thenReturn("sparkUser1") | ||
| when(sc.appName).thenReturn("appName1") | ||
|
|
||
| val taskScheduler = mock[TaskSchedulerImpl] | ||
| val driver = mock[SchedulerDriver] | ||
| when(driver.start()).thenReturn(Protos.Status.DRIVER_RUNNING) | ||
|
|
||
| val backend = new MesosFineGrainedSchedulerBackend(taskScheduler, sc, "master") { | ||
| override protected def createSchedulerDriver( | ||
| masterUrl: String, | ||
| scheduler: Scheduler, | ||
| sparkUser: String, | ||
| appName: String, | ||
| conf: SparkConf, | ||
| webuiUrl: Option[String] = None, | ||
| checkpoint: Option[Boolean] = None, | ||
| failoverTimeout: Option[Double] = None, | ||
| frameworkId: Option[String] = None): SchedulerDriver = { | ||
| markRegistered() | ||
| assert(checkpoint.equals(true)) | ||
| assert(failoverTimeout.equals(10)) | ||
|
||
| driver | ||
| } | ||
| } | ||
|
|
||
| backend.start() | ||
|
|
||
| } | ||
|
|
||
| test("Use configured mesosExecutor.cores for ExecutorInfo") { | ||
| val mesosExecutorCores = 3 | ||
| val conf = new SparkConf | ||
|
|
||
Review comment: Typo, "thescheduler" -> "the scheduler"