-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-3015] Block on cleaning tasks to prevent Akka timeouts #1931
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
0b7e768
104b366
9fd1fe6
a183b83
111192a
ce9daf5
d0f7195
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,6 +18,7 @@ | |
| package org.apache.spark | ||
|
|
||
| import java.lang.ref.{ReferenceQueue, WeakReference} | ||
| import java.lang.reflect.Field | ||
|
|
||
| import scala.collection.mutable.{ArrayBuffer, SynchronizedBuffer} | ||
|
|
||
|
|
@@ -64,12 +65,38 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { | |
|
|
||
| private val cleaningThread = new Thread() { override def run() { keepCleaning() }} | ||
|
|
||
| /** | ||
| * Keep track of the reference queue length and log an error if this exceeds a certain capacity. | ||
| * Unfortunately, Java's ReferenceQueue exposes neither the queue length nor the enqueue method, | ||
| * so we have to do this through reflection. This is expensive, however, so we should access | ||
| * this field only once in a while. | ||
| */ | ||
| private val queueCapacity = 10000 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Well, this is not the capacity. It is just a warning threshold. It should be named accordingly. |
||
| private var queueFullErrorMessageLogged = false | ||
| private val queueLengthAccessor: Option[Field] = { | ||
| try { | ||
| val f = classOf[ReferenceQueue[AnyRef]].getDeclaredField("queueLength") | ||
| f.setAccessible(true) | ||
| Some(f) | ||
| } catch { | ||
| case e: Exception => | ||
| logDebug("Failed to expose java.lang.ref.ReferenceQueue's queueLength field: " + e) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Similar to the comment below, add a note. |
||
| None | ||
| } | ||
| } | ||
| private val logQueueLengthInterval = 1000 | ||
|
|
||
| /** | ||
| * Whether the cleaning thread will block on cleanup tasks. | ||
| * This is set to true only for tests. | ||
| * | ||
| * Due to SPARK-3015, this is set to true by default. This is intended to be only a temporary | ||
| * workaround for the issue, which is ultimately caused by the way the BlockManager actors | ||
| * issue inter-dependent blocking Akka messages to each other at high frequencies. This happens, | ||
| * for instance, when the driver performs a GC and cleans up all broadcast blocks that are no | ||
| * longer in scope. | ||
| */ | ||
| private val blockOnCleanupTasks = sc.conf.getBoolean( | ||
| "spark.cleaner.referenceTracking.blocking", false) | ||
| "spark.cleaner.referenceTracking.blocking", true) | ||
|
|
||
| @volatile private var stopped = false | ||
|
|
||
|
|
@@ -112,6 +139,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { | |
|
|
||
| /** Keep cleaning RDD, shuffle, and broadcast state. */ | ||
| private def keepCleaning(): Unit = Utils.logUncaughtExceptions { | ||
| var iteration = 0 | ||
| while (!stopped) { | ||
| try { | ||
| val reference = Option(referenceQueue.remove(ContextCleaner.REF_QUEUE_POLL_TIMEOUT)) | ||
|
|
@@ -127,10 +155,14 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { | |
| case CleanBroadcast(broadcastId) => | ||
| doCleanupBroadcast(broadcastId, blocking = blockOnCleanupTasks) | ||
| } | ||
| if (iteration % logQueueLengthInterval == 0) { | ||
| logQueueLength() | ||
| } | ||
| } | ||
| } catch { | ||
| case e: Exception => logError("Error in cleaning thread", e) | ||
| } | ||
| iteration += 1 | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -171,12 +203,44 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging { | |
| } | ||
| } | ||
|
|
||
| /** | ||
| * Log the length of the reference queue through reflection. | ||
| * This is an expensive operation and should be called sparingly. | ||
| */ | ||
| private def logQueueLength(): Unit = { | ||
| try { | ||
| queueLengthAccessor.foreach { field => | ||
| val length = field.getLong(referenceQueue) | ||
| logDebug("Reference queue size is " + length) | ||
| if (length > queueCapacity) { | ||
| logQueueFullErrorMessage() | ||
| } | ||
| } | ||
| } catch { | ||
| case e: Exception => | ||
| logDebug("Failed to access reference queue's length through reflection: " + e) | ||
|
||
| } | ||
| } | ||
|
|
||
| /** | ||
| * Log an error message to indicate that the queue has exceeded its capacity. Do this only once. | ||
| */ | ||
| private def logQueueFullErrorMessage(): Unit = { | ||
| if (!queueFullErrorMessageLogged) { | ||
| queueFullErrorMessageLogged = true | ||
| logError(s"Reference queue size in ContextCleaner has exceeded $queueCapacity! " + | ||
|
||
| "This means the rate at which we clean up RDDs, shuffles, and/or broadcasts is too slow.") | ||
| if (blockOnCleanupTasks) { | ||
| logError("Consider setting spark.cleaner.referenceTracking.blocking to false. " + | ||
| "Note that there is a known issue (SPARK-3015) in disabling blocking, especially if " + | ||
| "the workload involves creating many RDDs in quick successions.") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private def blockManagerMaster = sc.env.blockManager.master | ||
| private def broadcastManager = sc.env.broadcastManager | ||
| private def mapOutputTrackerMaster = sc.env.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] | ||
|
|
||
| // Used for testing. These methods explicitly blocks until cleanup is completed | ||
| // to ensure that more reliable testing. | ||
| } | ||
|
|
||
| private object ContextCleaner { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The changes will not solve the problem here. See:
BlockManagerMasterActor.scala#L165