This repository was archived by the owner on Jan 9, 2020. It is now read-only.
Merged
Changes from 11 commits
Commits (21)
200ce24
Dispatch tasks to right executors that have tasks' input HDFS data on…
kimoonkim Apr 4, 2017
7499e3b
Fix style issues
kimoonkim Apr 5, 2017
66e79d6
Clean up unnecessary fields
kimoonkim Apr 5, 2017
46f1140
Clean up a misleading method name
kimoonkim Apr 5, 2017
23d287f
Address review comments
kimoonkim Apr 5, 2017
a026cc1
Sync and resolve conflict
kimoonkim Apr 5, 2017
f56f3f9
Fix import ordering
kimoonkim Apr 6, 2017
177e1eb
Delete executor pods in watcher
kimoonkim Apr 7, 2017
a94522a
Fix the driver hang by unblocking the main thread
kimoonkim Apr 7, 2017
4a7738e
Fix import order
kimoonkim Apr 7, 2017
2aa7c6a
Merge branch 'branch-2.1-kubernetes' into dispatch-tasks-by-hdfs-node…
ash211 Apr 19, 2017
a772e7f
Merge branch 'branch-2.1-kubernetes' into dispatch-tasks-by-hdfs-node…
kimoonkim Apr 21, 2017
6b1e4b4
Merge remote-tracking branch 'origin/dispatch-tasks-by-hdfs-node-loca…
kimoonkim Apr 22, 2017
fef7ebc
Clear runningExecutorPods
kimoonkim Apr 22, 2017
e07b084
Merge branch 'branch-2.1-kubernetes' into dispatch-tasks-by-hdfs-node…
ash211 Apr 25, 2017
b3855d6
Fix incorrect merge
ash211 Apr 26, 2017
7085995
Address review comments
kimoonkim Apr 26, 2017
ee958b3
Merge remote-tracking branch 'origin/dispatch-tasks-by-hdfs-node-loca…
kimoonkim Apr 26, 2017
dc0755a
Clean up imports
kimoonkim Apr 26, 2017
4ce3066
Merge branch 'branch-2.1-kubernetes' into dispatch-tasks-by-hdfs-node…
ash211 May 10, 2017
80decdc
Merge branch 'branch-2.1-kubernetes' into dispatch-tasks-by-hdfs-node…
ash211 May 10, 2017
@@ -221,7 +221,7 @@ private[spark] class TaskSetManager(
* Return the pending tasks list for a given host, or an empty list if
* there is no map entry for that host
*/
private def getPendingTasksForHost(host: String): ArrayBuffer[Int] = {
protected def getPendingTasksForHost(host: String): ArrayBuffer[Int] = {
pendingTasksForHost.getOrElse(host, ArrayBuffer())
}

@@ -17,10 +17,13 @@
package org.apache.spark.deploy.kubernetes

import java.io.File
import java.util.concurrent.{ThreadFactory, ThreadPoolExecutor}

import com.google.common.base.Charsets
import com.google.common.io.Files
import io.fabric8.kubernetes.client.{Config, ConfigBuilder, DefaultKubernetesClient}
import io.fabric8.kubernetes.client.utils.HttpClientUtils
import okhttp3.Dispatcher

import org.apache.spark.SparkConf
import org.apache.spark.deploy.kubernetes.config._
@@ -78,6 +81,25 @@ private[spark] class KubernetesClientBuilder(sparkConf: SparkConf, namespace: St
}
serviceAccountConfigBuilder
}
new DefaultKubernetesClient(configBuilder.build)
val threadPoolExecutor = new Dispatcher().executorService().asInstanceOf[ThreadPoolExecutor]
Member Author

@foxish Can you take a look at this part of the code? I don't like that we need this much of a workaround to avoid hanging. I wonder if we should change the upstream k8s client or okhttp code to make this easier. If so, I'm willing to send them a PR.

It does seem like an issue that you have to do this much work to get exceptions in the main thread to terminate the JVM. Let's figure it out in the upstream library, as that seems to be where the problem is.

// Set threads to be daemons in order to allow the driver main thread
// to shut down upon errors. Otherwise the driver will hang indefinitely.
threadPoolExecutor.setThreadFactory(new ThreadFactory {
override def newThread(r: Runnable): Thread = {
val thread = new Thread(r, "spark-on-k8s")
thread.setDaemon(true)
thread
}
})
// Disable the ping thread that is not daemon, in order to allow
// the driver main thread to shut down upon errors. Otherwise, the driver
// will hang indefinitely.
val config = configBuilder
.withWebsocketPingInterval(0)
Definitely seems like kubernetes-client should run this as a daemon thread (meaning it doesn't prevent JVM exit, per http://stackoverflow.com/questions/2213340/what-is-daemon-thread-in-java ). I don't think pinging the websocket is worth keeping the JVM running for.

Check out https://github.com/fabric8io/kubernetes-client/blob/2f3e6de212f848774775bac6108b2f4d57c41f2b/kubernetes-client/src/main/java/io/fabric8/kubernetes/client/dsl/internal/ExecWebSocketListener.java#L55 and how Spark has a ThreadUtils.newDaemonSingleThreadExecutor(). Possibly that use of the Executor (or another one) is the culprit here.

@kimoonkim can you please provide the full stacktrace of the thread that hangs the JVM shutdown?
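
For reference, a daemon-threaded executor along the lines of the ThreadUtils helper mentioned above might look like the following minimal sketch (illustrative only, not Spark's actual ThreadUtils implementation):

import java.util.concurrent.{ExecutorService, Executors, ThreadFactory}

// Sketch: an executor whose threads are daemons, so an idle pool never keeps
// the JVM alive after the main thread exits.
def newDaemonSingleThreadExecutor(threadName: String): ExecutorService = {
  val factory = new ThreadFactory {
    override def newThread(r: Runnable): Thread = {
      val thread = new Thread(r, threadName)
      thread.setDaemon(true) // daemon threads do not block JVM exit
      thread
    }
  }
  Executors.newSingleThreadExecutor(factory)
}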

Member Author

@ash211 @foxish
I looked at the full stack traces, but they don't tell you much about who submitted the threads, because submission takes place in other threads.

For the web socket ping interval, I believe this is the relevant source code. From RealWebSocket.java:

  public void initReaderAndWriter(
      String name, long pingIntervalMillis, Streams streams) throws IOException {
    synchronized (this) {
      this.streams = streams;
      this.writer = new WebSocketWriter(streams.client, streams.sink, random);
      this.executor = new ScheduledThreadPoolExecutor(1, Util.threadFactory(name, false));
      if (pingIntervalMillis != 0) {
        executor.scheduleAtFixedRate(
            new PingRunnable(), pingIntervalMillis, pingIntervalMillis, MILLISECONDS);
      }
      if (!messageAndCloseQueue.isEmpty()) {
        runWriter(); // Send messages that were enqueued before we were connected.
      }
    }

Notice it is calling Util.threadFactory(name, false). The second parameter, false, makes the new threads non-daemon, so passing true would fix the issue.

For the other one, I think this is the source code. From Dispatcher.java:

  public synchronized ExecutorService executorService() {
    if (executorService == null) {
      executorService = new ThreadPoolExecutor(0, Integer.MAX_VALUE, 60, TimeUnit.SECONDS,
          new SynchronousQueue<Runnable>(), Util.threadFactory("OkHttp Dispatcher", false));
    }
    return executorService;
  }

Again, passing true to Util.threadFactory will fix the issue.

Unfortunately, both are inside the okhttp project, outside the k8s client code. Interestingly, the class comment of the main OkHttpClient class claims it uses daemon threads, so the two lines above could simply be bugs:

OkHttp also uses daemon threads for HTTP/2 connections. These will exit automatically if they remain idle.

Great investigation! With that work I think we have enough information to know what to change in okhttp upstream for it to correctly adhere to its stated contract on daemon threads.

Do you feel ready to open an issue with accompanying PR on okhttp?

One can also set the Dispatcher instance on okhttp clients, such that it uses a daemon thread pool. See this in the context of Retrofit: https://github.com/apache-spark-on-k8s/spark/pull/227/files#diff-716e31eb38cfd793ba5b2eea49cf5487R43

But OkHttp should certainly be doing this by default.
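
For illustration, wiring a daemon-backed Dispatcher into an OkHttpClient could look roughly like this sketch (a minimal example; the thread name and pool sizes are arbitrary, not taken from this PR):

import java.util.concurrent.{SynchronousQueue, ThreadFactory, ThreadPoolExecutor, TimeUnit}

import okhttp3.{Dispatcher, OkHttpClient}

// Sketch: back the okhttp Dispatcher with daemon threads so queued HTTP work
// never keeps the driver JVM alive.
val daemonFactory = new ThreadFactory {
  override def newThread(r: Runnable): Thread = {
    val thread = new Thread(r, "okhttp-dispatcher")
    thread.setDaemon(true)
    thread
  }
}
val daemonExecutor = new ThreadPoolExecutor(
  0, Int.MaxValue, 60L, TimeUnit.SECONDS,
  new SynchronousQueue[Runnable](), daemonFactory)
val client = new OkHttpClient.Builder()
  .dispatcher(new Dispatcher(daemonExecutor))
  .build()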

Member Author

Sure. Let's try to open an issue/PR on okhttp. I'll work on that.

Member Author

@mccheah Right. We can change the k8s client to specify a custom dispatcher, or a custom executor service as this code does. We should keep that as an option in case fixing that part of okhttp turns out to be harder than we imagined. Thanks!

@kimoonkim did you end up opening an issue in okhttp upstream? I couldn't find one when I looked at https://github.com/square/okhttp

square/okhttp#1890 seems like the most relevant issue.

I think the custom dispatcher fixes this anyway though

Member Author

Not yet. I got carried away with HDFS experiments. I'll open an issue today or tomorrow. Thanks for the pointer.

Member Author

Hmm, the response in square/okhttp#1890 says that the dispatcher shouldn't be a daemon. I'm going to file an issue for just the ping thread first.

.build()
val httpClient = HttpClientUtils.createHttpClient(config).newBuilder()
.dispatcher(new Dispatcher(threadPoolExecutor))
.build()
new DefaultKubernetesClient(httpClient, config)
}
}
@@ -16,15 +16,55 @@
*/
package org.apache.spark.scheduler.cluster.kubernetes

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend, TaskScheduler, TaskSchedulerImpl}
import org.apache.spark.scheduler.{ExternalClusterManager, SchedulerBackend,
TaskScheduler, TaskSchedulerImpl, TaskSet, TaskSetManager}

private[spark] class KubernetesClusterManager extends ExternalClusterManager {

override def canCreate(masterURL: String): Boolean = masterURL.startsWith("k8s")

override def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler = {
val scheduler = new TaskSchedulerImpl(sc)
val scheduler = new TaskSchedulerImpl(sc) {
Would much prefer this to be a separate class file, e.g. KubernetesTaskSchedulerImpl.

Member Author

Done.
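
As a rough sketch of the suggested split (illustrative only; the class names and constructor shapes here are assumptions, not the code that eventually landed):

package org.apache.spark.scheduler.cluster.kubernetes

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{TaskSchedulerImpl, TaskSet, TaskSetManager}

// Sketch: move the scheduler specialization into its own class file so the
// cluster manager only wires things together.
private[spark] class KubernetesTaskSchedulerImpl(sc: SparkContext) extends TaskSchedulerImpl(sc) {

  override def createTaskSetManager(taskSet: TaskSet, maxTaskFailures: Int): TaskSetManager = {
    // A hypothetical KubernetesTaskSetManager would hold the HDFS node-locality
    // lookup shown in the diff above.
    new KubernetesTaskSetManager(this, taskSet, maxTaskFailures)
  }
}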


override def createTaskSetManager(taskSet: TaskSet, maxTaskFailures: Int): TaskSetManager = {
new TaskSetManager(sched = this, taskSet, maxTaskFailures) {

/**
* Overrides the lookup to use not only the executor pod IP, but also the cluster node
* name and host IP address that the pod is running on. The base class may have populated
* the lookup target map with HDFS datanode locations if this task set reads HDFS data.
* Those datanode locations are based on cluster node names or host IP addresses. Using
* only executor pod IPs may not match them.
*/
override def getPendingTasksForHost(executorIP: String): ArrayBuffer[Int] = {
var pendingTasks = super.getPendingTasksForHost(executorIP)
See if we can avoid using var here. Having a separate val for each kind of pendingTasks that we're checking and validating would make it clearer how each pendingTasks set was obtained. E.g.

val pendingTasksExecutorIP = super.getPendingTasksForHost(executorIP)
...
val pendingTasksClusterNodeName = super.getPendingTasksForHost(clusterNodeName)

Member Author

Done.

if (pendingTasks.nonEmpty) {
Avoid using return in Scala (see below).

Member Author

Done.

return pendingTasks
}
val backend = sc.taskScheduler.asInstanceOf[TaskSchedulerImpl].backend.asInstanceOf[
Again, put this in a separate file. TaskSetManager also has a TaskSchedulerImpl reference available to it; there is no need to extract it from the SparkContext.

KubernetesClusterSchedulerBackend]
val pod = backend.getExecutorPodByIP(executorIP)
if (pod.isEmpty) {
return pendingTasks // Empty
@mccheah mccheah Apr 26, 2017

Avoid using return in Scala. Instead:

if (pod.isEmpty) {
  pendingTasks // Since this is the last evaluated statement it also becomes the return value
} else {
  // do other things
}

Member Author

Done.
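
Putting the two style suggestions together (vals instead of a var, and no early return), the lookup could read roughly like the sketch below. Here backend is assumed to be a reference to the KubernetesClusterSchedulerBackend, obtained as in the surrounding code:

// Sketch only: chain the lookups with vals and let the last expression be the
// result, instead of mutating a var and returning early.
override def getPendingTasksForHost(executorIP: String): ArrayBuffer[Int] = {
  val pendingTasksExecutorIP = super.getPendingTasksForHost(executorIP)
  if (pendingTasksExecutorIP.nonEmpty) {
    pendingTasksExecutorIP
  } else {
    backend.getExecutorPodByIP(executorIP) match {
      case Some(pod) =>
        val byNodeName = super.getPendingTasksForHost(pod.getSpec.getNodeName)
        if (byNodeName.nonEmpty) byNodeName
        else super.getPendingTasksForHost(pod.getStatus.getHostIP)
      case None =>
        pendingTasksExecutorIP // still empty
    }
  }
}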

}
val clusterNodeName = pod.get.getSpec.getNodeName
val clusterNodeIP = pod.get.getStatus.getHostIP
pendingTasks = super.getPendingTasksForHost(clusterNodeName)
if (pendingTasks.isEmpty) {
pendingTasks = super.getPendingTasksForHost(clusterNodeIP)
}
if (pendingTasks.nonEmpty && log.isDebugEnabled) {
No need to check whether debug logging is enabled; simply invoking logDebug will log or not based on the configuration anyway.

Member Author

Done.
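
For context on why the guard is redundant: Spark's Logging trait takes the message by name, so the interpolated string is only built when debug logging is actually enabled. Roughly (paraphrased, not verbatim Spark code):

// Shape of org.apache.spark.internal.Logging#logDebug:
protected def logDebug(msg: => String): Unit = {
  if (log.isDebugEnabled) log.debug(msg) // msg is evaluated lazily, only here
}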

logDebug(s"Got preferred task list $pendingTasks for executor host $executorIP " +
s"using cluster node $clusterNodeName at $clusterNodeIP")
}
pendingTasks
}
}
}
}
sc.taskScheduler = scheduler
scheduler
}
@@ -37,6 +77,5 @@ private[spark] class KubernetesClusterManager extends ExternalClusterManager {
override def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
scheduler.asInstanceOf[TaskSchedulerImpl].initialize(backend)
}

}

@@ -16,13 +16,18 @@
*/
package org.apache.spark.scheduler.cluster.kubernetes

import java.util.concurrent.atomic.{AtomicInteger, AtomicLong}
import java.io.Closeable
import java.util.concurrent.atomic.{AtomicInteger, AtomicLong, AtomicReference}

import io.fabric8.kubernetes.api.model.{ContainerPortBuilder, EnvVarBuilder,
EnvVarSourceBuilder, Pod, QuantityBuilder}
import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.concurrent.{ExecutionContext, Future}

import io.fabric8.kubernetes.api.model.{ContainerPortBuilder, EnvVarBuilder,
EnvVarSourceBuilder, Pod, QuantityBuilder}
import io.fabric8.kubernetes.client.{KubernetesClientException, Watcher}
import io.fabric8.kubernetes.client.Watcher.Action

import org.apache.spark.{SparkContext, SparkException}
import org.apache.spark.deploy.kubernetes.KubernetesClientBuilder
import org.apache.spark.deploy.kubernetes.config._
@@ -39,8 +44,11 @@ private[spark] class KubernetesClusterSchedulerBackend(

import KubernetesClusterSchedulerBackend._

private val EXECUTOR_MODIFICATION_LOCK = new Object
private val runningExecutorPods = new scala.collection.mutable.HashMap[String, Pod]
private val RUNNING_EXECUTOR_PODS_LOCK = new Object
private val runningExecutorPods = new mutable.HashMap[String, Pod] // Indexed by executor IDs.

private val EXECUTOR_PODS_BY_IPS_LOCK = new Object
private val executorPodsByIPs = new mutable.HashMap[String, Pod] // Indexed by executor IP addrs.

private val executorDockerImage = conf.get(EXECUTOR_DOCKER_IMAGE)
private val kubernetesNamespace = conf.get(KUBERNETES_NAMESPACE)
@@ -93,6 +101,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
super.minRegisteredRatio
}

private val executorWatchResource = new AtomicReference[Closeable]
protected var totalExpectedExecutors = new AtomicInteger(0)

private val driverUrl = RpcEndpointAddress(
@@ -125,6 +134,8 @@ private[spark] class KubernetesClusterSchedulerBackend(

override def start(): Unit = {
super.start()
executorWatchResource.set(kubernetesClient.pods().withLabel(SPARK_APP_ID_LABEL, applicationId())
.watch(new ExecutorPodsWatcher()))
if (!Utils.isDynamicAllocationEnabled(sc.conf)) {
doRequestTotalExecutors(initialExecutors)
}
@@ -139,7 +150,16 @@ private[spark] class KubernetesClusterSchedulerBackend(
// When using Utils.tryLogNonFatalError some of the code fails but without any logs or
// indication as to why.
try {
runningExecutorPods.values.foreach(kubernetesClient.pods().delete(_))
RUNNING_EXECUTOR_PODS_LOCK.synchronized {
runningExecutorPods.values.foreach(kubernetesClient.pods().delete(_))
do you need a runningExecutorPods.clear() here to match the executorPodsByIPs.clear() below?

Member Author

Fixed in the latest diff.

}
EXECUTOR_PODS_BY_IPS_LOCK.synchronized {
executorPodsByIPs.clear()
}
val resource = executorWatchResource.getAndSet(null)
if (resource != null) {
resource.close()
}
} catch {
case e: Throwable => logError("Uncaught exception while shutting down controllers.", e)
}
@@ -149,6 +169,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
case e: Throwable => logError("Uncaught exception while shutting down driver service.", e)
}
try {
logInfo("Closing kubernetes client")
kubernetesClient.close()
} catch {
case e: Throwable => logError("Uncaught exception closing Kubernetes client.", e)
@@ -242,7 +263,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
}

override def doRequestTotalExecutors(requestedTotal: Int): Future[Boolean] = Future[Boolean] {
EXECUTOR_MODIFICATION_LOCK.synchronized {
RUNNING_EXECUTOR_PODS_LOCK.synchronized {
if (requestedTotal > totalExpectedExecutors.get) {
logInfo(s"Requesting ${requestedTotal - totalExpectedExecutors.get}"
+ s" additional executors, expecting total $requestedTotal and currently" +
@@ -257,7 +278,7 @@ private[spark] class KubernetesClusterSchedulerBackend(
}

override def doKillExecutors(executorIds: Seq[String]): Future[Boolean] = Future[Boolean] {
EXECUTOR_MODIFICATION_LOCK.synchronized {
RUNNING_EXECUTOR_PODS_LOCK.synchronized {
for (executor <- executorIds) {
runningExecutorPods.remove(executor) match {
case Some(pod) => kubernetesClient.pods().delete(pod)
Push the kubernetesClient.pods().delete(pod) onto a new line too, so it lines up with the others in this block.

Member Author

Done.

@@ -267,6 +288,39 @@ private[spark] class KubernetesClusterSchedulerBackend(
}
true
}

def getExecutorPodByIP(podIP: String): Option[Pod] = {
EXECUTOR_PODS_BY_IPS_LOCK.synchronized {
executorPodsByIPs.get(podIP)
}
}

private class ExecutorPodsWatcher extends Watcher[Pod] {

override def eventReceived(action: Action, pod: Pod): Unit = {
if (action == Action.MODIFIED && pod.getStatus.getPhase == "Running"
&& pod.getMetadata.getDeletionTimestamp == null) {
val podName = pod.getMetadata.getName
val podIP = pod.getStatus.getPodIP
val clusterNodeName = pod.getSpec.getNodeName
logDebug(s"Executor pod $pod ready, launched at $clusterNodeName as IP $podIP.")
EXECUTOR_PODS_BY_IPS_LOCK.synchronized {
executorPodsByIPs += ((podIP, pod))
}
} else if (action == Action.MODIFIED && pod.getMetadata.getDeletionTimestamp != null) {
I believe Action.DELETED and Action.ERROR should also be handled somewhere.

Member Author

Done.
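
A minimal sketch of an extra branch for those actions, written in the style of the existing if/else chain (illustrative only, not necessarily the code that was eventually committed):

// Sketch: also drop the pod from the IP lookup map on DELETED and ERROR events,
// alongside the MODIFIED-with-deletion-timestamp case handled below.
else if (action == Action.DELETED || action == Action.ERROR) {
  val podIP = pod.getStatus.getPodIP
  logDebug(s"Executor pod ${pod.getMetadata.getName} at IP $podIP was deleted or errored.")
  EXECUTOR_PODS_BY_IPS_LOCK.synchronized {
    executorPodsByIPs -= podIP
  }
}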

val podName = pod.getMetadata.getName
val podIP = pod.getStatus.getPodIP
logDebug(s"Executor pod $podName at IP $podIP was deleted.")
EXECUTOR_PODS_BY_IPS_LOCK.synchronized {
executorPodsByIPs -= podIP
}
I think we also need to remove from runningExecutorPods here; we currently handle intentional shutdown of executors (e.g. running out of tasks) via doKillExecutors, but I don't think we handle unintentional shutdown anywhere.

This should probably be the place to prune dead executors out of runningExecutorPods.

Member Author

I feel this can be better handled by @varunkatta in the upcoming recovery-handling code. His PR would also use the same watcher, and he has a lot more context on how the different pieces work together for dead executors.

}
}

override def onClose(cause: KubernetesClientException): Unit = {
logDebug("Executor pod watch closed.", cause)
}
}
}

private object KubernetesClusterSchedulerBackend {