Merge pull request #7 from markhamstra/master-csd

jhartlaub · jhartlaub · commit c5884701c7d8 · 2014-02-22T17:01:04.000-08:00
SPY-287 Merging Apache 0.8.2 changes
diff --git a/assembly/pom.xml b/assembly/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-csd-3-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/bagel/pom.xml b/bagel/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-csd-3-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/core/pom.xml b/core/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-csd-3-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala b/core/src/main/scala/org/apache/spark/api/java/JavaRDDLike.scala
@@ -246,6 +246,17 @@ trait JavaRDDLike[T, This <: JavaRDDLike[T, This]] extends Serializable {
     new java.util.ArrayList(arr)
   }
 
+  /**
+   * Return, for each requested partition of this RDD, a list of all elements in that partition.
+   */
+  def collectPartitions(partitionIds: Array[Int]): Array[JList[T]] = {
+    // This is useful for implementing `take` from other language frontends
+    // like Python where the data is serialized.
+    import scala.collection.JavaConversions._
+    val res = context.runJob(rdd, (it: Iterator[T]) => it.toArray, partitionIds, true)
+    res.map(x => new java.util.ArrayList(x.toSeq)).toArray
+  }
+
   /**
    * Reduces the elements of this RDD using the specified commutative and associative binary operator.
    */
diff --git a/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala b/core/src/main/scala/org/apache/spark/broadcast/Broadcast.scala
@@ -22,7 +22,7 @@ import java.util.concurrent.atomic.AtomicLong
 
 import org.apache.spark._
 
-abstract class Broadcast[T](private[spark] val id: Long) extends Serializable {
+abstract class Broadcast[T](val id: Long) extends Serializable {
   def value: T
 
   // We cannot have an abstract readObject here due to some weird issues with
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/Master.scala b/core/src/main/scala/org/apache/spark/deploy/master/Master.scala
@@ -409,7 +409,7 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act
     // There may be one or more refs to dead workers on this same node (w/ different ID's),
     // remove them.
     workers.filter { w =>
-      (w.host == host && w.port == port) && (w.state == WorkerState.DEAD)
+      (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)
     }.foreach { w =>
       workers -= w
     }
diff --git a/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala b/core/src/main/scala/org/apache/spark/deploy/master/ZooKeeperPersistenceEngine.scala
@@ -77,7 +77,7 @@ class ZooKeeperPersistenceEngine(serialization: Serialization)
   }
 
   def deserializeFromFile[T <: Serializable](filename: String)(implicit m: Manifest[T]): T = {
-    val fileData = zk.getData("/spark/master_status/" + filename)
+    val fileData = zk.getData(WORKING_DIR + "/" + filename)
     val clazz = m.erasure.asInstanceOf[Class[T]]
     val serializer = serialization.serializerFor(clazz)
     serializer.fromBinary(fileData).asInstanceOf[T]
diff --git a/core/src/main/scala/org/apache/spark/executor/Executor.scala b/core/src/main/scala/org/apache/spark/executor/Executor.scala
@@ -142,11 +142,6 @@ private[spark] class Executor(
     val tr = runningTasks.get(taskId)
     if (tr != null) {
       tr.kill()
-      // We remove the task also in the finally block in TaskRunner.run.
-      // The reason we need to remove it here is because killTask might be called before the task
-      // is even launched, and never reaching that finally block. ConcurrentHashMap's remove is
-      // idempotent.
-      runningTasks.remove(taskId)
     }
   }
 
@@ -168,6 +163,8 @@ private[spark] class Executor(
   class TaskRunner(execBackend: ExecutorBackend, taskId: Long, serializedTask: ByteBuffer)
     extends Runnable {
 
+    object TaskKilledException extends Exception
+
     @volatile private var killed = false
     @volatile private var task: Task[Any] = _
 
@@ -201,9 +198,11 @@ private[spark] class Executor(
         // If this task has been killed before we deserialized it, let's quit now. Otherwise,
         // continue executing the task.
         if (killed) {
-          logInfo("Executor killed task " + taskId)
-          execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled))
-          return
+          // Throw an exception rather than returning, because returning within a try{} block
+          // causes a NonLocalReturnControl exception to be thrown. The NonLocalReturnControl
+          // exception will be caught by the catch block, leading to an incorrect ExceptionFailure
+          // for the task.
+          throw TaskKilledException
         }
 
         attemptedTask = Some(task)
@@ -217,9 +216,7 @@ private[spark] class Executor(
 
         // If the task has been killed, let's fail it.
         if (task.killed) {
-          logInfo("Executor killed task " + taskId)
-          execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled))
-          return
+          throw TaskKilledException
         }
 
         for (m <- task.metrics) {
@@ -257,6 +254,11 @@ private[spark] class Executor(
           execBackend.statusUpdate(taskId, TaskState.FAILED, ser.serialize(reason))
         }
 
+        case TaskKilledException => {
+          logInfo("Executor killed task " + taskId)
+          execBackend.statusUpdate(taskId, TaskState.KILLED, ser.serialize(TaskKilled))
+        }
+
         case t: Throwable => {
           val serviceTime = (System.currentTimeMillis() - taskStart).toInt
           val metrics = attemptedTask.flatMap(t => t.metrics)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/cluster/ClusterScheduler.scala
@@ -287,7 +287,8 @@ private[spark] class ClusterScheduler(val sc: SparkContext)
               }
             }
           case None =>
-            logInfo("Ignoring update from TID " + tid + " because its task set is gone")
+            logInfo("Ignoring update with state %s from TID %s because its task set is gone"
+              .format(state, tid))
         }
       } catch {
         case e: Exception => logError("Exception in statusUpdate", e)
diff --git a/core/src/main/scala/org/apache/spark/scheduler/local/LocalTaskSetManager.scala b/core/src/main/scala/org/apache/spark/scheduler/local/LocalTaskSetManager.scala
@@ -21,7 +21,8 @@ import java.nio.ByteBuffer
 import scala.collection.mutable.ArrayBuffer
 import scala.collection.mutable.HashMap
 
-import org.apache.spark.{ExceptionFailure, Logging, SparkEnv, SparkException, Success, TaskState}
+import org.apache.spark.{ExceptionFailure, Logging, SparkEnv, SparkException, Success,
+  TaskEndReason, TaskResultLost, TaskState}
 import org.apache.spark.TaskState.TaskState
 import org.apache.spark.scheduler.{DirectTaskResult, IndirectTaskResult, Pool, Schedulable, Task,
   TaskDescription, TaskInfo, TaskLocality, TaskResult, TaskSet, TaskSetManager}
@@ -144,7 +145,18 @@ private[spark] class LocalTaskSetManager(sched: LocalScheduler, val taskSet: Tas
     val result = ser.deserialize[TaskResult[_]](serializedData, getClass.getClassLoader) match {
       case directResult: DirectTaskResult[_] => directResult
       case IndirectTaskResult(blockId) => {
-        throw new SparkException("Expect only DirectTaskResults when using LocalScheduler")
+        logDebug("Fetching indirect task result for TID %s".format(tid))
+        val serializedTaskResult = env.blockManager.getRemoteBytes(blockId)
+        if (!serializedTaskResult.isDefined) {
+          /* We won't be able to get the task result if the block manager had to flush the
+           * result. */
+          taskFailed(tid, state, serializedData)
+          return
+        }
+        val deserializedResult = ser.deserialize[DirectTaskResult[_]](
+          serializedTaskResult.get)
+        env.blockManager.master.removeBlock(blockId)
+        deserializedResult
       }
     }
     result.metrics.resultSize = serializedData.limit()
@@ -164,18 +176,28 @@ private[spark] class LocalTaskSetManager(sched: LocalScheduler, val taskSet: Tas
     val task = taskSet.tasks(index)
     info.markFailed()
     decreaseRunningTasks(1)
-    val reason: ExceptionFailure = ser.deserialize[ExceptionFailure](
-      serializedData, getClass.getClassLoader)
-    sched.dagScheduler.taskEnded(task, reason, null, null, info, reason.metrics.getOrElse(null))
+    var failureReason = "unknown"
+    ser.deserialize[TaskEndReason](serializedData, getClass.getClassLoader) match {
+      case ef: ExceptionFailure =>
+        failureReason = "Exception failure: %s".format(ef.description)
+        val locs = ef.stackTrace.map(loc => "\tat %s".format(loc.toString))
+        logInfo("Task loss due to %s\n%s\n%s".format(
+          ef.className, ef.description, locs.mkString("\n")))
+        sched.dagScheduler.taskEnded(task, ef, null, null, info, ef.metrics.getOrElse(null))
+
+      case TaskResultLost =>
+        failureReason = "Lost result for TID %s".format(tid)
+        logWarning(failureReason)
+        sched.dagScheduler.taskEnded(task, TaskResultLost, null, null, info, null)
+
+      case _ => {}
+    }
     if (!finished(index)) {
       copiesRunning(index) -= 1
       numFailures(index) += 1
-      val locs = reason.stackTrace.map(loc => "\tat %s".format(loc.toString))
-      logInfo("Loss was due to %s\n%s\n%s".format(
-        reason.className, reason.description, locs.mkString("\n")))
       if (numFailures(index) > MAX_TASK_FAILURES) {
-        val errorMessage = "Task %s:%d failed more than %d times; aborting job %s".format(
-          taskSet.id, index, MAX_TASK_FAILURES, reason.description)
+        val errorMessage = ("Task %s:%d failed more than %d times; aborting job" +
+          "(most recent failure: %s").format(taskSet.id, index, MAX_TASK_FAILURES, failureReason)
         decreaseRunningTasks(runningTasks)
         sched.dagScheduler.taskSetFailed(taskSet, errorMessage)
         // need to delete failed Taskset from schedule queue
diff --git a/core/src/main/scala/org/apache/spark/util/BoundedPriorityQueue.scala b/core/src/main/scala/org/apache/spark/util/BoundedPriorityQueue.scala
@@ -34,6 +34,8 @@ class BoundedPriorityQueue[A](maxSize: Int)(implicit ord: Ordering[A])
 
   override def iterator: Iterator[A] = underlying.iterator.asScala
 
+  override def size: Int = underlying.size
+
   override def ++=(xs: TraversableOnce[A]): this.type = {
     xs.foreach { this += _ }
     this
diff --git a/core/src/test/scala/org/apache/spark/JavaAPISuite.java b/core/src/test/scala/org/apache/spark/JavaAPISuite.java
@@ -883,4 +883,37 @@ public Tuple2<Integer, Integer> call(Tuple2<Integer, Integer> in) throws Excepti
         new Tuple2<Integer, Integer>(0, 4)), rdd3.collect());
 
   }
+
+  @Test
+  public void collectPartitions() {
+    JavaRDD<Integer> rdd1 = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5, 6, 7), 3);
+    // Pair every element with (element % 2) so a pair RDD is checked alongside the plain one.
+    JavaPairRDD<Integer, Integer> rdd2 = rdd1.map(new PairFunction<Integer, Integer, Integer>() {
+      @Override
+      public Tuple2<Integer, Integer> call(Integer i) throws Exception {
+        return new Tuple2<Integer, Integer>(i, i % 2);
+      }
+    });
+    // Requesting only partition 0: index 0 of the result holds that partition's elements.
+    List[] parts = rdd1.collectPartitions(new int[] {0});
+    Assert.assertEquals(Arrays.asList(1, 2), parts[0]);
+    // Requesting partitions 1 and 2: results are expected at indices 0 and 1 respectively.
+    parts = rdd1.collectPartitions(new int[] {1, 2});
+    Assert.assertEquals(Arrays.asList(3, 4), parts[0]);
+    Assert.assertEquals(Arrays.asList(5, 6, 7), parts[1]);
+    // Same single-partition check, this time against the pair RDD.
+    Assert.assertEquals(Arrays.asList(new Tuple2<Integer, Integer>(1, 1),
+                                      new Tuple2<Integer, Integer>(2, 0)),
+                        rdd2.collectPartitions(new int[] {0})[0]);
+    // Same two-partition check, this time against the pair RDD.
+    parts = rdd2.collectPartitions(new int[] {1, 2});
+    Assert.assertEquals(Arrays.asList(new Tuple2<Integer, Integer>(3, 1),
+                                      new Tuple2<Integer, Integer>(4, 0)),
+                        parts[0]);
+    Assert.assertEquals(Arrays.asList(new Tuple2<Integer, Integer>(5, 1),
+                                      new Tuple2<Integer, Integer>(6, 0),
+                                      new Tuple2<Integer, Integer>(7, 1)),
+                        parts[1]);
+  }
+
 }
diff --git a/docs/_config.yml b/docs/_config.yml
@@ -3,8 +3,8 @@ markdown: kramdown
 
 # These allow the documentation to be updated with nerw releases
 # of Spark, Scala, and Mesos.
-SPARK_VERSION: 0.8.1-incubating
-SPARK_VERSION_SHORT: 0.8.1
+SPARK_VERSION: 0.8.2-incubating-SNAPSHOT
+SPARK_VERSION_SHORT: 0.8.2-SNAPSHOT
 SCALA_VERSION: 2.9.3
 MESOS_VERSION: 0.13.0
 SPARK_ISSUE_TRACKER_URL: https://spark-project.atlassian.net
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
@@ -565,7 +565,7 @@ def ssh(host, opts, command):
   while True:
     try:
       return subprocess.check_call(
-        "ssh -t -o StrictHostKeyChecking=no -i %s %s@%s '%s'" %
+        "ssh -t -t -o StrictHostKeyChecking=no -i %s %s@%s '%s'" %
         (opts.identity_file, opts.user, host, command), shell=True)
     except subprocess.CalledProcessError as e:
       if (tries > 2):
diff --git a/examples/pom.xml b/examples/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-csd-3-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/mllib/pom.xml b/mllib/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-csd-3-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/new-yarn/pom.xml b/new-yarn/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-incubating</version>
+    <version>0.8.2-incubating-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/pom.xml b/pom.xml
@@ -25,7 +25,7 @@
   </parent>
   <groupId>org.apache.spark</groupId>
   <artifactId>spark-parent</artifactId>
-  <version>0.8.1-csd-3-SNAPSHOT</version>
+  <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
   <packaging>pom</packaging>
   <name>Spark Project Parent POM</name>
   <url>http://spark.incubator.apache.org/</url>
diff --git a/project/SparkBuild.scala b/project/SparkBuild.scala
@@ -96,7 +96,7 @@ object SparkBuild extends Build {
 
   def sharedSettings = Defaults.defaultSettings ++ Seq(
     organization := "org.apache.spark",
-    version := "0.8.1-incubating",
+    version := "0.8.2-incubating-SNAPSHOT",
     scalaVersion := "2.9.3",
     scalacOptions := Seq("-Xmax-classfile-name", "120", "-unchecked", "-deprecation",
       "-target:" + SCALAC_JVM_VERSION),
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
@@ -569,9 +569,14 @@ def takeUpToNum(iterator):
         # Take only up to num elements from each partition we try
         mapped = self.mapPartitions(takeUpToNum)
         items = []
+        # TODO(shivaram): Similar to the scala implementation, update the take 
+        # method to scan multiple splits based on an estimate of how many elements 
+        # we have per-split.
         for partition in range(mapped._jrdd.splits().size()):
-            iterator = self.ctx._takePartition(mapped._jrdd.rdd(), partition)
-            items.extend(self._collect_iterator_through_file(iterator))
+            partitionsToTake = self.ctx._gateway.new_array(self.ctx._jvm.int, 1)
+            partitionsToTake[0] = partition
+            iterator = mapped._jrdd.collectPartitions(partitionsToTake)[0].iterator()
+            items.extend(mapped._collect_iterator_through_file(iterator))
             if len(items) >= num:
                 break
         return items[:num]
diff --git a/python/pyspark/shell.py b/python/pyspark/shell.py
@@ -35,7 +35,7 @@
       ____              __
      / __/__  ___ _____/ /__
     _\ \/ _ \/ _ `/ __/  '_/
-   /__ / .__/\_,_/_/ /_/\_\   version 0.8.1
+   /__ / .__/\_,_/_/ /_/\_\   version 0.8.2-SNAPSHOT
       /_/
 """
 print "Using Python version %s (%s, %s)" % (
diff --git a/repl/pom.xml b/repl/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-csd-3-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala b/repl/src/main/scala/org/apache/spark/repl/SparkILoop.scala
@@ -200,7 +200,7 @@ class SparkILoop(in0: Option[BufferedReader], val out: PrintWriter, val master:
       ____              __  
      / __/__  ___ _____/ /__
     _\ \/ _ \/ _ `/ __/  '_/
-   /___/ .__/\_,_/_/ /_/\_\   version 0.8.1
+   /___/ .__/\_,_/_/ /_/\_\   version 0.8.2-SNAPSHOT
       /_/                  
 """)
     import Properties._
diff --git a/streaming/pom.xml b/streaming/pom.xml
@@ -21,7 +21,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-csd-3-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/tools/pom.xml b/tools/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-csd-3-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 
diff --git a/yarn/pom.xml b/yarn/pom.xml
@@ -20,7 +20,7 @@
   <parent>
     <groupId>org.apache.spark</groupId>
     <artifactId>spark-parent</artifactId>
-    <version>0.8.1-candidate-csd-1-SNAPSHOT</version>
+    <version>0.8.2-candidate-csd-1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
 

Original file line number	Diff line number	Diff line change
`@@ -409,7 +409,7 @@ private[spark] class Master(host: String, port: Int, webUiPort: Int) extends Act`
`409`	`409`	`// There may be one or more refs to dead workers on this same node (w/ different ID's),`
`410`	`410`	`// remove them.`
`411`	`411`	`workers.filter { w =>`
`412`		`- (w.host == host && w.port == port) && (w.state == WorkerState.DEAD)`
	`412`	`+ (w.host == worker.host && w.port == worker.port) && (w.state == WorkerState.DEAD)`
`413`	`413`	`}.foreach { w =>`
`414`	`414`	`workers -= w`
`415`	`415`	`}`
Original file line number	Diff line number	Diff line change
`@@ -77,7 +77,7 @@ class ZooKeeperPersistenceEngine(serialization: Serialization)`
`77`	`77`	`}`
`78`	`78`
`79`	`79`	`def deserializeFromFile[T <: Serializable](filename: String)(implicit m: Manifest[T]): T = {`
`80`		`- val fileData = zk.getData("/spark/master_status/" + filename)`
	`80`	`+ val fileData = zk.getData(WORKING_DIR + "/" + filename)`
`81`	`81`	`val clazz = m.erasure.asInstanceOf[Class[T]]`
`82`	`82`	`val serializer = serialization.serializerFor(clazz)`
`83`	`83`	`serializer.fromBinary(fileData).asInstanceOf[T]`
Original file line number	Diff line number	Diff line change
`@@ -287,7 +287,8 @@ private[spark] class ClusterScheduler(val sc: SparkContext)`
`287`	`287`	`}`
`288`	`288`	`}`
`289`	`289`	`case None =>`
`290`		`- logInfo("Ignoring update from TID " + tid + " because its task set is gone")`
	`290`	`+ logInfo("Ignoring update with state %s from TID %s because its task set is gone"`
	`291`	`+ .format(state, tid))`
`291`	`292`	`}`
`292`	`293`	`} catch {`
`293`	`294`	`case e: Exception => logError("Exception in statusUpdate", e)`