apache
diff --git a/‎bin/compute-classpath.sh‎
Lines changed: 25 additions & 9 deletions b/‎bin/compute-classpath.sh‎
Lines changed: 25 additions & 9 deletions
diff --git a/‎bin/spark-class‎
Lines changed: 0 additions & 17 deletions b/‎bin/spark-class‎
Lines changed: 0 additions & 17 deletions
diff --git a/‎core/pom.xml‎
Lines changed: 6 additions & 0 deletions b/‎core/pom.xml‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/CacheManager.scala‎
Lines changed: 12 additions & 8 deletions b/‎core/src/main/scala/org/apache/spark/CacheManager.scala‎
Lines changed: 12 additions & 8 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/ContextCleaner.scala‎
Lines changed: 1 addition & 1 deletion b/‎core/src/main/scala/org/apache/spark/ContextCleaner.scala‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎core/src/main/scala/org/apache/spark/Dependency.scala‎
Lines changed: 9 additions & 3 deletions b/‎core/src/main/scala/org/apache/spark/Dependency.scala‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/SparkContext.scala‎
Lines changed: 15 additions & 3 deletions b/‎core/src/main/scala/org/apache/spark/SparkContext.scala‎
Lines changed: 15 additions & 3 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/SparkEnv.scala‎
Lines changed: 18 additions & 10 deletions b/‎core/src/main/scala/org/apache/spark/SparkEnv.scala‎
Lines changed: 18 additions & 10 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala‎
Lines changed: 44 additions & 0 deletions b/‎core/src/main/scala/org/apache/spark/api/java/JavaPairRDD.scala‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎core/src/main/scala/org/apache/spark/deploy/Client.scala‎
Lines changed: 1 addition & 1 deletion b/‎core/src/main/scala/org/apache/spark/deploy/Client.scala‎
Lines changed: 1 addition & 1 deletion
@@ -38,8 +38,10 @@ else
   JAR_CMD="jar"
 fi
 
-# First check if we have a dependencies jar. If so, include binary classes with the deps jar
-if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
+# A developer option to prepend more recently compiled Spark classes
+if [ -n "$SPARK_PREPEND_CLASSES" ]; then
+  echo "NOTE: SPARK_PREPEND_CLASSES is set, placing locally compiled Spark"\
+    "classes ahead of assembly." >&2
   CLASSPATH="$CLASSPATH:$FWDIR/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/repl/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/mllib/target/scala-$SCALA_VERSION/classes"
@@ -51,17 +53,31 @@ if [ -f "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar ]; then
   CLASSPATH="$CLASSPATH:$FWDIR/sql/core/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/sql/hive/target/scala-$SCALA_VERSION/classes"
   CLASSPATH="$CLASSPATH:$FWDIR/yarn/stable/target/scala-$SCALA_VERSION/classes"
+fi
 
-  ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*-deps.jar 2>/dev/null)
+# Use spark-assembly jar from either RELEASE or assembly directory
+if [ -f "$FWDIR/RELEASE" ]; then
+  assembly_folder="$FWDIR"/lib
 else
-  # Else use spark-assembly jar from either RELEASE or assembly directory
-  if [ -f "$FWDIR/RELEASE" ]; then
-    ASSEMBLY_JAR=$(ls "$FWDIR"/lib/spark-assembly*hadoop*.jar 2>/dev/null)
-  else
-    ASSEMBLY_JAR=$(ls "$ASSEMBLY_DIR"/spark-assembly*hadoop*.jar 2>/dev/null)
-  fi
+  assembly_folder="$ASSEMBLY_DIR"
 fi
 
+num_jars=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*\.jar" | wc -l)
+if [ "$num_jars" -eq "0" ]; then
+  echo "Failed to find Spark assembly in $assembly_folder"
+  echo "You need to build Spark before running this program."
+  exit 1
+fi
+if [ "$num_jars" -gt "1" ]; then
+  jars_list=$(ls "$assembly_folder" | grep "spark-assembly.*hadoop.*.jar")
+  echo "Found multiple Spark assembly jars in $assembly_folder:"
+  echo "$jars_list"
+  echo "Please remove all but one jar."
+  exit 1
+fi
+
+ASSEMBLY_JAR=$(ls "$assembly_folder"/spark-assembly*hadoop*.jar 2>/dev/null)
+
 # Verify that versions of java used to build the jars and run Spark are compatible
 jar_error_check=$("$JAR_CMD" -tf "$ASSEMBLY_JAR" nonexistent/class/path 2>&1)
 if [[ "$jar_error_check" =~ "invalid CEN header" ]]; then
 
@@ -108,23 +108,6 @@ fi
 export JAVA_OPTS
 # Attention: when changing the way the JAVA_OPTS are assembled, the change must be reflected in CommandUtils.scala!
 
-if [ ! -f "$FWDIR/RELEASE" ]; then
-  # Exit if the user hasn't compiled Spark
-  num_jars=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar" | wc -l)
-  jars_list=$(ls "$FWDIR"/assembly/target/scala-$SCALA_VERSION/ | grep "spark-assembly.*hadoop.*.jar")
-  if [ "$num_jars" -eq "0" ]; then
-    echo "Failed to find Spark assembly in $FWDIR/assembly/target/scala-$SCALA_VERSION/" >&2
-    echo "You need to build Spark before running this program." >&2
-    exit 1
-  fi
-  if [ "$num_jars" -gt "1" ]; then
-    echo "Found multiple Spark assembly jars in $FWDIR/assembly/target/scala-$SCALA_VERSION:" >&2
-    echo "$jars_list"
-    echo "Please remove all but one jar."
-    exit 1
-  fi
-fi
-
 TOOLS_DIR="$FWDIR"/tools
 SPARK_TOOLS_JAR=""
 if [ -e "$TOOLS_DIR"/target/scala-$SCALA_VERSION/*assembly*[0-9Tg].jar ]; then
 
@@ -67,6 +67,12 @@
       <groupId>org.apache.commons</groupId>
       <artifactId>commons-lang3</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-math3</artifactId>
+      <version>3.3</version>
+      <scope>test</scope>
+    </dependency>
     <dependency>
       <groupId>com.google.code.findbugs</groupId>
       <artifactId>jsr305</artifactId>
 
@@ -32,10 +32,14 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
   private val loading = new HashSet[RDDBlockId]()
 
   /** Gets or computes an RDD split. Used by RDD.iterator() when an RDD is cached. */
-  def getOrCompute[T](rdd: RDD[T], split: Partition, context: TaskContext,
+  def getOrCompute[T](
+      rdd: RDD[T],
+      split: Partition,
+      context: TaskContext,
       storageLevel: StorageLevel): Iterator[T] = {
+
     val key = RDDBlockId(rdd.id, split.index)
-    logDebug("Looking for partition " + key)
+    logDebug(s"Looking for partition $key")
     blockManager.get(key) match {
       case Some(values) =>
         // Partition is already materialized, so just return its values
@@ -45,7 +49,7 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
         // Mark the split as loading (unless someone else marks it first)
         loading.synchronized {
           if (loading.contains(key)) {
-            logInfo("Another thread is loading %s, waiting for it to finish...".format(key))
+            logInfo(s"Another thread is loading $key, waiting for it to finish...")
             while (loading.contains(key)) {
               try {
                 loading.wait()
@@ -54,7 +58,7 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
                   logWarning(s"Got an exception while waiting for another thread to load $key", e)
               }
             }
-            logInfo("Finished waiting for %s".format(key))
+            logInfo(s"Finished waiting for $key")
             /* See whether someone else has successfully loaded it. The main way this would fail
              * is for the RDD-level cache eviction policy if someone else has loaded the same RDD
              * partition but we didn't want to make space for it. However, that case is unlikely
@@ -64,7 +68,7 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
               case Some(values) =>
                 return new InterruptibleIterator(context, values.asInstanceOf[Iterator[T]])
               case None =>
-                logInfo("Whoever was loading %s failed; we'll try it ourselves".format(key))
+                logInfo(s"Whoever was loading $key failed; we'll try it ourselves")
                 loading.add(key)
             }
           } else {
@@ -73,7 +77,7 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
         }
         try {
           // If we got here, we have to load the split
-          logInfo("Partition %s not found, computing it".format(key))
+          logInfo(s"Partition $key not found, computing it")
           val computedValues = rdd.computeOrReadCheckpoint(split, context)
 
           // Persist the result, so long as the task is not running locally
@@ -97,8 +101,8 @@ private[spark] class CacheManager(blockManager: BlockManager) extends Logging {
                 case Some(values) =>
                   values.asInstanceOf[Iterator[T]]
                 case None =>
-                  logInfo("Failure to store %s".format(key))
-                  throw new Exception("Block manager failed to return persisted valued")
+                  logInfo(s"Failure to store $key")
+                  throw new SparkException("Block manager failed to return persisted value")
               }
             } else {
               // In this case the RDD is cached to an array buffer. This will save the results
 
@@ -96,7 +96,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {
   }
 
   /** Register a ShuffleDependency for cleanup when it is garbage collected. */
-  def registerShuffleForCleanup(shuffleDependency: ShuffleDependency[_, _]) {
+  def registerShuffleForCleanup(shuffleDependency: ShuffleDependency[_, _, _]) {
     registerForCleanup(shuffleDependency, CleanShuffle(shuffleDependency.shuffleId))
   }
 
 
@@ -20,6 +20,7 @@ package org.apache.spark
 import org.apache.spark.annotation.DeveloperApi
 import org.apache.spark.rdd.RDD
 import org.apache.spark.serializer.Serializer
+import org.apache.spark.shuffle.ShuffleHandle
 
 /**
  * :: DeveloperApi ::
@@ -50,19 +51,24 @@ abstract class NarrowDependency[T](rdd: RDD[T]) extends Dependency(rdd) {
  * Represents a dependency on the output of a shuffle stage.
  * @param rdd the parent RDD
  * @param partitioner partitioner used to partition the shuffle output
- * @param serializer [[org.apache.spark.serializer.Serializer Serializer]] to use. If set to null,
+ * @param serializer [[org.apache.spark.serializer.Serializer Serializer]] to use. If set to None,
  *                   the default serializer, as specified by `spark.serializer` config option, will
  *                   be used.
  */
 @DeveloperApi
-class ShuffleDependency[K, V](
+class ShuffleDependency[K, V, C](
     @transient rdd: RDD[_ <: Product2[K, V]],
     val partitioner: Partitioner,
-    val serializer: Serializer = null)
+    val serializer: Option[Serializer] = None,
+    val keyOrdering: Option[Ordering[K]] = None,
+    val aggregator: Option[Aggregator[K, V, C]] = None)
   extends Dependency(rdd.asInstanceOf[RDD[Product2[K, V]]]) {
 
   val shuffleId: Int = rdd.context.newShuffleId()
 
+  val shuffleHandle: ShuffleHandle = rdd.context.env.shuffleManager.registerShuffle(
+    shuffleId, rdd.partitions.size, this)
+
   rdd.sparkContext.cleaner.foreach(_.registerShuffleForCleanup(this))
 }
 
 
@@ -290,14 +290,17 @@ class SparkContext(config: SparkConf) extends Logging {
     value <- Option(System.getenv(envKey)).orElse(Option(System.getProperty(propKey)))} {
     executorEnvs(envKey) = value
   }
+  Option(System.getenv("SPARK_PREPEND_CLASSES")).foreach { v => 
+    executorEnvs("SPARK_PREPEND_CLASSES") = v
+  }
   // The Mesos scheduler backend relies on this environment variable to set executor memory.
   // TODO: Set this only in the Mesos scheduler.
   executorEnvs("SPARK_EXECUTOR_MEMORY") = executorMemory + "m"
   executorEnvs ++= conf.getExecutorEnv
 
   // Set SPARK_USER for user who is running SparkContext.
   val sparkUser = Option {
-    Option(System.getProperty("user.name")).getOrElse(System.getenv("SPARK_USER"))
+    Option(System.getenv("SPARK_USER")).getOrElse(System.getProperty("user.name"))
   }.getOrElse {
     SparkContext.SPARK_UNKNOWN_USER
   }
@@ -431,12 +434,21 @@ class SparkContext(config: SparkConf) extends Logging {
 
   // Methods for creating RDDs
 
-  /** Distribute a local Scala collection to form an RDD. */
+  /** Distribute a local Scala collection to form an RDD.
+   *
+   * @note Parallelize acts lazily. If `seq` is a mutable collection and is
+   * altered after the call to parallelize and before the first action on the
+   * RDD, the resultant RDD will reflect the modified collection. Pass a copy of
+   * the argument to avoid this.
+   */
   def parallelize[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
     new ParallelCollectionRDD[T](this, seq, numSlices, Map[Int, Seq[String]]())
   }
 
-  /** Distribute a local Scala collection to form an RDD. */
+  /** Distribute a local Scala collection to form an RDD.
+   *
+   * This method is identical to `parallelize`.
+   */
   def makeRDD[T: ClassTag](seq: Seq[T], numSlices: Int = defaultParallelism): RDD[T] = {
     parallelize(seq, numSlices)
   }
 
@@ -34,6 +34,7 @@ import org.apache.spark.metrics.MetricsSystem
 import org.apache.spark.network.ConnectionManager
 import org.apache.spark.scheduler.LiveListenerBus
 import org.apache.spark.serializer.Serializer
+import org.apache.spark.shuffle.ShuffleManager
 import org.apache.spark.storage._
 import org.apache.spark.util.{AkkaUtils, Utils}
 
@@ -56,7 +57,7 @@ class SparkEnv (
     val closureSerializer: Serializer,
     val cacheManager: CacheManager,
     val mapOutputTracker: MapOutputTracker,
-    val shuffleFetcher: ShuffleFetcher,
+    val shuffleManager: ShuffleManager,
     val broadcastManager: BroadcastManager,
     val blockManager: BlockManager,
     val connectionManager: ConnectionManager,
@@ -80,7 +81,7 @@ class SparkEnv (
     pythonWorkers.foreach { case(key, worker) => worker.stop() }
     httpFileServer.stop()
     mapOutputTracker.stop()
-    shuffleFetcher.stop()
+    shuffleManager.stop()
     broadcastManager.stop()
     blockManager.stop()
     blockManager.master.stop()
@@ -163,13 +164,20 @@ object SparkEnv extends Logging {
     def instantiateClass[T](propertyName: String, defaultClassName: String): T = {
       val name = conf.get(propertyName,  defaultClassName)
       val cls = Class.forName(name, true, Utils.getContextOrSparkClassLoader)
-      // First try with the constructor that takes SparkConf. If we can't find one,
-      // use a no-arg constructor instead.
+      // Look for a constructor taking a SparkConf and a boolean isDriver, then one taking just
+      // SparkConf, then one taking no arguments
       try {
-        cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[T]
+        cls.getConstructor(classOf[SparkConf], java.lang.Boolean.TYPE)
+          .newInstance(conf, new java.lang.Boolean(isDriver))
+          .asInstanceOf[T]
       } catch {
         case _: NoSuchMethodException =>
-            cls.getConstructor().newInstance().asInstanceOf[T]
+          try {
+            cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[T]
+          } catch {
+            case _: NoSuchMethodException =>
+              cls.getConstructor().newInstance().asInstanceOf[T]
+          }
       }
     }
 
@@ -219,9 +227,6 @@ object SparkEnv extends Logging {
 
     val cacheManager = new CacheManager(blockManager)
 
-    val shuffleFetcher = instantiateClass[ShuffleFetcher](
-      "spark.shuffle.fetcher", "org.apache.spark.BlockStoreShuffleFetcher")
-
     val httpFileServer = new HttpFileServer(securityManager)
     httpFileServer.initialize()
     conf.set("spark.fileserver.uri",  httpFileServer.serverUri)
@@ -242,6 +247,9 @@ object SparkEnv extends Logging {
       "."
     }
 
+    val shuffleManager = instantiateClass[ShuffleManager](
+      "spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")
+
     // Warn about deprecated spark.cache.class property
     if (conf.contains("spark.cache.class")) {
       logWarning("The spark.cache.class property is no longer being used! Specify storage " +
@@ -255,7 +263,7 @@ object SparkEnv extends Logging {
       closureSerializer,
       cacheManager,
       mapOutputTracker,
-      shuffleFetcher,
+      shuffleManager,
       broadcastManager,
       blockManager,
       connectionManager,
 
@@ -228,6 +228,50 @@ class JavaPairRDD[K, V](val rdd: RDD[(K, V)])
   : PartialResult[java.util.Map[K, BoundedDouble]] =
     rdd.countByKeyApprox(timeout, confidence).map(mapAsJavaMap)
 
+  /**
+   * Aggregate the values of each key, using given combine functions and a neutral "zero value".
+   * This function can return a different result type, U, than the type of the values in this RDD,
+   * V. Thus, we need one operation for merging a V into a U and one operation for merging two U's,
+   * as in scala.TraversableOnce. The former operation is used for merging values within a
+   * partition, and the latter is used for merging values between partitions. To avoid memory
+   * allocation, both of these functions are allowed to modify and return their first argument
+   * instead of creating a new U.
+   */
+  def aggregateByKey[U](zeroValue: U, partitioner: Partitioner, seqFunc: JFunction2[U, V, U],
+      combFunc: JFunction2[U, U, U]): JavaPairRDD[K, U] = {
+    implicit val ctag: ClassTag[U] = fakeClassTag
+    fromRDD(rdd.aggregateByKey(zeroValue, partitioner)(seqFunc, combFunc))
+  }
+
+  /**
+   * Aggregate the values of each key, using given combine functions and a neutral "zero value".
+   * This function can return a different result type, U, than the type of the values in this RDD,
+   * V. Thus, we need one operation for merging a V into a U and one operation for merging two U's,
+   * as in scala.TraversableOnce. The former operation is used for merging values within a
+   * partition, and the latter is used for merging values between partitions. To avoid memory
+   * allocation, both of these functions are allowed to modify and return their first argument
+   * instead of creating a new U.
+   */
+  def aggregateByKey[U](zeroValue: U, numPartitions: Int, seqFunc: JFunction2[U, V, U],
+      combFunc: JFunction2[U, U, U]): JavaPairRDD[K, U] = {
+    implicit val ctag: ClassTag[U] = fakeClassTag
+    fromRDD(rdd.aggregateByKey(zeroValue, numPartitions)(seqFunc, combFunc))
+  }
+
+  /**
+   * Aggregate the values of each key, using given combine functions and a neutral "zero value".
+   * This function can return a different result type, U, than the type of the values in this RDD,
+   * V. Thus, we need one operation for merging a V into a U and one operation for merging two U's.
+   * The former operation is used for merging values within a partition, and the latter is used for
+   * merging values between partitions. To avoid memory allocation, both of these functions are
+   * allowed to modify and return their first argument instead of creating a new U.
+   */
+  def aggregateByKey[U](zeroValue: U, seqFunc: JFunction2[U, V, U], combFunc: JFunction2[U, U, U]):
+      JavaPairRDD[K, U] = {
+    implicit val ctag: ClassTag[U] = fakeClassTag
+    fromRDD(rdd.aggregateByKey(zeroValue)(seqFunc, combFunc))
+  }
+
   /**
    * Merge the values for each key using an associative function and a neutral "zero value" which
    * may be added to the result an arbitrary number of times, and must not change the result
 
@@ -81,7 +81,7 @@ private class ClientActor(driverArgs: ClientArguments, conf: SparkConf) extends
 
       case "kill" =>
         val driverId = driverArgs.driverId
-        val killFuture = masterActor ! RequestKillDriver(driverId)
+        masterActor ! RequestKillDriver(driverId)
     }
   }
Original file line number	Diff line number	Diff line change
`@@ -96,7 +96,7 @@ private[spark] class ContextCleaner(sc: SparkContext) extends Logging {`
`96`	`96`	`}`
`97`	`97`
`98`	`98`	`/** Register a ShuffleDependency for cleanup when it is garbage collected. */`
`99`		`- def registerShuffleForCleanup(shuffleDependency: ShuffleDependency[_, _]) {`
	`99`	`+ def registerShuffleForCleanup(shuffleDependency: ShuffleDependency[_, _, _]) {`
`100`	`100`	`registerForCleanup(shuffleDependency, CleanShuffle(shuffleDependency.shuffleId))`
`101`	`101`	`}`
`102`	`102`
Original file line number	Diff line number	Diff line change
`@@ -81,7 +81,7 @@ private class ClientActor(driverArgs: ClientArguments, conf: SparkConf) extends`
`81`	`81`
`82`	`82`	`case "kill" =>`
`83`	`83`	`val driverId = driverArgs.driverId`
`84`		`- val killFuture = masterActor ! RequestKillDriver(driverId)`
	`84`	`+ masterActor ! RequestKillDriver(driverId)`
`85`	`85`	`}`
`86`	`86`	`}`
`87`	`87`