
Commit da857a3

Merge remote-tracking branch 'apache/master' into SPARK-11672.2

2 parents: 62b09a3 + bc09296

20 files changed: 150 additions & 164 deletions


R/pkg/inst/tests/test_sparkSQL.R

Lines changed: 2 additions & 2 deletions
@@ -1007,7 +1007,7 @@ test_that("group by, agg functions", {
   df3 <- agg(gd, age = "stddev")
   expect_is(df3, "DataFrame")
   df3_local <- collect(df3)
-  expect_equal(0, df3_local[df3_local$name == "Andy",][1, 2])
+  expect_true(is.nan(df3_local[df3_local$name == "Andy",][1, 2]))

   df4 <- agg(gd, sumAge = sum(df$age))
   expect_is(df4, "DataFrame")
@@ -1038,7 +1038,7 @@ test_that("group by, agg functions", {
   df7 <- agg(gd2, value = "stddev")
   df7_local <- collect(df7)
   expect_true(abs(df7_local[df7_local$name == "ID1",][1, 2] - 6.928203) < 1e-6)
-  expect_equal(0, df7_local[df7_local$name == "ID2",][1, 2])
+  expect_true(is.nan(df7_local[df7_local$name == "ID2",][1, 2]))

   mockLines3 <- c("{\"name\":\"Andy\", \"age\":30}",
                   "{\"name\":\"Andy\", \"age\":30}",

core/src/main/scala/org/apache/spark/launcher/LauncherBackend.scala

Lines changed: 10 additions & 2 deletions
@@ -21,7 +21,7 @@ import java.net.{InetAddress, Socket}

 import org.apache.spark.SPARK_VERSION
 import org.apache.spark.launcher.LauncherProtocol._
-import org.apache.spark.util.ThreadUtils
+import org.apache.spark.util.{ThreadUtils, Utils}

 /**
  * A class that can be used to talk to a launcher server. Users should extend this class to
@@ -88,12 +88,20 @@ private[spark] abstract class LauncherBackend {
    */
   protected def onDisconnected() : Unit = { }

+  private def fireStopRequest(): Unit = {
+    val thread = LauncherBackend.threadFactory.newThread(new Runnable() {
+      override def run(): Unit = Utils.tryLogNonFatalError {
+        onStopRequest()
+      }
+    })
+    thread.start()
+  }

   private class BackendConnection(s: Socket) extends LauncherConnection(s) {

     override protected def handle(m: Message): Unit = m match {
       case _: Stop =>
-        onStopRequest()
+        fireStopRequest()

       case _ =>
         throw new IllegalArgumentException(s"Unexpected message type: ${m.getClass().getName()}")
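
The motivation for `fireStopRequest` is that `onStopRequest()` is implemented by subclasses and may block or throw, so it should not run on the connection's message-handling thread. A rough standalone sketch of that pattern under those assumptions (`handleStop` and the callback are made-up names for illustration):

```scala
import java.util.concurrent.{Executors, ThreadFactory}

// Dispatch a possibly slow or failing callback onto its own thread so the caller
// (here standing in for the socket handler that received the Stop message) never blocks.
val threadFactory: ThreadFactory = Executors.defaultThreadFactory()

def handleStop(onStopRequest: () => Unit): Unit = {
  val worker = threadFactory.newThread(new Runnable {
    override def run(): Unit =
      try onStopRequest()
      catch { case e: Exception => Console.err.println(s"stop handler failed: $e") }
  })
  worker.start()
}

// The callback can sleep or throw without affecting the caller.
handleStop(() => Thread.sleep(1000))
```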

core/src/main/scala/org/apache/spark/scheduler/cluster/SparkDeploySchedulerBackend.scala

Lines changed: 11 additions & 9 deletions
@@ -191,17 +191,19 @@ private[spark] class SparkDeploySchedulerBackend(
   }

   private def stop(finalState: SparkAppHandle.State): Unit = synchronized {
-    stopping = true
+    try {
+      stopping = true

-    launcherBackend.setState(finalState)
-    launcherBackend.close()
+      super.stop()
+      client.stop()

-    super.stop()
-    client.stop()
-
-    val callback = shutdownCallback
-    if (callback != null) {
-      callback(this)
+      val callback = shutdownCallback
+      if (callback != null) {
+        callback(this)
+      }
+    } finally {
+      launcherBackend.setState(finalState)
+      launcherBackend.close()
     }
   }
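
The restructuring guarantees that the launcher is told the final state and the connection is closed even if `super.stop()`, `client.stop()`, or the shutdown callback throws. A toy illustration of the `finally` guarantee (all names here are hypothetical):

```scala
// Even though shutDownCluster() throws, reportFinalState() still runs, which is
// what moving launcherBackend.setState/close into the finally block buys above.
def shutDownCluster(): Unit = throw new RuntimeException("client.stop() failed")
def reportFinalState(state: String): Unit = println(s"launcher notified: $state")

try {
  try {
    shutDownCluster()
  } finally {
    reportFinalState("KILLED")
  }
} catch {
  case e: RuntimeException => println(s"original failure still propagates: ${e.getMessage}")
}
```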

core/src/main/scala/org/apache/spark/ui/jobs/StagePage.scala

Lines changed: 20 additions & 1 deletion
@@ -28,7 +28,7 @@ import org.apache.commons.lang3.StringEscapeUtils

 import org.apache.spark.{InternalAccumulator, SparkConf}
 import org.apache.spark.executor.TaskMetrics
-import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo}
+import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo, TaskLocality}
 import org.apache.spark.ui._
 import org.apache.spark.ui.jobs.UIData._
 import org.apache.spark.util.{Utils, Distribution}
@@ -70,6 +70,21 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {

   private val displayPeakExecutionMemory = parent.conf.getBoolean("spark.sql.unsafe.enabled", true)

+  private def getLocalitySummaryString(stageData: StageUIData): String = {
+    val localities = stageData.taskData.values.map(_.taskInfo.taskLocality)
+    val localityCounts = localities.groupBy(identity).mapValues(_.size)
+    val localityNamesAndCounts = localityCounts.toSeq.map { case (locality, count) =>
+      val localityName = locality match {
+        case TaskLocality.PROCESS_LOCAL => "Process local"
+        case TaskLocality.NODE_LOCAL => "Node local"
+        case TaskLocality.RACK_LOCAL => "Rack local"
+        case TaskLocality.ANY => "Any"
+      }
+      s"$localityName: $count"
+    }
+    localityNamesAndCounts.sorted.mkString("; ")
+  }
+
   def render(request: HttpServletRequest): Seq[Node] = {
     progressListener.synchronized {
       val parameterId = request.getParameter("id")
@@ -129,6 +144,10 @@ private[ui] class StagePage(parent: StagesTab) extends WebUIPage("stage") {
             <strong>Total Time Across All Tasks: </strong>
             {UIUtils.formatDuration(stageData.executorRunTime)}
           </li>
+          <li>
+            <strong>Locality Level Summary: </strong>
+            {getLocalitySummaryString(stageData)}
+          </li>
           {if (stageData.hasInput) {
             <li>
               <strong>Input Size / Records: </strong>

docs/job-scheduling.md

Lines changed: 27 additions & 28 deletions
@@ -56,36 +56,32 @@ provide another approach to share RDDs.

 ## Dynamic Resource Allocation

-Spark 1.2 introduces the ability to dynamically scale the set of cluster resources allocated to
-your application up and down based on the workload. This means that your application may give
-resources back to the cluster if they are no longer used and request them again later when there
-is demand. This feature is particularly useful if multiple applications share resources in your
-Spark cluster. If a subset of the resources allocated to an application becomes idle, it can be
-returned to the cluster's pool of resources and acquired by other applications. In Spark, dynamic
-resource allocation is performed on the granularity of the executor and can be enabled through
-`spark.dynamicAllocation.enabled`.
-
-This feature is currently disabled by default and available only on [YARN](running-on-yarn.html).
-A future release will extend this to [standalone mode](spark-standalone.html) and
-[Mesos coarse-grained mode](running-on-mesos.html#mesos-run-modes). Note that although Spark on
-Mesos already has a similar notion of dynamic resource sharing in fine-grained mode, enabling
-dynamic allocation allows your Mesos application to take advantage of coarse-grained low-latency
-scheduling while sharing cluster resources efficiently.
+Spark provides a mechanism to dynamically adjust the resources your application occupies based
+on the workload. This means that your application may give resources back to the cluster if they
+are no longer used and request them again later when there is demand. This feature is particularly
+useful if multiple applications share resources in your Spark cluster.
+
+This feature is disabled by default and available on all coarse-grained cluster managers, i.e.
+[standalone mode](spark-standalone.html), [YARN mode](running-on-yarn.html), and
+[Mesos coarse-grained mode](running-on-mesos.html#mesos-run-modes).

 ### Configuration and Setup

-All configurations used by this feature live under the `spark.dynamicAllocation.*` namespace.
-To enable this feature, your application must set `spark.dynamicAllocation.enabled` to `true`.
-Other relevant configurations are described on the
-[configurations page](configuration.html#dynamic-allocation) and in the subsequent sections in
-detail.
+There are two requirements for using this feature. First, your application must set
+`spark.dynamicAllocation.enabled` to `true`. Second, you must set up an *external shuffle service*
+on each worker node in the same cluster and set `spark.shuffle.service.enabled` to true in your
+application. The purpose of the external shuffle service is to allow executors to be removed
+without deleting shuffle files written by them (more detail described
+[below](job-scheduling.html#graceful-decommission-of-executors)). The way to set up this service
+varies across cluster managers:
+
+In standalone mode, simply start your workers with `spark.shuffle.service.enabled` set to `true`.

-Additionally, your application must use an external shuffle service. The purpose of the service is
-to preserve the shuffle files written by executors so the executors can be safely removed (more
-detail described [below](job-scheduling.html#graceful-decommission-of-executors)). To enable
-this service, set `spark.shuffle.service.enabled` to `true`. In YARN, this external shuffle service
-is implemented in `org.apache.spark.yarn.network.YarnShuffleService` that runs in each `NodeManager`
-in your cluster. To start this service, follow these steps:
+In Mesos coarse-grained mode, run `$SPARK_HOME/sbin/start-mesos-shuffle-service.sh` on all
+slave nodes with `spark.shuffle.service.enabled` set to `true`. For instance, you may do so
+through Marathon.
+
+In YARN mode, start the shuffle service on each `NodeManager` as follows:

 1. Build Spark with the [YARN profile](building-spark.html). Skip this step if you are using a
 pre-packaged distribution.
@@ -95,10 +91,13 @@ pre-packaged distribution.
 2. Add this jar to the classpath of all `NodeManager`s in your cluster.
 3. In the `yarn-site.xml` on each node, add `spark_shuffle` to `yarn.nodemanager.aux-services`,
 then set `yarn.nodemanager.aux-services.spark_shuffle.class` to
-`org.apache.spark.network.yarn.YarnShuffleService`. Additionally, set all relevant
-`spark.shuffle.service.*` [configurations](configuration.html).
+`org.apache.spark.network.yarn.YarnShuffleService` and `spark.shuffle.service.enabled` to true.
 4. Restart all `NodeManager`s in your cluster.

+All other relevant configurations are optional and under the `spark.dynamicAllocation.*` and
+`spark.shuffle.service.*` namespaces. For more detail, see the
+[configurations page](configuration.html#dynamic-allocation).
+
 ### Resource Allocation Policy

 At a high level, Spark should relinquish executors when they are no longer used and acquire
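
To make the two requirements from the updated docs concrete, here is one way an application might enable dynamic allocation against an external shuffle service. Only the two `enabled` flags come from the text above; the app name and the min/max executor values are illustrative:

```scala
import org.apache.spark.{SparkConf, SparkContext}

// Required: enable dynamic allocation and point executors at the external shuffle service.
// Optional: a couple of spark.dynamicAllocation.* knobs, shown purely as example values.
val conf = new SparkConf()
  .setAppName("dynamic-allocation-example")
  .set("spark.dynamicAllocation.enabled", "true")
  .set("spark.shuffle.service.enabled", "true")
  .set("spark.dynamicAllocation.minExecutors", "1")
  .set("spark.dynamicAllocation.maxExecutors", "20")

val sc = new SparkContext(conf)
```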

docs/tuning.md

Lines changed: 2 additions & 2 deletions
@@ -61,8 +61,8 @@ The [Kryo documentation](https://github.com/EsotericSoftware/kryo) describes more
 registration options, such as adding custom serialization code.

 If your objects are large, you may also need to increase the `spark.kryoserializer.buffer`
-config property. The default is 2, but this value needs to be large enough to hold the *largest*
-object you will serialize.
+[config](configuration.html#compression-and-serialization). This value needs to be large enough
+to hold the *largest* object you will serialize.

 Finally, if you don't register your custom classes, Kryo will still work, but it will have to store
 the full class name with each object, which is wasteful.
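
For example, a job serializing large objects with Kryo might register its classes and enlarge the buffer together; the `Record` class and the buffer sizes below are illustrative values, not recommendations from the docs:

```scala
import org.apache.spark.SparkConf

case class Record(id: Long, payload: Array[Byte])

// Use Kryo, register the custom class, and make sure the buffer can hold the
// largest Record that will be serialized ("64k" initial / "64m" max are examples only).
val conf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .set("spark.kryoserializer.buffer", "64k")
  .set("spark.kryoserializer.buffer.max", "64m")
  .registerKryoClasses(Array(classOf[Record]))
```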

launcher/src/main/java/org/apache/spark/launcher/ChildProcAppHandle.java

Lines changed: 15 additions & 2 deletions
@@ -18,6 +18,7 @@
 package org.apache.spark.launcher;

 import java.io.IOException;
+import java.lang.reflect.Method;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.concurrent.ThreadFactory;
@@ -102,8 +103,20 @@ public synchronized void kill() {
       disconnect();
     }
     if (childProc != null) {
-      childProc.destroy();
-      childProc = null;
+      try {
+        childProc.exitValue();
+      } catch (IllegalThreadStateException e) {
+        // Child is still alive. Try to use Java 8's "destroyForcibly()" if available,
+        // fall back to the old API if it's not there.
+        try {
+          Method destroy = childProc.getClass().getMethod("destroyForcibly");
+          destroy.invoke(childProc);
+        } catch (Exception inner) {
+          childProc.destroy();
+        }
+      } finally {
+        childProc = null;
+      }
     }
   }
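
Because the launcher still has to build and run on Java 7, `destroyForcibly()` (added in Java 8) is looked up reflectively rather than called directly. A Scala rendering of the same fallback, with a hypothetical `killForcibly` helper name:

```scala
import java.lang.reflect.Method

// Prefer Java 8's Process.destroyForcibly() when present; otherwise fall back to
// destroy(), mirroring the reflective lookup in ChildProcAppHandle.kill().
def killForcibly(proc: Process): Unit = {
  try {
    proc.exitValue()  // throws if the child is still running
  } catch {
    case _: IllegalThreadStateException =>
      try {
        val destroy: Method = proc.getClass.getMethod("destroyForcibly")
        destroy.invoke(proc)
      } catch {
        case _: Exception => proc.destroy()
      }
  }
}
```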

launcher/src/main/java/org/apache/spark/launcher/SparkAppHandle.java

Lines changed: 3 additions & 0 deletions
@@ -89,6 +89,9 @@ public boolean isFinal() {
    * Tries to kill the underlying application. Implies {@link #disconnect()}. This will not send
    * a {@link #stop()} message to the application, so it's recommended that users first try to
    * stop the application cleanly and only resort to this method if that fails.
+   * <p>
+   * Note that if the application is running as a child process, this method may fail to kill the
+   * process when using Java 7. This may happen if, for example, the application is deadlocked.
    */
   void kill();

python/pyspark/rdd.py

Lines changed: 0 additions & 1 deletion
@@ -1760,7 +1760,6 @@ def combineByKey(self, createCombiner, mergeValue, mergeCombiners,
         In addition, users can control the partitioning of the output RDD.

         >>> x = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
-        >>> def f(x): return x
         >>> def add(a, b): return a + str(b)
         >>> sorted(x.combineByKey(str, add, add).collect())
         [('a', '11'), ('b', '1')]

python/pyspark/sql/context.py

Lines changed: 1 addition & 1 deletion
@@ -415,7 +415,7 @@ def createDataFrame(self, data, schema=None, samplingRatio=None):

         >>> sqlContext.createDataFrame(df.toPandas()).collect()  # doctest: +SKIP
         [Row(name=u'Alice', age=1)]
-        >>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]]).collect())  # doctest: +SKIP
+        >>> sqlContext.createDataFrame(pandas.DataFrame([[1, 2]])).collect()  # doctest: +SKIP
         [Row(0=1, 1=2)]
         """
         if isinstance(data, DataFrame):
