From 17929cccb18a47fdd97b0d665290fdf1896070a9 Mon Sep 17 00:00:00 2001
From: Patrick Wendell
Date: Tue, 22 Apr 2014 14:09:47 -0700
Subject: [PATCH 1/3] Assorted clean-up for Spark-on-YARN.

In particular when the HADOOP_CONF_DIRS are not specified.
---
 conf/spark-env.sh.template                              | 1 +
 .../org/apache/spark/deploy/SparkSubmitArguments.scala  | 9 +++++++++
 docs/hadoop-third-party-distributions.md                | 9 ++-------
 .../scala/org/apache/spark/deploy/yarn/ClientBase.scala | 8 +++++---
 4 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
index 177a21cc0377..a7a77d52f9e7 100755
--- a/conf/spark-env.sh.template
+++ b/conf/spark-env.sh.template
@@ -5,6 +5,7 @@
 
 # Options read when launching programs locally with
 # ./bin/run-example or ./bin/spark-submit
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
 # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node
 # - SPARK_PUBLIC_DNS, to set the public dns name of the driver program
 # - SPARK_CLASSPATH, default classpath entries to append
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 02502adfbd0c..99abe8e12aa4 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -116,6 +116,15 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
     if (args.length == 0) printUsageAndExit(-1)
     if (primaryResource == null) SparkSubmit.printErrorAndExit("Must specify a primary resource")
     if (mainClass == null) SparkSubmit.printErrorAndExit("Must specify a main class with --class")
+    if (master.startsWith("yarn")) {
+      val hasHadoopEnv = sys.env.contains("HADOOP_CONF_DIR") ||
+        sys.env.contains("YARN_CONF_DIR") ||
+        sys.env.contains("SPARK_TESTING")
+      if (!hasHadoopEnv) {
+        throw new Exception(s"When running with master '$master'" +
+          s" either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.")
+      }
+    }
   }
 
   override def toString = {
diff --git a/docs/hadoop-third-party-distributions.md b/docs/hadoop-third-party-distributions.md
index de6a2b0a43bd..454877a7fa8a 100644
--- a/docs/hadoop-third-party-distributions.md
+++ b/docs/hadoop-third-party-distributions.md
@@ -110,10 +110,5 @@ The location of these configuration files varies across CDH and HDP versions, bu
 a common location is inside of `/etc/hadoop/conf`. Some tools, such as Cloudera Manager, create
 configurations on-the-fly, but offer a mechanisms to download copies of them.
 
-There are a few ways to make these files visible to Spark:
-
-* You can copy these files into `$SPARK_HOME/conf` and they will be included in Spark's
-classpath automatically.
-* If you are running Spark on the same nodes as Hadoop _and_ your distribution includes both
-`hdfs-site.xml` and `core-site.xml` in the same directory, you can set `HADOOP_CONF_DIR`
-in `$SPARK_HOME/spark-env.sh` to that directory.
+To make these files visible to Spark, set `HADOOP_CONF_DIR` in `$SPARK_HOME/spark-env.sh`
+to a location containing the configuration files.
diff --git a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
index c00b63669ca8..ebac1e73c3e2 100644
--- a/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
+++ b/yarn/common/src/main/scala/org/apache/spark/deploy/yarn/ClientBase.scala
@@ -375,9 +375,11 @@ object ClientBase {
     val classpathEntries = Option(conf.getStrings(
       YarnConfiguration.YARN_APPLICATION_CLASSPATH)).getOrElse(
         getDefaultYarnApplicationClasspath())
-    for (c <- classpathEntries) {
-      YarnSparkHadoopUtil.addToEnvironment(env, Environment.CLASSPATH.name, c.trim,
-        File.pathSeparator)
+    if (classpathEntries != null) {
+      for (c <- classpathEntries) {
+        YarnSparkHadoopUtil.addToEnvironment(env, Environment.CLASSPATH.name, c.trim,
+          File.pathSeparator)
+      }
     }
 
     val mrClasspathEntries = Option(conf.getStrings(

From 18d09c1e1f594680139bbff91aa0acb6b3ce4971 Mon Sep 17 00:00:00 2001
From: Patrick Wendell
Date: Tue, 22 Apr 2014 15:13:54 -0700
Subject: [PATCH 2/3] Review comments from Andrew

---
 conf/spark-env.sh.template                          |  1 +
 .../apache/spark/deploy/SparkSubmitArguments.scala  | 12 +++++-----
 2 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/conf/spark-env.sh.template b/conf/spark-env.sh.template
index a7a77d52f9e7..f906be611a93 100755
--- a/conf/spark-env.sh.template
+++ b/conf/spark-env.sh.template
@@ -18,6 +18,7 @@
 # - MESOS_NATIVE_LIBRARY, to point to your libmesos.so if you use Mesos
 
 # Options read in YARN client mode
+# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files
 # - SPARK_EXECUTOR_INSTANCES, Number of workers to start (Default: 2)
 # - SPARK_EXECUTOR_CORES, Number of cores for the workers (Default: 1).
 # - SPARK_EXECUTOR_MEMORY, Memory per Worker (e.g. 1000M, 2G) (Default: 1G)
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 99abe8e12aa4..9a3384eecddc 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -116,13 +116,15 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
     if (args.length == 0) printUsageAndExit(-1)
     if (primaryResource == null) SparkSubmit.printErrorAndExit("Must specify a primary resource")
     if (mainClass == null) SparkSubmit.printErrorAndExit("Must specify a main class with --class")
+    val testing = sys.env.contains("SPARK_TESTING")
     if (master.startsWith("yarn")) {
-      val hasHadoopEnv = sys.env.contains("HADOOP_CONF_DIR") ||
-        sys.env.contains("YARN_CONF_DIR") ||
-        sys.env.contains("SPARK_TESTING")
+      val hasHadoopEnv = testing ||
+        sys.env.contains("HADOOP_CONF_DIR") ||
+        sys.env.contains("YARN_CONF_DIR")
+
       if (!hasHadoopEnv) {
-        throw new Exception(s"When running with master '$master'" +
-          s" either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.")
+        throw new Exception(s"When running with master '$master' " +
+          "either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.")
       }
     }
   }

From fe95f131558ada018f3ed730a59173aefe014432 Mon Sep 17 00:00:00 2001
From: Patrick Wendell
Date: Tue, 22 Apr 2014 18:28:48 -0700
Subject: [PATCH 3/3] Changes based on Andrew's feedback

---
 .../org/apache/spark/deploy/SparkSubmitArguments.scala | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
index 9a3384eecddc..cc976565cc72 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala
@@ -116,13 +116,11 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
     if (args.length == 0) printUsageAndExit(-1)
     if (primaryResource == null) SparkSubmit.printErrorAndExit("Must specify a primary resource")
     if (mainClass == null) SparkSubmit.printErrorAndExit("Must specify a main class with --class")
-    val testing = sys.env.contains("SPARK_TESTING")
-    if (master.startsWith("yarn")) {
-      val hasHadoopEnv = testing ||
-        sys.env.contains("HADOOP_CONF_DIR") ||
-        sys.env.contains("YARN_CONF_DIR")
-
-      if (!hasHadoopEnv) {
+    if (master.startsWith("yarn")) {
+      val hasHadoopEnv = sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")
+      val testing = sys.env.contains("SPARK_TESTING")
+      if (!hasHadoopEnv && !testing) {
         throw new Exception(s"When running with master '$master' " +
           "either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.")
       }
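
Taken together, the series leaves SparkSubmitArguments validating the YARN environment up front: submitting against a `yarn` master fails fast unless `HADOOP_CONF_DIR` or `YARN_CONF_DIR` is set, with `SPARK_TESTING` acting as an escape hatch so the test suite needs no Hadoop install. Below is a self-contained sketch of the check the series converges on (patch 3's form); the `CheckYarnEnv` wrapper and `main` method are illustrative additions, not part of the patch:

```scala
// Sketch of the final validation logic; the body mirrors the patched
// SparkSubmitArguments, the surrounding object is hypothetical.
object CheckYarnEnv {
  def validate(master: String): Unit = {
    if (master.startsWith("yarn")) {
      // Either Hadoop config variable satisfies the check.
      val hasHadoopEnv = sys.env.contains("HADOOP_CONF_DIR") || sys.env.contains("YARN_CONF_DIR")
      // SPARK_TESTING suppresses the check for unit tests.
      val testing = sys.env.contains("SPARK_TESTING")
      if (!hasHadoopEnv && !testing) {
        throw new Exception(s"When running with master '$master' " +
          "either HADOOP_CONF_DIR or YARN_CONF_DIR must be set in the environment.")
      }
    }
  }

  def main(args: Array[String]): Unit = {
    // Throws unless the environment is prepared, e.g.
    //   export HADOOP_CONF_DIR=/etc/hadoop/conf
    validate("yarn-client")
    println("YARN environment looks usable")
  }
}
```

The ClientBase change in patch 1 guards the same failure mode from the other side: `Option(conf.getStrings(...)).getOrElse(getDefaultYarnApplicationClasspath())` can still evaluate to null when the fallback itself returns null, so the classpath loop is wrapped in an explicit null check rather than assuming entries are always available.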