
Commit dc3b640

SPARK-1619 Launch spark-shell with spark-submit
This simplifies the shell considerably and passes all arguments through to spark-submit. There is one small incompatibility with 0.9.1: the `-c` shorthand for `--cores` is no longer accepted, only `--cores`. However, spark-submit gives a clear error message in that case, few people are likely to have used the shorthand, and it is a trivial change for users.

Author: Patrick Wendell <pwendell@gmail.com>

Closes apache#542 from pwendell/spark-shell and squashes the following commits:

9eb3e6f [Patrick Wendell] Updating Spark docs
b552459 [Patrick Wendell] Andrew's feedback
97720fa [Patrick Wendell] Review feedback
aa2900b [Patrick Wendell] SPARK-1619 Launch spark-shell with spark-submit
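For a concrete picture of the user-facing change, here is roughly what an equivalent invocation looks like before and after this patch. This is only a sketch: the master URL and memory sizes are example values, and the new-style flags shown are standard spark-submit options referenced in the updated docs below.

```bash
# 0.9.1-style flags, parsed by spark-shell's own option loop (removed in this patch)
./bin/spark-shell -m spark://localhost:7077 -dm 512m -em 2g

# New style: everything after the script name is forwarded to spark-submit,
# so its long-form options are used instead
./bin/spark-shell --master spark://localhost:7077 --driver-memory 512m --executor-memory 2g
```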
1 parent 6e101f1 commit dc3b640

11 files changed

Lines changed: 39 additions & 189 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ conf/java-opts
 conf/spark-env.sh
 conf/streaming-env.sh
 conf/log4j.properties
+conf/spark-defaults.conf
 docs/_site
 docs/api
 target/

bin/spark-shell

Lines changed: 9 additions & 168 deletions
@@ -19,9 +19,8 @@
 
 #
 # Shell script for starting the Spark Shell REPL
-# Note that it will set MASTER to spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
-# if those two env vars are set in spark-env.sh but MASTER is not.
 
+args="$@"
 cygwin=false
 case "`uname`" in
     CYGWIN*) cygwin=true;;
@@ -30,133 +29,16 @@ esac
 # Enter posix mode for bash
 set -o posix
 
+if [[ "$@" == *--help* ]]; then
+  echo "Usage: ./bin/spark-shell [options]"
+  ./bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
+  exit 0
+fi
+
 ## Global script variables
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
-SPARK_REPL_OPTS="${SPARK_REPL_OPTS:-""}"
-DEFAULT_MASTER="local[*]"
-MASTER=${MASTER:-""}
-
-info_log=0
-
-#CLI Color Templates
-txtund=$(tput sgr 0 1)          # Underline
-txtbld=$(tput bold)             # Bold
-bldred=${txtbld}$(tput setaf 1) # red
-bldyel=${txtbld}$(tput setaf 3) # yellow
-bldblu=${txtbld}$(tput setaf 4) # blue
-bldwht=${txtbld}$(tput setaf 7) # white
-txtrst=$(tput sgr0)             # Reset
-info=${bldwht}*${txtrst}        # Feedback
-pass=${bldblu}*${txtrst}
-warn=${bldred}*${txtrst}
-ques=${bldblu}?${txtrst}
-
-# Helper function to describe the script usage
-function usage() {
-    cat << EOF
-${txtbld}Usage${txtrst}: spark-shell [OPTIONS]
-
-${txtbld}OPTIONS${txtrst}:
-    -h  --help             : Print this help information.
-    -c  --cores            : The maximum number of cores to be used by the Spark Shell.
-    -em --executor-memory  : The memory used by each executor of the Spark Shell, the number
-                             is followed by m for megabytes or g for gigabytes, e.g. "1g".
-    -dm --driver-memory    : The memory used by the Spark Shell, the number is followed
-                             by m for megabytes or g for gigabytes, e.g. "1g".
-    -m  --master           : A full string that describes the Spark Master, defaults to "local[*]"
-                             e.g. "spark://localhost:7077".
-    --log-conf             : Enables logging of the supplied SparkConf as INFO at start of the
-                             Spark Context.
-
-e.g.
-    spark-shell -m spark://localhost:7077 -c 4 -dm 512m -em 2g
-
-EOF
-}
-
-function out_error(){
-    echo -e "${txtund}${bldred}ERROR${txtrst}: $1"
-    usage
-    exit 1
-}
-
-function log_info(){
-    [ $info_log -eq 1 ] && echo -e "${bldyel}INFO${txtrst}: $1"
-}
-
-function log_warn(){
-    echo -e "${txtund}${bldyel}WARN${txtrst}: $1"
-}
-
-# PATTERNS used to validate more than one optional arg.
-ARG_FLAG_PATTERN="^-"
-MEM_PATTERN="^[0-9]+[m|g|M|G]$"
-NUM_PATTERN="^[0-9]+$"
-PORT_PATTERN="^[0-9]+$"
-
-# Setters for optional args.
-function set_cores(){
-    CORE_PATTERN="^[0-9]+$"
-    if [[ "$1" =~ $CORE_PATTERN ]]; then
-        SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.cores.max=$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_em(){
-    if [[ $1 =~ $MEM_PATTERN ]]; then
-        SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.executor.memory=$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_dm(){
-    if [[ $1 =~ $MEM_PATTERN ]]; then
-        export SPARK_DRIVER_MEMORY=$1
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_spark_log_conf(){
-    SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.logConf=$1"
-}
-
-function set_spark_master(){
-    if ! [[ "$1" =~ $ARG_FLAG_PATTERN ]]; then
-        export MASTER="$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function resolve_spark_master(){
-    # Set MASTER from spark-env if possible
-    DEFAULT_SPARK_MASTER_PORT=7077
-    if [ -z "$MASTER" ]; then
-        . $FWDIR/bin/load-spark-env.sh
-        if [ -n "$SPARK_MASTER_IP" ]; then
-            SPARK_MASTER_PORT="${SPARK_MASTER_PORT:-"$DEFAULT_SPARK_MASTER_PORT"}"
-            export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}"
-        fi
-    fi
-
-    if [ -z "$MASTER" ]; then
-        export MASTER="$DEFAULT_MASTER"
-    fi
-
-}
-
 function main(){
-    log_info "Base Directory set to $FWDIR"
-
-    resolve_spark_master
-    log_info "Spark Master is $MASTER"
-
-    log_info "Spark REPL options $SPARK_REPL_OPTS"
     if $cygwin; then
         # Workaround for issue involving JLine and Cygwin
         # (see http://sourceforge.net/p/jline/bugs/40/).
@@ -165,55 +47,14 @@ function main(){
         # (see https://github.com/sbt/sbt/issues/562).
         stty -icanon min 1 -echo > /dev/null 2>&1
         export SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Djline.terminal=unix"
-        $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
+        $FWDIR/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main
         stty icanon echo > /dev/null 2>&1
     else
         export SPARK_REPL_OPTS
-        $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
+        $FWDIR/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main
     fi
 }
 
-for option in "$@"
-do
-    case $option in
-        -h | --help )
-            usage
-            exit 1
-            ;;
-        -c | --cores)
-            shift
-            _1=$1
-            shift
-            set_cores $_1 "-c/--cores"
-            ;;
-        -em | --executor-memory)
-            shift
-            _1=$1
-            shift
-            set_em $_1 "-em/--executor-memory"
-            ;;
-        -dm | --driver-memory)
-            shift
-            _1=$1
-            shift
-            set_dm $_1 "-dm/--driver-memory"
-            ;;
-        -m | --master)
-            shift
-            _1=$1
-            shift
-            set_spark_master $_1 "-m/--master"
-            ;;
-        --log-conf)
-            shift
-            set_spark_log_conf "true"
-            info_log=1
-            ;;
-        ?)
-            ;;
-    esac
-done
-
 # Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in
 # binary distribution of Spark where Scala is not installed
 exit_status=127
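A quick way to exercise the two new code paths from a Spark checkout (the master URL below is only an example): the `--help` branch short-circuits into spark-submit's usage text, and any other arguments are captured in `args` and forwarded to spark-submit together with the reserved `spark-internal` resource and the REPL main class.

```bash
# New --help branch: prints spark-submit's option list for the shell wrapper
./bin/spark-shell --help

# Any other options are forwarded to spark-submit, which then launches
# org.apache.spark.repl.Main with no user jar on the classpath
./bin/spark-shell --master local[4] --driver-memory 512m
```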

bin/spark-submit

Lines changed: 5 additions & 5 deletions
@@ -21,15 +21,15 @@ export SPARK_HOME="$(cd `dirname $0`/..; pwd)"
 ORIG_ARGS=$@
 
 while (($#)); do
-  if [ $1 = "--deploy-mode" ]; then
+  if [ "$1" = "--deploy-mode" ]; then
     DEPLOY_MODE=$2
-  elif [ $1 = "--driver-memory" ]; then
+  elif [ "$1" = "--driver-memory" ]; then
     DRIVER_MEMORY=$2
-  elif [ $1 = "--driver-library-path" ]; then
+  elif [ "$1" = "--driver-library-path" ]; then
     export _SPARK_LIBRARY_PATH=$2
-  elif [ $1 = "--driver-class-path" ]; then
+  elif [ "$1" = "--driver-class-path" ]; then
     export SPARK_CLASSPATH="$SPARK_CLASSPATH:$2"
-  elif [ $1 = "--driver-java-options" ]; then
+  elif [ "$1" = "--driver-java-options" ]; then
     export SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $2"
   fi
   shift
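The change here is purely defensive quoting: with an unquoted `$1`, the test expression falls apart under word splitting when the argument is empty or contains whitespace. A standalone illustration of the shell behavior (independent of Spark):

```bash
set -- ""                     # simulate an empty first positional parameter
[ $1 = "--deploy-mode" ]      # expands to [ = "--deploy-mode" ]: "unary operator expected"
[ "$1" = "--deploy-mode" ]    # quoted form compares the empty string without error
```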

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 9 additions & 1 deletion
@@ -38,6 +38,12 @@ object SparkSubmit {
 
   private var clusterManager: Int = LOCAL
 
+  /**
+   * A special jar name that indicates the class being run is inside of Spark itself,
+   * and therefore no user jar is needed.
+   */
+  private val RESERVED_JAR_NAME = "spark-internal"
+
   def main(args: Array[String]) {
     val appArgs = new SparkSubmitArguments(args)
     if (appArgs.verbose) {
@@ -113,7 +119,9 @@ object SparkSubmit {
 
     if (!deployOnCluster) {
       childMainClass = appArgs.mainClass
-      childClasspath += appArgs.primaryResource
+      if (appArgs.primaryResource != RESERVED_JAR_NAME) {
+        childClasspath += appArgs.primaryResource
+      }
     } else if (clusterManager == YARN) {
       childMainClass = "org.apache.spark.deploy.yarn.Client"
       childArgs += ("--jar", appArgs.primaryResource)

core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
     deployMode = Option(deployMode).getOrElse(System.getenv("DEPLOY_MODE"))
 
     // Global defaults. These should be keep to minimum to avoid confusing behavior.
-    master = Option(master).getOrElse("local")
+    master = Option(master).getOrElse("local[*]")
   }
 
   /** Ensure that required fields exists. Call this only once all defaults are loaded. */
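`local[*]` runs a local-mode context with one worker thread per available core, matching the `DEFAULT_MASTER` the old shell script used, so a bare launch keeps its previous behavior:

```bash
# With no --master flag, spark-submit now defaults to local[*] (all cores)
# rather than local (a single worker thread)
./bin/spark-shell
```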

docs/scala-programming-guide.md

Lines changed: 5 additions & 4 deletions
@@ -60,17 +60,18 @@ which avoids hard-coding the master name in your application.
 
 In the Spark shell, a special interpreter-aware SparkContext is already created for you, in the
 variable called `sc`. Making your own SparkContext will not work. You can set which master the
-context connects to using the `MASTER` environment variable, and you can add JARs to the classpath
-with the `ADD_JARS` variable. For example, to run `bin/spark-shell` on exactly four cores, use
+context connects to using the `--master` argument, and you can add JARs to the classpath
+by passing a comma separated list to the `--jars` argument. For example, to run
+`bin/spark-shell` on exactly four cores, use
 
 {% highlight bash %}
-$ MASTER=local[4] ./bin/spark-shell
+$ ./bin/spark-shell --master local[4]
 {% endhighlight %}
 
 Or, to also add `code.jar` to its classpath, use:
 
 {% highlight bash %}
-$ MASTER=local[4] ADD_JARS=code.jar ./bin/spark-shell
+$ ./bin/spark-shell --master local[4] --jars code.jar
 {% endhighlight %}
 
 ### Master URLs

docs/spark-debugger.md

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ where `path/to/event-log` is where you want the event log to go relative to `$SP
 
 ### Loading the event log into the debugger
 
-1. Run a Spark shell with `MASTER=<i>host</i> ./bin/spark-shell`.
+1. Run a Spark shell with `./bin/spark-shell --master <i>host</i>`.
 2. Use `EventLogReader` to load the event log as follows:
 {% highlight scala %}
 spark> val r = new spark.EventLogReader(sc, Some("path/to/event-log"))

docs/spark-standalone.md

Lines changed: 2 additions & 2 deletions
@@ -139,12 +139,12 @@ constructor](scala-programming-guide.html#initializing-spark).
 
 To run an interactive Spark shell against the cluster, run the following command:
 
-    MASTER=spark://IP:PORT ./bin/spark-shell
+    ./bin/spark-shell --master spark://IP:PORT
 
 Note that if you are running spark-shell from one of the spark cluster machines, the `bin/spark-shell` script will
 automatically set MASTER from the `SPARK_MASTER_IP` and `SPARK_MASTER_PORT` variables in `conf/spark-env.sh`.
 
-You can also pass an option `-c <numCores>` to control the number of cores that spark-shell uses on the cluster.
+You can also pass an option `--cores <numCores>` to control the number of cores that spark-shell uses on the cluster.
 
 # Launching Compiled Spark Applications
 
docs/streaming-programming-guide.md

Lines changed: 2 additions & 4 deletions
@@ -272,12 +272,10 @@ Time: 1357008430000 ms
 </td>
 </table>
 
-If you plan to run the Scala code for Spark Streaming-based use cases in the Spark
-shell, you should start the shell with the SparkConfiguration pre-configured to
-discard old batches periodically:
+You can also use Spark Streaming directly from the Spark shell:
 
 {% highlight bash %}
-$ SPARK_JAVA_OPTS=-Dspark.cleaner.ttl=10000 bin/spark-shell
+$ bin/spark-shell
 {% endhighlight %}
 
 ... and create your StreamingContext by wrapping the existing interactive shell

make-distribution.sh

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@
 # 2) cd to deploy dir; ./sbin/start-master.sh
 # 3) Verify master is up by visiting web page, ie http://master-ip:8080. Note the spark:// URL.
 # 4) ./sbin/start-slave.sh 1 <<spark:// URL>>
-# 5) MASTER="spark://my-master-ip:7077" ./bin/spark-shell
+# 5) ./bin/spark-shell --master spark://my-master-ip:7077
 #
 
 # Figure out where the Spark framework is installed
