
Commit dc3b640

SPARK-1619 Launch spark-shell with spark-submit
This simplifies the shell considerably and passes all arguments through to spark-submit. There is one small incompatibility with 0.9.1: the `-c` shorthand for `--cores` is no longer accepted, only `--cores`. However, spark-submit gives a clear error message in that case, few people are likely to have used the shorthand, and it is a trivial change for users.

Author: Patrick Wendell <pwendell@gmail.com>

Closes apache#542 from pwendell/spark-shell and squashes the following commits:

9eb3e6f [Patrick Wendell] Updating Spark docs
b552459 [Patrick Wendell] Andrew's feedback
97720fa [Patrick Wendell] Review feedback
aa2900b [Patrick Wendell] SPARK-1619 Launch spark-shell with spark-submit
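For a concrete picture of the user-facing change, here is roughly what an equivalent invocation looks like before and after this patch. This is only a sketch: the master URL and memory sizes are example values, and the new-style flags shown are standard spark-submit options referenced in the updated docs below.

```bash
# 0.9.1-style flags, parsed by spark-shell's own option loop (removed in this patch)
./bin/spark-shell -m spark://localhost:7077 -dm 512m -em 2g

# New style: everything after the script name is forwarded to spark-submit,
# so its long-form options are used instead
./bin/spark-shell --master spark://localhost:7077 --driver-memory 512m --executor-memory 2g
```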
1 parent 6e101f1 commit dc3b640

11 files changed

Lines changed: 39 additions & 189 deletions


.gitignore

Lines changed: 1 addition & 0 deletions
@@ -18,6 +18,7 @@ conf/java-opts
 conf/spark-env.sh
 conf/streaming-env.sh
 conf/log4j.properties
+conf/spark-defaults.conf
 docs/_site
 docs/api
 target/

bin/spark-shell

Lines changed: 9 additions & 168 deletions
@@ -19,9 +19,8 @@
 
 #
 # Shell script for starting the Spark Shell REPL
-# Note that it will set MASTER to spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}
-# if those two env vars are set in spark-env.sh but MASTER is not.
 
+args="$@"
 cygwin=false
 case "`uname`" in
     CYGWIN*) cygwin=true;;
@@ -30,133 +29,16 @@ esac
 # Enter posix mode for bash
 set -o posix
 
+if [[ "$@" == *--help* ]]; then
+  echo "Usage: ./bin/spark-shell [options]"
+  ./bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
+  exit 0
+fi
+
 ## Global script variables
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
-SPARK_REPL_OPTS="${SPARK_REPL_OPTS:-""}"
-DEFAULT_MASTER="local[*]"
-MASTER=${MASTER:-""}
-
-info_log=0
-
-#CLI Color Templates
-txtund=$(tput sgr 0 1)          # Underline
-txtbld=$(tput bold)             # Bold
-bldred=${txtbld}$(tput setaf 1) # red
-bldyel=${txtbld}$(tput setaf 3) # yellow
-bldblu=${txtbld}$(tput setaf 4) # blue
-bldwht=${txtbld}$(tput setaf 7) # white
-txtrst=$(tput sgr0)             # Reset
-info=${bldwht}*${txtrst}        # Feedback
-pass=${bldblu}*${txtrst}
-warn=${bldred}*${txtrst}
-ques=${bldblu}?${txtrst}
-
-# Helper function to describe the script usage
-function usage() {
-    cat << EOF
-${txtbld}Usage${txtrst}: spark-shell [OPTIONS]
-
-${txtbld}OPTIONS${txtrst}:
-    -h  --help             : Print this help information.
-    -c  --cores            : The maximum number of cores to be used by the Spark Shell.
-    -em --executor-memory  : The memory used by each executor of the Spark Shell, the number
-                             is followed by m for megabytes or g for gigabytes, e.g. "1g".
-    -dm --driver-memory    : The memory used by the Spark Shell, the number is followed
-                             by m for megabytes or g for gigabytes, e.g. "1g".
-    -m  --master           : A full string that describes the Spark Master, defaults to "local[*]"
-                             e.g. "spark://localhost:7077".
-    --log-conf             : Enables logging of the supplied SparkConf as INFO at start of the
-                             Spark Context.
-
-e.g.
-    spark-shell -m spark://localhost:7077 -c 4 -dm 512m -em 2g
-
-EOF
-}
-
-function out_error(){
-    echo -e "${txtund}${bldred}ERROR${txtrst}: $1"
-    usage
-    exit 1
-}
-
-function log_info(){
-    [ $info_log -eq 1 ] && echo -e "${bldyel}INFO${txtrst}: $1"
-}
-
-function log_warn(){
-    echo -e "${txtund}${bldyel}WARN${txtrst}: $1"
-}
-
-# PATTERNS used to validate more than one optional arg.
-ARG_FLAG_PATTERN="^-"
-MEM_PATTERN="^[0-9]+[m|g|M|G]$"
-NUM_PATTERN="^[0-9]+$"
-PORT_PATTERN="^[0-9]+$"
-
-# Setters for optional args.
-function set_cores(){
-    CORE_PATTERN="^[0-9]+$"
-    if [[ "$1" =~ $CORE_PATTERN ]]; then
-        SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.cores.max=$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_em(){
-    if [[ $1 =~ $MEM_PATTERN ]]; then
-        SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.executor.memory=$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_dm(){
-    if [[ $1 =~ $MEM_PATTERN ]]; then
-        export SPARK_DRIVER_MEMORY=$1
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function set_spark_log_conf(){
-    SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Dspark.logConf=$1"
-}
-
-function set_spark_master(){
-    if ! [[ "$1" =~ $ARG_FLAG_PATTERN ]]; then
-        export MASTER="$1"
-    else
-        out_error "wrong format for $2"
-    fi
-}
-
-function resolve_spark_master(){
-    # Set MASTER from spark-env if possible
-    DEFAULT_SPARK_MASTER_PORT=7077
-    if [ -z "$MASTER" ]; then
-        . $FWDIR/bin/load-spark-env.sh
-        if [ -n "$SPARK_MASTER_IP" ]; then
-            SPARK_MASTER_PORT="${SPARK_MASTER_PORT:-"$DEFAULT_SPARK_MASTER_PORT"}"
-            export MASTER="spark://${SPARK_MASTER_IP}:${SPARK_MASTER_PORT}"
-        fi
-    fi
-
-    if [ -z "$MASTER" ]; then
-        export MASTER="$DEFAULT_MASTER"
-    fi
-
-}
-
 function main(){
-    log_info "Base Directory set to $FWDIR"
-
-    resolve_spark_master
-    log_info "Spark Master is $MASTER"
-
-    log_info "Spark REPL options $SPARK_REPL_OPTS"
     if $cygwin; then
         # Workaround for issue involving JLine and Cygwin
         # (see http://sourceforge.net/p/jline/bugs/40/).
@@ -165,55 +47,14 @@ function main(){
         # (see https://github.com/sbt/sbt/issues/562).
         stty -icanon min 1 -echo > /dev/null 2>&1
         export SPARK_REPL_OPTS="$SPARK_REPL_OPTS -Djline.terminal=unix"
-        $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
+        $FWDIR/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main
         stty icanon echo > /dev/null 2>&1
     else
         export SPARK_REPL_OPTS
-        $FWDIR/bin/spark-class org.apache.spark.repl.Main "$@"
+        $FWDIR/bin/spark-submit spark-internal "$args" --class org.apache.spark.repl.Main
     fi
 }
 
-for option in "$@"
-do
-    case $option in
-        -h | --help )
-            usage
-            exit 1
-            ;;
-        -c | --cores)
-            shift
-            _1=$1
-            shift
-            set_cores $_1 "-c/--cores"
-            ;;
-        -em | --executor-memory)
-            shift
-            _1=$1
-            shift
-            set_em $_1 "-em/--executor-memory"
-            ;;
-        -dm | --driver-memory)
-            shift
-            _1=$1
-            shift
-            set_dm $_1 "-dm/--driver-memory"
-            ;;
-        -m | --master)
-            shift
-            _1=$1
-            shift
-            set_spark_master $_1 "-m/--master"
-            ;;
-        --log-conf)
-            shift
-            set_spark_log_conf "true"
-            info_log=1
-            ;;
-        ?)
-            ;;
-    esac
-done
-
 # Copy restore-TTY-on-exit functions from Scala script so spark-shell exits properly even in
 # binary distribution of Spark where Scala is not installed
 exit_status=127
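A quick way to exercise the two new code paths from a Spark checkout (the master URL below is only an example): the `--help` branch short-circuits into spark-submit's usage text, and any other arguments are captured in `args` and forwarded to spark-submit together with the reserved `spark-internal` resource and the REPL main class.

```bash
# New --help branch: prints spark-submit's option list for the shell wrapper
./bin/spark-shell --help

# Any other options are forwarded to spark-submit, which then launches
# org.apache.spark.repl.Main with no user jar on the classpath
./bin/spark-shell --master local[4] --driver-memory 512m
```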

bin/spark-submit

Lines changed: 5 additions & 5 deletions
@@ -21,15 +21,15 @@ export SPARK_HOME="$(cd `dirname $0`/..; pwd)"
 ORIG_ARGS=$@
 
 while (($#)); do
-  if [ $1 = "--deploy-mode" ]; then
+  if [ "$1" = "--deploy-mode" ]; then
     DEPLOY_MODE=$2
-  elif [ $1 = "--driver-memory" ]; then
+  elif [ "$1" = "--driver-memory" ]; then
     DRIVER_MEMORY=$2
-  elif [ $1 = "--driver-library-path" ]; then
+  elif [ "$1" = "--driver-library-path" ]; then
     export _SPARK_LIBRARY_PATH=$2
-  elif [ $1 = "--driver-class-path" ]; then
+  elif [ "$1" = "--driver-class-path" ]; then
     export SPARK_CLASSPATH="$SPARK_CLASSPATH:$2"
-  elif [ $1 = "--driver-java-options" ]; then
+  elif [ "$1" = "--driver-java-options" ]; then
     export SPARK_JAVA_OPTS="$SPARK_JAVA_OPTS $2"
   fi
   shift
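The change here is purely defensive quoting: with an unquoted `$1`, the test expression falls apart under word splitting when the argument is empty or contains whitespace. A standalone illustration of the shell behavior (independent of Spark):

```bash
set -- ""                     # simulate an empty first positional parameter
[ $1 = "--deploy-mode" ]      # expands to [ = "--deploy-mode" ]: "unary operator expected"
[ "$1" = "--deploy-mode" ]    # quoted form compares the empty string without error
```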

core/src/main/scala/org/apache/spark/deploy/SparkSubmit.scala

Lines changed: 9 additions & 1 deletion
@@ -38,6 +38,12 @@ object SparkSubmit {
 
   private var clusterManager: Int = LOCAL
 
+  /**
+   * A special jar name that indicates the class being run is inside of Spark itself,
+   * and therefore no user jar is needed.
+   */
+  private val RESERVED_JAR_NAME = "spark-internal"
+
   def main(args: Array[String]) {
     val appArgs = new SparkSubmitArguments(args)
     if (appArgs.verbose) {
@@ -113,7 +119,9 @@ object SparkSubmit {
 
     if (!deployOnCluster) {
       childMainClass = appArgs.mainClass
-      childClasspath += appArgs.primaryResource
+      if (appArgs.primaryResource != RESERVED_JAR_NAME) {
+        childClasspath += appArgs.primaryResource
+      }
     } else if (clusterManager == YARN) {
       childMainClass = "org.apache.spark.deploy.yarn.Client"
       childArgs += ("--jar", appArgs.primaryResource)

core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ private[spark] class SparkSubmitArguments(args: Array[String]) {
     deployMode = Option(deployMode).getOrElse(System.getenv("DEPLOY_MODE"))
 
     // Global defaults. These should be keep to minimum to avoid confusing behavior.
-    master = Option(master).getOrElse("local")
+    master = Option(master).getOrElse("local[*]")
   }
 
   /** Ensure that required fields exists. Call this only once all defaults are loaded. */
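`local[*]` runs a local-mode context with one worker thread per available core, matching the `DEFAULT_MASTER` the old shell script used, so a bare launch keeps its previous behavior:

```bash
# With no --master flag, spark-submit now defaults to local[*] (all cores)
# rather than local (a single worker thread)
./bin/spark-shell
```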

docs/scala-programming-guide.md

Lines changed: 5 additions & 4 deletions
@@ -60,17 +60,18 @@ which avoids hard-coding the master name in your application.
 
 In the Spark shell, a special interpreter-aware SparkContext is already created for you, in the
 variable called `sc`. Making your own SparkContext will not work. You can set which master the
-context connects to using the `MASTER` environment variable, and you can add JARs to the classpath
-with the `ADD_JARS` variable. For example, to run `bin/spark-shell` on exactly four cores, use
+context connects to using the `--master` argument, and you can add JARs to the classpath
+by passing a comma separated list to the `--jars` argument. For example, to run
+`bin/spark-shell` on exactly four cores, use
 
 {% highlight bash %}
-$ MASTER=local[4] ./bin/spark-shell
+$ ./bin/spark-shell --master local[4]
 {% endhighlight %}
 
 Or, to also add `code.jar` to its classpath, use:
 
 {% highlight bash %}
-$ MASTER=local[4] ADD_JARS=code.jar ./bin/spark-shell
+$ ./bin/spark-shell --master local[4] --jars code.jar
 {% endhighlight %}
 
 ### Master URLs

docs/spark-debugger.md

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ where `path/to/event-log` is where you want the event log to go relative to `$SP
 
 ### Loading the event log into the debugger
 
-1. Run a Spark shell with `MASTER=<i>host</i> ./bin/spark-shell`.
+1. Run a Spark shell with `./bin/spark-shell --master <i>host</i>`.
 2. Use `EventLogReader` to load the event log as follows:
 {% highlight scala %}
 spark> val r = new spark.EventLogReader(sc, Some("path/to/event-log"))

docs/spark-standalone.md

Lines changed: 2 additions & 2 deletions
@@ -139,12 +139,12 @@ constructor](scala-programming-guide.html#initializing-spark).
 
 To run an interactive Spark shell against the cluster, run the following command:
 
-    MASTER=spark://IP:PORT ./bin/spark-shell
+    ./bin/spark-shell --master spark://IP:PORT
 
 Note that if you are running spark-shell from one of the spark cluster machines, the `bin/spark-shell` script will
 automatically set MASTER from the `SPARK_MASTER_IP` and `SPARK_MASTER_PORT` variables in `conf/spark-env.sh`.
 
-You can also pass an option `-c <numCores>` to control the number of cores that spark-shell uses on the cluster.
+You can also pass an option `--cores <numCores>` to control the number of cores that spark-shell uses on the cluster.
 
 # Launching Compiled Spark Applications
 
docs/streaming-programming-guide.md

Lines changed: 2 additions & 4 deletions
@@ -272,12 +272,10 @@ Time: 1357008430000 ms
 </td>
 </table>
 
-If you plan to run the Scala code for Spark Streaming-based use cases in the Spark
-shell, you should start the shell with the SparkConfiguration pre-configured to
-discard old batches periodically:
+You can also use Spark Streaming directly from the Spark shell:
 
 {% highlight bash %}
-$ SPARK_JAVA_OPTS=-Dspark.cleaner.ttl=10000 bin/spark-shell
+$ bin/spark-shell
 {% endhighlight %}
 
 ... and create your StreamingContext by wrapping the existing interactive shell

make-distribution.sh

Lines changed: 1 addition & 1 deletion
@@ -36,7 +36,7 @@
 # 2) cd to deploy dir; ./sbin/start-master.sh
 # 3) Verify master is up by visiting web page, ie http://master-ip:8080. Note the spark:// URL.
 # 4) ./sbin/start-slave.sh 1 <<spark:// URL>>
-# 5) MASTER="spark://my-master-ip:7077" ./bin/spark-shell
+# 5) ./bin/spark-shell --master spark://my-master-ip:7077
 #
 
 # Figure out where the Spark framework is installed
