
Commit ade279a

Merge branch 'master' of git://git.apache.org/spark into SPARK-2677

2 parents: 0174d6a + 482c5af

9 files changed: 116 additions & 50 deletions


bin/pyspark

Lines changed: 14 additions & 4 deletions
@@ -23,12 +23,18 @@ FWDIR="$(cd `dirname $0`/..; pwd)"
 # Export this as SPARK_HOME
 export SPARK_HOME="$FWDIR"
 
+source $FWDIR/bin/utils.sh
+
 SCALA_VERSION=2.10
 
-if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
+function usage() {
   echo "Usage: ./bin/pyspark [options]" 1>&2
   $FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
   exit 0
+}
+
+if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
+  usage
 fi
 
 # Exit if the user hasn't compiled Spark

@@ -66,10 +72,11 @@ fi
 # Build up arguments list manually to preserve quotes and backslashes.
 # We export Spark submit arguments as an environment variable because shell.py must run as a
 # PYTHONSTARTUP script, which does not take in arguments. This is required for IPython notebooks.
-
+SUBMIT_USAGE_FUNCTION=usage
+gatherSparkSubmitOpts "$@"
 PYSPARK_SUBMIT_ARGS=""
 whitespace="[[:space:]]"
-for i in "$@"; do
+for i in "${SUBMISSION_OPTS[@]}"; do
   if [[ $i =~ \" ]]; then i=$(echo $i | sed 's/\"/\\\"/g'); fi
   if [[ $i =~ $whitespace ]]; then i=\"$i\"; fi
   PYSPARK_SUBMIT_ARGS="$PYSPARK_SUBMIT_ARGS $i"

@@ -90,7 +97,10 @@ fi
 if [[ "$1" =~ \.py$ ]]; then
   echo -e "\nWARNING: Running python applications through ./bin/pyspark is deprecated as of Spark 1.0." 1>&2
   echo -e "Use ./bin/spark-submit <python file>\n" 1>&2
-  exec $FWDIR/bin/spark-submit "$@"
+  primary=$1
+  shift
+  gatherSparkSubmitOpts "$@"
+  exec $FWDIR/bin/spark-submit "${SUBMISSION_OPTS[@]}" $primary "${APPLICATION_OPTS[@]}"
 else
   # Only use ipython if no command line arguments were provided [SPARK-1134]
   if [[ "$IPYTHON" = "1" ]]; then

bin/spark-shell

Lines changed: 14 additions & 6 deletions
@@ -31,13 +31,21 @@ set -o posix
 ## Global script variables
 FWDIR="$(cd `dirname $0`/..; pwd)"
 
+function usage() {
+  echo "Usage: ./bin/spark-shell [options]"
+  $FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
+  exit 0
+}
+
 if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then
-  echo "Usage: ./bin/spark-shell [options]"
-  $FWDIR/bin/spark-submit --help 2>&1 | grep -v Usage 1>&2
-  exit 0
+  usage
 fi
 
-function main(){
+source $FWDIR/bin/utils.sh
+SUBMIT_USAGE_FUNCTION=usage
+gatherSparkSubmitOpts "$@"
+
+function main() {
   if $cygwin; then
     # Workaround for issue involving JLine and Cygwin
     # (see http://sourceforge.net/p/jline/bugs/40/).

@@ -46,11 +54,11 @@ function main(){
     # (see https://github.com/sbt/sbt/issues/562).
     stty -icanon min 1 -echo > /dev/null 2>&1
     export SPARK_SUBMIT_OPTS="$SPARK_SUBMIT_OPTS -Djline.terminal=unix"
-    $FWDIR/bin/spark-submit --class org.apache.spark.repl.Main spark-shell "$@"
+    $FWDIR/bin/spark-submit --class org.apache.spark.repl.Main "${SUBMISSION_OPTS[@]}" spark-shell "${APPLICATION_OPTS[@]}"
     stty icanon echo > /dev/null 2>&1
   else
     export SPARK_SUBMIT_OPTS
-    $FWDIR/bin/spark-submit --class org.apache.spark.repl.Main spark-shell "$@"
+    $FWDIR/bin/spark-submit --class org.apache.spark.repl.Main "${SUBMISSION_OPTS[@]}" spark-shell "${APPLICATION_OPTS[@]}"
   fi
 }

bin/utils.sh

Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Gather all spark-submit options into SUBMISSION_OPTS
+function gatherSparkSubmitOpts() {
+
+  if [ -z "$SUBMIT_USAGE_FUNCTION" ]; then
+    echo "Function for printing usage of $0 is not set." 1>&2
+    echo "Please set usage function to shell variable 'SUBMIT_USAGE_FUNCTION' in $0" 1>&2
+    exit 1
+  fi
+
+  # NOTE: If you add or remove spark-submit options,
+  # modify NOT ONLY this script but also SparkSubmitArguments.scala
+  SUBMISSION_OPTS=()
+  APPLICATION_OPTS=()
+  while (($#)); do
+    case "$1" in
+      --master | --deploy-mode | --class | --name | --jars | --py-files | --files | \
+      --conf | --properties-file | --driver-memory | --driver-java-options | \
+      --driver-library-path | --driver-class-path | --executor-memory | --driver-cores | \
+      --total-executor-cores | --executor-cores | --queue | --num-executors | --archives)
+        if [[ $# -lt 2 ]]; then
+          "$SUBMIT_USAGE_FUNCTION"
+          exit 1;
+        fi
+        SUBMISSION_OPTS+=("$1"); shift
+        SUBMISSION_OPTS+=("$1"); shift
+        ;;
+
+      --verbose | -v | --supervise)
+        SUBMISSION_OPTS+=("$1"); shift
+        ;;
+
+      *)
+        APPLICATION_OPTS+=("$1"); shift
+        ;;
+    esac
+  done
+
+  export SUBMISSION_OPTS
+  export APPLICATION_OPTS
+}
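
The effect of gatherSparkSubmitOpts is easiest to see on a concrete argument list: a recognized spark-submit option consumes its value and lands in SUBMISSION_OPTS, the flags --verbose/-v/--supervise go there as well, and everything else is collected into APPLICATION_OPTS. Below is a minimal Scala sketch of that same classification rule, for illustration only; it is not part of the patch, the object name and sample arguments are invented, and the missing-value/usage handling of the shell version is omitted.

object GatherOptsSketch {
  // Options that consume a following value (mirrors the first case arm in utils.sh).
  private val optsWithValue = Set(
    "--master", "--deploy-mode", "--class", "--name", "--jars", "--py-files",
    "--files", "--conf", "--properties-file", "--driver-memory",
    "--driver-java-options", "--driver-library-path", "--driver-class-path",
    "--executor-memory", "--driver-cores", "--total-executor-cores",
    "--executor-cores", "--queue", "--num-executors", "--archives")

  // Flags that take no value (mirrors the second case arm).
  private val flags = Set("--verbose", "-v", "--supervise")

  /** Split an argument list into (submissionOpts, applicationOpts). */
  def gather(args: List[String]): (List[String], List[String]) = args match {
    case opt :: value :: rest if optsWithValue(opt) =>
      val (s, a) = gather(rest)
      (opt :: value :: s, a)
    case flag :: rest if flags(flag) =>
      val (s, a) = gather(rest)
      (flag :: s, a)
    case other :: rest =>
      val (s, a) = gather(rest)
      (s, other :: a)
    case Nil => (Nil, Nil)
  }

  def main(args: Array[String]): Unit = {
    val (submit, app) = gather(List("--master", "local[4]", "my_app.py", "arg1"))
    println(submit)  // List(--master, local[4])
    println(app)     // List(my_app.py, arg1)
  }
}

This split is what lets bin/pyspark above pass "${SUBMISSION_OPTS[@]}" before the primary .py file and "${APPLICATION_OPTS[@]}" after it when re-invoking spark-submit.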

core/src/main/scala/org/apache/spark/deploy/SparkSubmitArguments.scala

Lines changed: 4 additions & 0 deletions
@@ -224,6 +224,10 @@ private[spark] class SparkSubmitArguments(args: Seq[String]) {
   // Delineates parsing of Spark options from parsing of user options.
   parse(opts)
 
+  /**
+   * NOTE: If you add or remove spark-submit options,
+   * modify NOT ONLY this file but also utils.sh
+   */
   def parse(opts: Seq[String]): Unit = opts match {
     case ("--name") :: value :: tail =>
       name = value
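
The parse method shown in the context above walks the option list by pattern matching with Scala's list cons (::) extractor, one option and its value per case. A self-contained sketch of that style follows; the object, its fields, and the two options handled here are illustrative stand-ins, not Spark's actual argument set.

object ParseSketch {
  // Illustrative stand-ins for SparkSubmitArguments' fields.
  var name: String = _
  var master: String = _
  var childArgs: List[String] = Nil

  // Each recognized option consumes its value, then parsing continues on the tail,
  // in the same style as `opts match { case ("--name") :: value :: tail => ... }` above.
  def parse(opts: List[String]): Unit = opts match {
    case "--name" :: value :: tail =>
      name = value
      parse(tail)
    case "--master" :: value :: tail =>
      master = value
      parse(tail)
    case value :: tail =>
      childArgs = childArgs :+ value
      parse(tail)
    case Nil =>
  }

  def main(args: Array[String]): Unit = {
    parse(List("--name", "demo", "--master", "local[2]", "app.py", "arg1"))
    println(s"name=$name, master=$master, childArgs=$childArgs")
  }
}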

core/src/main/scala/org/apache/spark/rdd/PairRDDFunctions.scala

Lines changed: 19 additions & 19 deletions
@@ -237,6 +237,25 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     combineByKey[V]((v: V) => v, func, func, partitioner)
   }
 
+  /**
+   * Merge the values for each key using an associative reduce function. This will also perform
+   * the merging locally on each mapper before sending results to a reducer, similarly to a
+   * "combiner" in MapReduce. Output will be hash-partitioned with numPartitions partitions.
+   */
+  def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)] = {
+    reduceByKey(new HashPartitioner(numPartitions), func)
+  }
+
+  /**
+   * Merge the values for each key using an associative reduce function. This will also perform
+   * the merging locally on each mapper before sending results to a reducer, similarly to a
+   * "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
+   * parallelism level.
+   */
+  def reduceByKey(func: (V, V) => V): RDD[(K, V)] = {
+    reduceByKey(defaultPartitioner(self), func)
+  }
+
   /**
    * Merge the values for each key using an associative reduce function, but return the results
    * immediately to the master as a Map. This will also perform the merging locally on each mapper

@@ -374,15 +393,6 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     countApproxDistinctByKey(relativeSD, defaultPartitioner(self))
   }
 
-  /**
-   * Merge the values for each key using an associative reduce function. This will also perform
-   * the merging locally on each mapper before sending results to a reducer, similarly to a
-   * "combiner" in MapReduce. Output will be hash-partitioned with numPartitions partitions.
-   */
-  def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)] = {
-    reduceByKey(new HashPartitioner(numPartitions), func)
-  }
-
   /**
    * Group the values for each key in the RDD into a single sequence. Allows controlling the
    * partitioning of the resulting key-value pair RDD by passing a Partitioner.

@@ -482,16 +492,6 @@ class PairRDDFunctions[K, V](self: RDD[(K, V)])
     combineByKey(createCombiner, mergeValue, mergeCombiners, defaultPartitioner(self))
   }
 
-  /**
-   * Merge the values for each key using an associative reduce function. This will also perform
-   * the merging locally on each mapper before sending results to a reducer, similarly to a
-   * "combiner" in MapReduce. Output will be hash-partitioned with the existing partitioner/
-   * parallelism level.
-   */
-  def reduceByKey(func: (V, V) => V): RDD[(K, V)] = {
-    reduceByKey(defaultPartitioner(self), func)
-  }
-
   /**
    * Group the values for each key in the RDD into a single sequence. Hash-partitions the
    * resulting RDD with the existing partitioner/parallelism level.
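
The relocated overloads above are the two public reduceByKey entry points: one takes an explicit partition count, the other falls back to the default partitioner. A minimal usage sketch follows; the local[2] master, sample data, and object name are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._  // pair-RDD implicits in Spark 1.x

object ReduceByKeySketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("reduceByKey-sketch").setMaster("local[2]"))
    val pairs = sc.parallelize(Seq(("a", 1), ("b", 2), ("a", 3)))

    // Uses the default partitioner / parallelism level.
    val summed = pairs.reduceByKey(_ + _)

    // Hash-partitions the result into an explicit number of partitions.
    val summedIn4 = pairs.reduceByKey(_ + _, 4)

    println(summed.collect().toSeq)       // e.g. ((a,4), (b,2)); ordering is not guaranteed
    println(summedIn4.partitions.length)  // 4
    sc.stop()
  }
}

As the scaladoc notes, the reduce function must be associative because it is also applied map-side before the shuffle.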

core/src/main/scala/org/apache/spark/storage/BlockManagerMessages.scala

Lines changed: 1 addition & 19 deletions
@@ -53,7 +53,7 @@ private[spark] object BlockManagerMessages {
       sender: ActorRef)
     extends ToBlockManagerMaster
 
-  class UpdateBlockInfo(
+  case class UpdateBlockInfo(
       var blockManagerId: BlockManagerId,
       var blockId: BlockId,
       var storageLevel: StorageLevel,

@@ -84,24 +84,6 @@ private[spark] object BlockManagerMessages {
     }
   }
 
-  object UpdateBlockInfo {
-    def apply(
-        blockManagerId: BlockManagerId,
-        blockId: BlockId,
-        storageLevel: StorageLevel,
-        memSize: Long,
-        diskSize: Long,
-        tachyonSize: Long): UpdateBlockInfo = {
-      new UpdateBlockInfo(blockManagerId, blockId, storageLevel, memSize, diskSize, tachyonSize)
-    }
-
-    // For pattern-matching
-    def unapply(h: UpdateBlockInfo)
-      : Option[(BlockManagerId, BlockId, StorageLevel, Long, Long, Long)] = {
-      Some((h.blockManagerId, h.blockId, h.storageLevel, h.memSize, h.diskSize, h.tachyonSize))
-    }
-  }
-
   case class GetLocations(blockId: BlockId) extends ToBlockManagerMaster
 
   case class GetLocationsMultipleBlockIds(blockIds: Array[BlockId]) extends ToBlockManagerMaster
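
The companion object deleted above only re-implemented what the compiler now generates: making UpdateBlockInfo a case class gives it an apply constructor and an unapply extractor for free. A simplified, self-contained analogue follows; UpdateInfo and its fields are illustrative, not the real Spark types.

// A case class automatically provides a constructor-style apply
// and an extractor (unapply) usable in pattern matching.
case class UpdateInfo(blockId: String, memSize: Long, diskSize: Long)

object CaseClassSketch {
  def main(args: Array[String]): Unit = {
    val msg = UpdateInfo("rdd_0_1", 1024L, 0L)    // compiler-generated apply
    msg match {
      case UpdateInfo(id, mem, disk) =>           // compiler-generated unapply
        println(s"$id: mem=$mem bytes, disk=$disk bytes")
    }
  }
}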

dev/merge_spark_pr.py

Lines changed: 2 additions & 0 deletions
@@ -74,8 +74,10 @@ def fail(msg):
 
 def run_cmd(cmd):
     if isinstance(cmd, list):
+        print " ".join(cmd)
         return subprocess.check_output(cmd)
     else:
+        print cmd
        return subprocess.check_output(cmd.split(" "))

python/pyspark/java_gateway.py

Lines changed: 1 addition & 1 deletion
@@ -39,7 +39,7 @@ def launch_gateway():
     submit_args = os.environ.get("PYSPARK_SUBMIT_ARGS")
     submit_args = submit_args if submit_args is not None else ""
     submit_args = shlex.split(submit_args)
-    command = [os.path.join(SPARK_HOME, script), "pyspark-shell"] + submit_args
+    command = [os.path.join(SPARK_HOME, script)] + submit_args + ["pyspark-shell"]
     if not on_windows:
         # Don't send ctrl-c / SIGINT to the Java gateway:
         def preexec_func():

sql/README.md

Lines changed: 2 additions & 1 deletion
@@ -3,10 +3,11 @@ Spark SQL
 
 This module provides support for executing relational queries expressed in either SQL or a LINQ-like Scala DSL.
 
-Spark SQL is broken up into three subprojects:
+Spark SQL is broken up into four subprojects:
  - Catalyst (sql/catalyst) - An implementation-agnostic framework for manipulating trees of relational operators and expressions.
  - Execution (sql/core) - A query planner / execution engine for translating Catalyst’s logical query plans into Spark RDDs. This component also includes a new public interface, SQLContext, that allows users to execute SQL or LINQ statements against existing RDDs and Parquet files.
  - Hive Support (sql/hive) - Includes an extension of SQLContext called HiveContext that allows users to write queries using a subset of HiveQL and access data from a Hive Metastore using Hive SerDes. There are also wrappers that allow users to run queries that include Hive UDFs, UDAFs, and UDTFs.
+ - HiveServer and CLI support (sql/hive-thriftserver) - Includes support for the SQL CLI (bin/spark-sql) and a HiveServer2 (for JDBC/ODBC) compatible server.
 
 Other dependencies for developers
