apache · jkbradley · Aug 7, 2014 · Aug 7, 2014 · Aug 8, 2014 · Aug 10, 2014
diff --git a/examples/src/main/python/als.py b/examples/src/main/python/als.py
@@ -97,3 +97,5 @@ def update(i, vec, mat, ratings):
         error = rmse(R, ms, us)
         print "Iteration %d:" % i
         print "\nRMSE: %5.4f\n" % error
+
+    sc.stop()
diff --git a/examples/src/main/python/cassandra_inputformat.py b/examples/src/main/python/cassandra_inputformat.py
@@ -77,3 +77,5 @@
     output = cass_rdd.collect()
     for (k, v) in output:
         print (k, v)
+
+    sc.stop()
diff --git a/examples/src/main/python/cassandra_outputformat.py b/examples/src/main/python/cassandra_outputformat.py
@@ -81,3 +81,5 @@
         conf=conf,
         keyConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLKeyConverter",
         valueConverter="org.apache.spark.examples.pythonconverters.ToCassandraCQLValueConverter")
+
+    sc.stop()
diff --git a/examples/src/main/python/hbase_inputformat.py b/examples/src/main/python/hbase_inputformat.py
@@ -71,3 +71,5 @@
     output = hbase_rdd.collect()
     for (k, v) in output:
         print (k, v)
+
+    sc.stop()
diff --git a/examples/src/main/python/hbase_outputformat.py b/examples/src/main/python/hbase_outputformat.py
@@ -63,3 +63,5 @@
         conf=conf,
         keyConverter="org.apache.spark.examples.pythonconverters.StringToImmutableBytesWritableConverter",
         valueConverter="org.apache.spark.examples.pythonconverters.StringListToPutConverter")
+
+    sc.stop()
diff --git a/examples/src/main/python/kmeans.py b/examples/src/main/python/kmeans.py
@@ -77,3 +77,5 @@ def closestPoint(p, centers):
             kPoints[x] = y
 
     print "Final centers: " + str(kPoints)
+
+    sc.stop()
diff --git a/examples/src/main/python/logistic_regression.py b/examples/src/main/python/logistic_regression.py
@@ -80,3 +80,5 @@ def add(x, y):
         w -= points.map(lambda m: gradient(m, w)).reduce(add)
 
     print "Final w: " + str(w)
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/decision_tree_runner.py b/examples/src/main/python/mllib/decision_tree_runner.py
@@ -17,6 +17,8 @@
 
 """
 Decision tree classification and regression using MLlib.
+
+This example requires NumPy (http://www.numpy.org/).
 """
 
 import numpy, os, sys
@@ -117,6 +119,7 @@ def usage():
     if len(sys.argv) == 2:
         dataPath = sys.argv[1]
     if not os.path.isfile(dataPath):
+        sc.stop()
         usage()
     points = MLUtils.loadLibSVMFile(sc, dataPath)
 
@@ -131,3 +134,5 @@ def usage():
     print "  Model depth: %d\n" % model.depth()
     print "  Training accuracy: %g\n" % getAccuracy(model, reindexedData)
     print model
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/kmeans.py b/examples/src/main/python/mllib/kmeans.py
@@ -42,3 +42,4 @@ def parseVector(line):
     k = int(sys.argv[2])
     model = KMeans.train(data, k)
     print "Final centers: " + str(model.clusterCenters)
+    sc.stop()
diff --git a/examples/src/main/python/mllib/logistic_regression.py b/examples/src/main/python/mllib/logistic_regression.py
@@ -50,3 +50,4 @@ def parsePoint(line):
     model = LogisticRegressionWithSGD.train(points, iterations)
     print "Final weights: " + str(model.weights)
     print "Final intercept: " + str(model.intercept)
+    sc.stop()
diff --git a/examples/src/main/python/mllib/random_and_sampled_rdds.py b/examples/src/main/python/mllib/random_and_sampled_rdds.py
@@ -0,0 +1,88 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Randomly generated and sampled RDDs.
+"""
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.mllib.random import RandomRDDGenerators
+from pyspark.mllib.util import MLUtils
+
+
+
+if __name__ == "__main__":
+    if len(sys.argv) not in [1, 2]:
+        print >> sys.stderr, "Usage: logistic_regression <libsvm data file>"
+        exit(-1)
+    if len(sys.argv) == 2:
+        datapath = sys.argv[1]
+    else:
+        datapath = 'data/mllib/sample_binary_classification_data.txt'
+
+    sc = SparkContext(appName="PythonRandomAndSampledRDDs")
+
+    points = MLUtils.loadLibSVMFile(sc, datapath)
+
+    numExamples = 10000 # number of examples to generate
+    fraction = 0.1 # fraction of data to sample
+
+    # Example: RandomRDDGenerators
+    normalRDD = RandomRDDGenerators.normalRDD(sc, numExamples)
+    print 'Generated RDD of %d examples sampled from a unit normal distribution' % normalRDD.count()
+    normalVectorRDD = RandomRDDGenerators.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
+    print 'Generated RDD of %d examples of length-2 vectors.' % normalVectorRDD.count()
+
+    print ''
+
+    # Example: RDD.sample() and RDD.takeSample()
+    exactSampleSize = int(numExamples * fraction)
+    print 'Sampling RDD using fraction %g.  Expected sample size = %d.' \
+        % (fraction, exactSampleSize)
+    sampledRDD = normalRDD.sample(withReplacement = True, fraction = fraction)
+    print '  RDD.sample(): sample has %d examples' % sampledRDD.count()
+    sampledArray = normalRDD.takeSample(withReplacement = True, num = exactSampleSize)
+    print '  RDD.takeSample(): sample has %d examples' % len(sampledArray)
+
+    print ''
+
+    # Example: RDD.sampleByKey()
+    examples = MLUtils.loadLibSVMFile(sc, datapath)
+    sizeA = examples.count()
+    print 'Loaded data with %d examples from file: %s' % (sizeA, datapath)
+    keyedRDD = examples.map(lambda lp: (int(lp.label), lp.features))
+    print '  Keyed data using label (Int) as key ==> Orig'
+    #  Count examples per label in original data.
+    keyCountsA = keyedRDD.countByKey()
+    #  Subsample, and count examples per label in sampled data.
+    fractions = {}
+    for k in keyCountsA.keys():
+        fractions[k] = fraction
+    sampledByKeyRDD = \
+        keyedRDD.sampleByKey(withReplacement = True, fractions = fractions)#, exact = True)
+    keyCountsB = sampledByKeyRDD.countByKey()
+    sizeB = sum(keyCountsB.values())
+    print '  Sampled %d examples using approximate stratified sampling (by label). ==> Sample' \
+        % sizeB
+    print '   \tFractions of examples with key'
+    print 'Key\tOrig\tSample'
+    for k in sorted(keyCountsA.keys()):
+        print '%d\t%g\t%g' % (k, keyCountsA[k] / float(sizeA), keyCountsB[k] / float(sizeB))
+
+    sc.stop()
diff --git a/examples/src/main/python/mllib/statistical_summary.py b/examples/src/main/python/mllib/statistical_summary.py
@@ -0,0 +1,60 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Statistical summarization using MLlib.
+"""
+
+import sys
+
+from pyspark import SparkContext
+from pyspark.mllib.regression import LabeledPoint
+from pyspark.mllib.stat import Statistics
+from pyspark.mllib.util import MLUtils
+
+
+if __name__ == "__main__":
+    if len(sys.argv) not in [1,2]:
+        print >> sys.stderr, "Usage: statistical_summary (<file>)"
+        exit(-1)
+    sc = SparkContext(appName="PythonStatisticalSummary")
+    if len(sys.argv) == 2:
+        filepath = sys.argv[1]
+    else:
+        filepath = 'data/mllib/sample_linear_regression_data.txt'
+    corrType = 'pearson'
+
+    points = MLUtils.loadLibSVMFile(sc, filepath)\
+        .map(lambda lp: LabeledPoint(lp.label, lp.features.toDense()))
+
+    print ''
+    print 'Summary of data file: ' + filepath
+    print '%d data points' % points.count()
+
+    # Statistics (correlations)
+    print ''
+    print 'Correlation (%s) between label and each feature' % corrType
+    print 'Feature\tCorrelation'
+    numFeatures = points.take(1)[0].features.size
+    labelRDD = points.map(lambda lp: lp.label)
+    for i in range(numFeatures):
+        featureRDD = points.map(lambda lp: lp.features[i])
+        corr = Statistics.corr(labelRDD, featureRDD, corrType)
+        print '%d\t%g' % (i, corr)
+    print ''
+
+    sc.stop()
diff --git a/examples/src/main/python/pagerank.py b/examples/src/main/python/pagerank.py
@@ -68,3 +68,5 @@ def parseNeighbors(urls):
     # Collects all URL ranks and dump them to console.
     for (link, rank) in ranks.collect():
         print "%s has rank: %s." % (link, rank)
+
+    sc.stop()
diff --git a/examples/src/main/python/pi.py b/examples/src/main/python/pi.py
@@ -37,3 +37,5 @@ def f(_):
 
     count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
     print "Pi is roughly %f" % (4.0 * count / n)
+
+    sc.stop()
diff --git a/examples/src/main/python/sort.py b/examples/src/main/python/sort.py
@@ -34,3 +34,5 @@
     output = sortedCount.collect()
     for (num, unitcount) in output:
         print num
+
+    sc.stop()
diff --git a/examples/src/main/python/transitive_closure.py b/examples/src/main/python/transitive_closure.py
@@ -64,3 +64,5 @@ def generateGraph():
             break
 
     print "TC has %i edges" % tc.count()
+
+    sc.stop()
diff --git a/examples/src/main/python/wordcount.py b/examples/src/main/python/wordcount.py
@@ -33,3 +33,5 @@
     output = counts.collect()
     for (word, count) in output:
         print "%s: %i" % (word, count)
+
+    sc.stop()
diff --git a/examples/src/main/scala/org/apache/spark/examples/mllib/RandomAndSampledRDDs.scala b/examples/src/main/scala/org/apache/spark/examples/mllib/RandomAndSampledRDDs.scala
@@ -0,0 +1,110 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.examples.mllib
+
+import org.apache.spark.mllib.random.RandomRDDGenerators
+import org.apache.spark.mllib.util.MLUtils
+import org.apache.spark.rdd.RDD
+import scopt.OptionParser
+
+import org.apache.spark.{SparkConf, SparkContext}
+import org.apache.spark.SparkContext._
+
+/**
+ * An example app for randomly generated and sampled RDDs. Run with
+ * {{{
+ * bin/run-example org.apache.spark.examples.mllib.RandomAndSampledRDDs
+ * }}}
+ * If you use it as a template to create your own app, please use `spark-submit` to submit your app.
+ */
+object RandomAndSampledRDDs {
+  case class Params(input: String = "data/mllib/sample_binary_classification_data.txt")
+
+  val defaultParams = Params()
+
+  val parser = new OptionParser[Params]("RandomAndSampledRDDs") {
+    head("RandomAndSampledRDDs: an example app for randomly generated and sampled RDDs.")
+    opt[String]("input")
+      .text(s"Input path to labeled examples in LIBSVM format, default: ${defaultParams.input}")
+      .action((x, c) => c.copy(input = x))
+    note(
+      """
+        |For example, the following command runs this app:
+        |
+        | bin/spark-submit --class org.apache.spark.examples.mllib.RandomAndSampledRDDs \
+        |  examples/target/scala-*/spark-examples-*.jar
+      """.stripMargin)
+  }
+
+  // Explicit main(): the Spark docs warn that subclasses of scala.App may not work correctly.
+  def main(args: Array[String]) {
+    parser.parse(args, defaultParams).map { params =>
+      run(params)
+    } getOrElse {
+      sys.exit(1)
+    }
+  }
+
+  /** Creates a SparkContext, prints the three sampling demos, then stops the context. */
+  def run(params: Params) {
+    val conf = new SparkConf().setAppName(s"RandomAndSampledRDDs with $params")
+    val sc = new SparkContext(conf)
+
+    val numExamples = 10000 // number of examples to generate
+    val fraction = 0.1 // fraction of data to sample
+
+    // Example: RandomRDDGenerators
+    val normalRDD: RDD[Double] = RandomRDDGenerators.normalRDD(sc, numExamples)
+    println(s"Generated RDD of ${normalRDD.count()} examples sampled from a unit normal distribution")
+    val normalVectorRDD =
+      RandomRDDGenerators.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
+    println(s"Generated RDD of ${normalVectorRDD.count()} examples of length-2 vectors.")
+    println()
+
+    // Example: RDD.sample() and RDD.takeSample()
+    val exactSampleSize = (numExamples * fraction).toInt
+    println(s"Sampling RDD using fraction $fraction.  Expected sample size = $exactSampleSize.")
+    val sampledRDD = normalRDD.sample(withReplacement = true, fraction = fraction)
+    println(s"  RDD.sample(): sample has ${sampledRDD.count()} examples")
+    val sampledArray = normalRDD.takeSample(withReplacement = true, num = exactSampleSize)
+    println(s"  RDD.takeSample(): sample has ${sampledArray.size} examples")
+    println()
+
+    // Example: RDD.sampleByKey()
+    val examples = MLUtils.loadLibSVMFile(sc, params.input)
+    val sizeA = examples.count()
+    println(s"Loaded data with $sizeA examples from file: ${params.input}")
+    val keyedRDD = examples.map { lp => (lp.label.toInt, lp.features) }
+    println(s"  Keyed data using label (Int) as key ==> Orig")
+    //  Count examples per label in original data.
+    val keyCountsA = keyedRDD.countByKey()
+    //  Subsample, and count examples per label in sampled data.
+    val fractions = keyCountsA.keys.map((_, fraction)).toMap
+    val sampledByKeyRDD =
+      keyedRDD.sampleByKey(withReplacement = true, fractions = fractions, exact = true)
+    val keyCountsB = sampledByKeyRDD.countByKey()
+    val sizeB = keyCountsB.values.sum
+    println(s"  Sampled $sizeB examples using exact stratified sampling (by label). ==> Sample")
+    println(s"   \tFractions of examples with key")
+    println(s"Key\tOrig\tSample")
+    keyCountsA.keys.toSeq.sorted.foreach { key =>
+      println(s"$key\t${keyCountsA(key) / sizeA.toDouble}\t${keyCountsB(key) / sizeB.toDouble}")
+    }
+    sc.stop()
+  }
+}
Original file line number	Diff line number	Diff line change
Expand Up		@@ -77,3 +77,5 @@ def closestPoint(p, centers):
		kPoints[x] = y

		print "Final centers: " + str(kPoints)

		sc.stop()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -80,3 +80,5 @@ def add(x, y):
		w -= points.map(lambda m: gradient(m, w)).reduce(add)

		print "Final w: " + str(w)

		sc.stop()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -37,3 +37,5 @@ def f(_):

		count = sc.parallelize(xrange(1, n+1), slices).map(f).reduce(add)
		print "Pi is roughly %f" % (4.0 * count / n)

		sc.stop()
Original file line number	Diff line number	Diff line change
Expand Up		@@ -64,3 +64,5 @@ def generateGraph():
		break

		print "TC has %i edges" % tc.count()

		sc.stop()