671 changes: 19 additions & 652 deletions docs/ml-guide.md

Large diffs are not rendered by default.
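
Note: the 652 lines removed from docs/ml-guide.md are the code listings that were previously pasted into the guide by hand. They are replaced by include_example directives (of the form `{% include_example python/ml/cross_validator.py %}`), which splice in whatever each source file brackets between its `$example on$` and `$example off$` markers at doc-build time. That is what every hunk below adds. The actual extraction is performed by a Jekyll plugin under docs/_plugins; the Python sketch here only illustrates the idea and is not the plugin itself (its name and behavior are assumptions).

```python
# Minimal sketch of what the doc build does with the markers added in this PR.
# Illustrative only: the real extraction lives in a Ruby Jekyll plugin; this
# helper is an assumption, not a Spark API.
import re

def extract_example(source_path):
    """Return only the lines between $example on$ and $example off$ markers."""
    snippet = []
    keep = False
    with open(source_path) as f:
        for line in f:
            if re.search(r"\$example on\$", line):
                keep = True          # opening marker: start collecting
            elif re.search(r"\$example off\$", line):
                keep = False         # closing marker: stop collecting
            elif keep:
                snippet.append(line)
    return "".join(snippet)

if __name__ == "__main__":
    print(extract_example("examples/src/main/python/ml/cross_validator.py"))
```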

6 changes: 4 additions & 2 deletions examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
@@ -23,6 +23,7 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
 import org.apache.spark.ml.Pipeline;
 import org.apache.spark.ml.PipelineStage;
 import org.apache.spark.ml.classification.LogisticRegression;
@@ -36,6 +37,7 @@
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
+// $example off$
 
 /**
  * A simple example demonstrating model selection using CrossValidator.
@@ -56,7 +58,7 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaCrossValidatorExample");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext jsql = new SQLContext(jsc);
-
+    // $example on$
     // Prepare training documents, which are labeled.
     List<LabeledDocument> localTraining = Lists.newArrayList(
       new LabeledDocument(0L, "a b c d e spark", 1.0),
@@ -120,7 +122,7 @@ public static void main(String[] args) {
       System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
         + ", prediction=" + r.get(3));
     }
-
+    // $example off$
     jsc.stop();
   }
 }
7 changes: 4 additions & 3 deletions examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -23,6 +23,7 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
 import org.apache.spark.ml.classification.LogisticRegressionModel;
 import org.apache.spark.ml.param.ParamMap;
 import org.apache.spark.ml.classification.LogisticRegression;
@@ -31,7 +32,7 @@
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
-
+// $example off$
 /**
  * A simple example demonstrating ways to specify parameters for Estimators and Transformers.
  * Run with
@@ -45,7 +46,7 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaSimpleParamsExample");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext jsql = new SQLContext(jsc);
-
+    // $example on$
     // Prepare training data.
     // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
     // into DataFrames, where it uses the bean metadata to infer the schema.
@@ -106,7 +107,7 @@ public static void main(String[] args) {
      System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
        + ", prediction=" + r.get(3));
     }
-
+    // $example off$
     jsc.stop();
   }
 }
6 changes: 4 additions & 2 deletions examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -23,6 +23,7 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
 import org.apache.spark.ml.Pipeline;
 import org.apache.spark.ml.PipelineModel;
 import org.apache.spark.ml.PipelineStage;
@@ -32,6 +33,7 @@
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
+// $example off$
 
 /**
  * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
@@ -47,7 +49,7 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext jsql = new SQLContext(jsc);
-
+    // $example on$
     // Prepare training documents, which are labeled.
     List<LabeledDocument> localTraining = Lists.newArrayList(
       new LabeledDocument(0L, "a b c d e spark", 1.0),
@@ -87,7 +89,7 @@ public static void main(String[] args) {
       System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
        + ", prediction=" + r.get(3));
     }
-
+    // $example off$
     jsc.stop();
   }
 }
8 changes: 5 additions & 3 deletions examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
@@ -19,12 +19,14 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
 import org.apache.spark.ml.evaluation.RegressionEvaluator;
 import org.apache.spark.ml.param.ParamMap;
 import org.apache.spark.ml.regression.LinearRegression;
 import org.apache.spark.ml.tuning.*;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.SQLContext;
+// $example off$
 
 /**
  * A simple example demonstrating model selection using TrainValidationSplit.
@@ -43,8 +45,8 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaTrainValidationSplitExample");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext jsql = new SQLContext(jsc);
-
-    DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+    // $example on$
+    DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_linear_regression_data.txt");
 
     // Prepare training and test data.
     DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345);
@@ -80,7 +82,7 @@ public static void main(String[] args) {
     model.transform(test)
       .select("features", "label", "prediction")
       .show();
-
+    // $example off$
     jsc.stop();
   }
 }
5 changes: 4 additions & 1 deletion examples/src/main/python/ml/cross_validator.py
@@ -18,12 +18,14 @@
 from __future__ import print_function
 
 from pyspark import SparkContext
+# $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import LogisticRegression
 from pyspark.ml.evaluation import BinaryClassificationEvaluator
 from pyspark.ml.feature import HashingTF, Tokenizer
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 from pyspark.sql import Row, SQLContext
+# $example off$
 
 """
 A simple example demonstrating model selection using CrossValidator.
@@ -36,7 +38,7 @@
 if __name__ == "__main__":
     sc = SparkContext(appName="CrossValidatorExample")
     sqlContext = SQLContext(sc)
-
+    # $example on$
     # Prepare training documents, which are labeled.
     LabeledDocument = Row("id", "text", "label")
     training = sc.parallelize([(0, "a b c d e spark", 1.0),
@@ -92,5 +94,6 @@
     selected = prediction.select("id", "text", "probability", "prediction")
     for row in selected.collect():
         print(row)
+    # $example off$
 
     sc.stop()
8 changes: 5 additions & 3 deletions examples/src/main/python/ml/simple_params_example.py
@@ -17,14 +17,16 @@
 
 from __future__ import print_function
 
+import pprint
 import sys
 
 from pyspark import SparkContext
+# $example on$
 from pyspark.ml.classification import LogisticRegression
 from pyspark.mllib.linalg import DenseVector
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.sql import SQLContext
-import pprint
+# $example off$
 
 """
 A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -38,7 +40,7 @@
         exit(1)
     sc = SparkContext(appName="PythonSimpleParamsExample")
     sqlContext = SQLContext(sc)
-
+    # $example on$
     # prepare training data.
     # We create an RDD of LabeledPoints and convert them into a DataFrame.
     # A LabeledPoint is an Object with two fields named label and features
@@ -94,5 +96,5 @@
     for row in result:
         print("features=%s,label=%s -> prob=%s, prediction=%s"
               % (row.features, row.label, row.myProbability, row.prediction))
-
+    # $example off$
     sc.stop()
4 changes: 4 additions & 0 deletions examples/src/main/python/ml/simple_text_classification_pipeline.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# $example on$
 
 from __future__ import print_function
 
@@ -69,3 +70,6 @@
         print(row)
 
     sc.stop()
+
+# $example off$
+
6 changes: 4 additions & 2 deletions examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
@@ -17,7 +17,7 @@
 
 // scalastyle:off println
 package org.apache.spark.examples.ml
-
+// $example on$
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.classification.LogisticRegression
@@ -26,6 +26,7 @@ import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
 import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.sql.{Row, SQLContext}
+// $example off$
 
 /**
  * A simple example demonstrating model selection using CrossValidator.
@@ -46,7 +47,7 @@ object CrossValidatorExample {
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)
     import sqlContext.implicits._
-
+    // $example on$
     // Prepare training documents, which are labeled.
     val training = sc.parallelize(Seq(
       LabeledDocument(0L, "a b c d e spark", 1.0),
@@ -107,6 +108,7 @@ object CrossValidatorExample {
       .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
         println(s"($id, $text) --> prob=$prob, prediction=$prediction")
       }
+    // $example off$
 
     sc.stop()
   }
5 changes: 4 additions & 1 deletion examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -17,13 +17,14 @@
 
 // scalastyle:off println
 package org.apache.spark.examples.ml
-
+// $example on$
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.classification.LogisticRegression
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.sql.{Row, SQLContext}
+// $example off$
 
 /**
  * A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -38,6 +39,7 @@ object SimpleParamsExample {
     val conf = new SparkConf().setAppName("SimpleParamsExample")
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)
+    // $example on$
     import sqlContext.implicits._
 
     // Prepare training data.
@@ -97,6 +99,7 @@ object SimpleParamsExample {
       .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
         println(s"($features, $label) -> prob=$prob, prediction=$prediction")
       }
+    // $example off$
 
     sc.stop()
   }
6 changes: 4 additions & 2 deletions examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -19,13 +19,14 @@
 package org.apache.spark.examples.ml
 
 import scala.beans.BeanInfo
-
+// $example on$
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.classification.LogisticRegression
 import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.sql.{Row, SQLContext}
+// $example off$
 
 @BeanInfo
 case class LabeledDocument(id: Long, text: String, label: Double)
@@ -47,7 +48,7 @@ object SimpleTextClassificationPipeline {
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)
     import sqlContext.implicits._
-
+    // $example on$
     // Prepare training documents, which are labeled.
     val training = sc.parallelize(Seq(
       LabeledDocument(0L, "a b c d e spark", 1.0),
@@ -86,6 +87,7 @@ object SimpleTextClassificationPipeline {
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
+    // $example off$
 
     sc.stop()
   }
8 changes: 5 additions & 3 deletions examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
@@ -16,12 +16,13 @@
  */
 
 package org.apache.spark.examples.ml
-
+// $example on$
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.evaluation.RegressionEvaluator
 import org.apache.spark.ml.regression.LinearRegression
 import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
 import org.apache.spark.sql.SQLContext
+// $example off$
 
 /**
  * A simple example demonstrating model selection using TrainValidationSplit.
@@ -38,9 +39,9 @@ object TrainValidationSplitExample {
     val conf = new SparkConf().setAppName("TrainValidationSplitExample")
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)
-
+    // $example on$
     // Prepare training and test data.
-    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")
     val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
 
     val lr = new LinearRegression()
@@ -72,6 +73,7 @@ object TrainValidationSplitExample {
     model.transform(test)
       .select("features", "label", "prediction")
       .show()
+    // $example off$
 
     sc.stop()
   }
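
Since every file touched above must open and close its marker regions consistently for the doc build to pick them up, a quick balance check over examples/src/main is a handy review aid. The checker below is hypothetical, written for this review; it is not part of the PR or of Spark.

```python
# Hypothetical review aid: flag files whose $example on$ / $example off$
# markers do not pair up. Not part of this PR.
import os
import sys

def check_markers(root):
    bad = []
    for dirpath, _, files in os.walk(root):
        for name in files:
            if not name.endswith((".java", ".scala", ".py")):
                continue
            path = os.path.join(dirpath, name)
            depth = 0
            balanced = True
            with open(path) as f:
                for lineno, line in enumerate(f, 1):
                    if "$example on$" in line:
                        depth += 1
                    elif "$example off$" in line:
                        depth -= 1
                    if depth not in (0, 1):   # nested "on" or stray "off"
                        bad.append((path, lineno))
                        balanced = False
                        break
            if balanced and depth != 0:       # region left open at end of file
                bad.append((path, "EOF"))
    return bad

if __name__ == "__main__":
    root = sys.argv[1] if len(sys.argv) > 1 else "examples/src/main"
    for path, where in check_markers(root):
        print("unbalanced markers in %s at %s" % (path, where))
```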