671 changes: 19 additions & 652 deletions docs/ml-guide.md

Large diffs are not rendered by default.
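
Note: the 652 lines removed from docs/ml-guide.md are the code listings that were previously pasted into the guide by hand. They are replaced by include_example directives (of the form `{% include_example python/ml/cross_validator.py %}`), which splice in whatever each source file brackets between its `$example on$` and `$example off$` markers at doc-build time. That is what every hunk below adds. The actual extraction is performed by a Jekyll plugin under docs/_plugins; the Python sketch here only illustrates the idea and is not the plugin itself (its name and behavior are assumptions).

```python
# Minimal sketch of what the doc build does with the markers added in this PR.
# Illustrative only: the real extraction lives in a Ruby Jekyll plugin; this
# helper is an assumption, not a Spark API.
import re

def extract_example(source_path):
    """Return only the lines between $example on$ and $example off$ markers."""
    snippet = []
    keep = False
    with open(source_path) as f:
        for line in f:
            if re.search(r"\$example on\$", line):
                keep = True          # opening marker: start collecting
            elif re.search(r"\$example off\$", line):
                keep = False         # closing marker: stop collecting
            elif keep:
                snippet.append(line)
    return "".join(snippet)

if __name__ == "__main__":
    print(extract_example("examples/src/main/python/ml/cross_validator.py"))
```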

6 changes: 4 additions & 2 deletions examples/src/main/java/org/apache/spark/examples/ml/JavaCrossValidatorExample.java
@@ -23,6 +23,7 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
 import org.apache.spark.ml.Pipeline;
 import org.apache.spark.ml.PipelineStage;
 import org.apache.spark.ml.classification.LogisticRegression;
@@ -36,6 +37,7 @@
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
+// $example off$
 
 /**
  * A simple example demonstrating model selection using CrossValidator.
@@ -56,7 +58,7 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaCrossValidatorExample");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext jsql = new SQLContext(jsc);
-
+    // $example on$
     // Prepare training documents, which are labeled.
     List<LabeledDocument> localTraining = Lists.newArrayList(
       new LabeledDocument(0L, "a b c d e spark", 1.0),
@@ -120,7 +122,7 @@ public static void main(String[] args) {
       System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
         + ", prediction=" + r.get(3));
     }
-
+    // $example off$
     jsc.stop();
   }
 }
7 changes: 4 additions & 3 deletions examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleParamsExample.java
@@ -23,6 +23,7 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
 import org.apache.spark.ml.classification.LogisticRegressionModel;
 import org.apache.spark.ml.param.ParamMap;
 import org.apache.spark.ml.classification.LogisticRegression;
@@ -31,7 +32,7 @@
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
-
+// $example off$
 /**
  * A simple example demonstrating ways to specify parameters for Estimators and Transformers.
  * Run with
@@ -45,7 +46,7 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaSimpleParamsExample");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext jsql = new SQLContext(jsc);
-
+    // $example on$
     // Prepare training data.
     // We use LabeledPoint, which is a JavaBean. Spark SQL can convert RDDs of JavaBeans
     // into DataFrames, where it uses the bean metadata to infer the schema.
@@ -106,7 +107,7 @@ public static void main(String[] args) {
      System.out.println("(" + r.get(0) + ", " + r.get(1) + ") -> prob=" + r.get(2)
        + ", prediction=" + r.get(3));
     }
-
+    // $example off$
     jsc.stop();
   }
 }
6 changes: 4 additions & 2 deletions examples/src/main/java/org/apache/spark/examples/ml/JavaSimpleTextClassificationPipeline.java
@@ -23,6 +23,7 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
 import org.apache.spark.ml.Pipeline;
 import org.apache.spark.ml.PipelineModel;
 import org.apache.spark.ml.PipelineStage;
@@ -32,6 +33,7 @@
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.Row;
 import org.apache.spark.sql.SQLContext;
+// $example off$
 
 /**
  * A simple text classification pipeline that recognizes "spark" from input text. It uses the Java
@@ -47,7 +49,7 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaSimpleTextClassificationPipeline");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext jsql = new SQLContext(jsc);
-
+    // $example on$
     // Prepare training documents, which are labeled.
     List<LabeledDocument> localTraining = Lists.newArrayList(
       new LabeledDocument(0L, "a b c d e spark", 1.0),
@@ -87,7 +89,7 @@ public static void main(String[] args) {
       System.out.println("(" + r.get(0) + ", " + r.get(1) + ") --> prob=" + r.get(2)
        + ", prediction=" + r.get(3));
     }
-
+    // $example off$
     jsc.stop();
   }
 }
8 changes: 5 additions & 3 deletions examples/src/main/java/org/apache/spark/examples/ml/JavaTrainValidationSplitExample.java
@@ -19,12 +19,14 @@
 
 import org.apache.spark.SparkConf;
 import org.apache.spark.api.java.JavaSparkContext;
+// $example on$
 import org.apache.spark.ml.evaluation.RegressionEvaluator;
 import org.apache.spark.ml.param.ParamMap;
 import org.apache.spark.ml.regression.LinearRegression;
 import org.apache.spark.ml.tuning.*;
 import org.apache.spark.sql.DataFrame;
 import org.apache.spark.sql.SQLContext;
+// $example off$
 
 /**
  * A simple example demonstrating model selection using TrainValidationSplit.
@@ -43,8 +45,8 @@ public static void main(String[] args) {
     SparkConf conf = new SparkConf().setAppName("JavaTrainValidationSplitExample");
     JavaSparkContext jsc = new JavaSparkContext(conf);
     SQLContext jsql = new SQLContext(jsc);
-
-    DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_libsvm_data.txt");
+    // $example on$
+    DataFrame data = jsql.read().format("libsvm").load("data/mllib/sample_linear_regression_data.txt");
 
     // Prepare training and test data.
     DataFrame[] splits = data.randomSplit(new double [] {0.9, 0.1}, 12345);
@@ -80,7 +82,7 @@ public static void main(String[] args) {
     model.transform(test)
       .select("features", "label", "prediction")
       .show();
-
+    // $example off$
     jsc.stop();
   }
 }
5 changes: 4 additions & 1 deletion examples/src/main/python/ml/cross_validator.py
@@ -18,12 +18,14 @@
 from __future__ import print_function
 
 from pyspark import SparkContext
+# $example on$
 from pyspark.ml import Pipeline
 from pyspark.ml.classification import LogisticRegression
 from pyspark.ml.evaluation import BinaryClassificationEvaluator
 from pyspark.ml.feature import HashingTF, Tokenizer
 from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
 from pyspark.sql import Row, SQLContext
+# $example off$
 
 """
 A simple example demonstrating model selection using CrossValidator.
@@ -36,7 +38,7 @@
 if __name__ == "__main__":
     sc = SparkContext(appName="CrossValidatorExample")
     sqlContext = SQLContext(sc)
-
+    # $example on$
     # Prepare training documents, which are labeled.
     LabeledDocument = Row("id", "text", "label")
     training = sc.parallelize([(0, "a b c d e spark", 1.0),
@@ -92,5 +94,6 @@
     selected = prediction.select("id", "text", "probability", "prediction")
     for row in selected.collect():
         print(row)
+    # $example off$
 
     sc.stop()
8 changes: 5 additions & 3 deletions examples/src/main/python/ml/simple_params_example.py
@@ -17,14 +17,16 @@
 
 from __future__ import print_function
 
+import pprint
 import sys
 
 from pyspark import SparkContext
+# $example on$
 from pyspark.ml.classification import LogisticRegression
 from pyspark.mllib.linalg import DenseVector
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.sql import SQLContext
-import pprint
+# $example off$
 
 """
 A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -38,7 +40,7 @@
         exit(1)
     sc = SparkContext(appName="PythonSimpleParamsExample")
     sqlContext = SQLContext(sc)
-
+    # $example on$
     # prepare training data.
     # We create an RDD of LabeledPoints and convert them into a DataFrame.
     # A LabeledPoint is an Object with two fields named label and features
@@ -94,5 +96,5 @@
     for row in result:
         print("features=%s,label=%s -> prob=%s, prediction=%s"
               % (row.features, row.label, row.myProbability, row.prediction))
-
+    # $example off$
     sc.stop()
4 changes: 4 additions & 0 deletions examples/src/main/python/ml/simple_text_classification_pipeline.py
@@ -14,6 +14,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+# $example on$
 
 from __future__ import print_function
 
@@ -69,3 +70,6 @@
         print(row)
 
     sc.stop()
+
+# $example off$
+
6 changes: 4 additions & 2 deletions examples/src/main/scala/org/apache/spark/examples/ml/CrossValidatorExample.scala
@@ -17,7 +17,7 @@
 
 // scalastyle:off println
 package org.apache.spark.examples.ml
-
+// $example on$
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.classification.LogisticRegression
@@ -26,6 +26,7 @@ import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
 import org.apache.spark.ml.tuning.{CrossValidator, ParamGridBuilder}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.sql.{Row, SQLContext}
+// $example off$
 
 /**
  * A simple example demonstrating model selection using CrossValidator.
@@ -46,7 +47,7 @@ object CrossValidatorExample {
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)
     import sqlContext.implicits._
-
+    // $example on$
     // Prepare training documents, which are labeled.
     val training = sc.parallelize(Seq(
       LabeledDocument(0L, "a b c d e spark", 1.0),
@@ -107,6 +108,7 @@ object CrossValidatorExample {
       .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
         println(s"($id, $text) --> prob=$prob, prediction=$prediction")
       }
+    // $example off$
 
     sc.stop()
   }
5 changes: 4 additions & 1 deletion examples/src/main/scala/org/apache/spark/examples/ml/SimpleParamsExample.scala
@@ -17,13 +17,14 @@
 
 // scalastyle:off println
 package org.apache.spark.examples.ml
-
+// $example on$
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.classification.LogisticRegression
 import org.apache.spark.ml.param.ParamMap
 import org.apache.spark.mllib.linalg.{Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.sql.{Row, SQLContext}
+// $example off$
 
 /**
  * A simple example demonstrating ways to specify parameters for Estimators and Transformers.
@@ -38,6 +39,7 @@ object SimpleParamsExample {
     val conf = new SparkConf().setAppName("SimpleParamsExample")
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)
+    // $example on$
     import sqlContext.implicits._
 
     // Prepare training data.
@@ -97,6 +99,7 @@ object SimpleParamsExample {
       .foreach { case Row(features: Vector, label: Double, prob: Vector, prediction: Double) =>
         println(s"($features, $label) -> prob=$prob, prediction=$prediction")
       }
+    // $example off$
 
     sc.stop()
   }
6 changes: 4 additions & 2 deletions examples/src/main/scala/org/apache/spark/examples/ml/SimpleTextClassificationPipeline.scala
@@ -19,13 +19,14 @@
 package org.apache.spark.examples.ml
 
 import scala.beans.BeanInfo
-
+// $example on$
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.Pipeline
 import org.apache.spark.ml.classification.LogisticRegression
 import org.apache.spark.ml.feature.{HashingTF, Tokenizer}
 import org.apache.spark.mllib.linalg.Vector
 import org.apache.spark.sql.{Row, SQLContext}
+// $example off$
 
 @BeanInfo
 case class LabeledDocument(id: Long, text: String, label: Double)
@@ -47,7 +48,7 @@ object SimpleTextClassificationPipeline {
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)
     import sqlContext.implicits._
-
+    // $example on$
     // Prepare training documents, which are labeled.
     val training = sc.parallelize(Seq(
       LabeledDocument(0L, "a b c d e spark", 1.0),
@@ -86,6 +87,7 @@ object SimpleTextClassificationPipeline {
      .foreach { case Row(id: Long, text: String, prob: Vector, prediction: Double) =>
        println(s"($id, $text) --> prob=$prob, prediction=$prediction")
      }
+    // $example off$
 
     sc.stop()
   }
8 changes: 5 additions & 3 deletions examples/src/main/scala/org/apache/spark/examples/ml/TrainValidationSplitExample.scala
@@ -16,12 +16,13 @@
  */
 
 package org.apache.spark.examples.ml
-
+// $example on$
 import org.apache.spark.{SparkConf, SparkContext}
 import org.apache.spark.ml.evaluation.RegressionEvaluator
 import org.apache.spark.ml.regression.LinearRegression
 import org.apache.spark.ml.tuning.{ParamGridBuilder, TrainValidationSplit}
 import org.apache.spark.sql.SQLContext
+// $example off$
 
 /**
  * A simple example demonstrating model selection using TrainValidationSplit.
@@ -38,9 +39,9 @@ object TrainValidationSplitExample {
     val conf = new SparkConf().setAppName("TrainValidationSplitExample")
     val sc = new SparkContext(conf)
     val sqlContext = new SQLContext(sc)
-
+    // $example on$
     // Prepare training and test data.
-    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_libsvm_data.txt")
+    val data = sqlContext.read.format("libsvm").load("data/mllib/sample_linear_regression_data.txt")
     val Array(training, test) = data.randomSplit(Array(0.9, 0.1), seed = 12345)
 
     val lr = new LinearRegression()
@@ -72,6 +73,7 @@ object TrainValidationSplitExample {
     model.transform(test)
       .select("features", "label", "prediction")
       .show()
+    // $example off$
 
     sc.stop()
   }
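
Since every file touched above must open and close its marker regions consistently for the doc build to pick them up, a quick balance check over examples/src/main is a handy review aid. The checker below is hypothetical, written for this review; it is not part of the PR or of Spark.

```python
# Hypothetical review aid: flag files whose $example on$ / $example off$
# markers do not pair up. Not part of this PR.
import os
import sys

def check_markers(root):
    bad = []
    for dirpath, _, files in os.walk(root):
        for name in files:
            if not name.endswith((".java", ".scala", ".py")):
                continue
            path = os.path.join(dirpath, name)
            depth = 0
            balanced = True
            with open(path) as f:
                for lineno, line in enumerate(f, 1):
                    if "$example on$" in line:
                        depth += 1
                    elif "$example off$" in line:
                        depth -= 1
                    if depth not in (0, 1):   # nested "on" or stray "off"
                        bad.append((path, lineno))
                        balanced = False
                        break
            if balanced and depth != 0:       # region left open at end of file
                bad.append((path, "EOF"))
    return bad

if __name__ == "__main__":
    root = sys.argv[1] if len(sys.argv) > 1 else "examples/src/main"
    for path, where in check_markers(root):
        print("unbalanced markers in %s at %s" % (path, where))
```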