Skip to content
Closed
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
8389280
Mark a number of algorithms and models experimental that are marked t…
holdenk May 5, 2016
1fa57e5
Add the rest
holdenk May 5, 2016
b1ce817
Use mathjax for formula in PyDoc
holdenk May 5, 2016
8125c8c
Switch to math highlighting and update logistic regression get doc sin…
holdenk May 5, 2016
c72fa46
Long line fix
holdenk May 5, 2016
3fd1dce
Start adding the missing params to multi-layer perceptron, also inves…
holdenk May 5, 2016
c7caa43
Or wait we just don't need to support None
holdenk May 5, 2016
4776221
Update the doc string for weights param and add doctest that verifies …
holdenk May 6, 2016
64942b7
Merge in master
holdenk May 9, 2016
2397004
mini fix
holdenk May 10, 2016
130d05f
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk May 10, 2016
a73913b
more pydoc fix
holdenk May 10, 2016
50b41ae
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk May 10, 2016
9e38ddf
Remove flaky doctest component
holdenk May 10, 2016
f4df8f0
Add a : as requested
holdenk May 10, 2016
5df5a93
Merge in master
holdenk May 19, 2016
2eec947
Back out some unrelated changes that are in a separate PR anyways
holdenk May 19, 2016
e11dbf8
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk May 23, 2016
4111b2d
Update scaladoc and PyDoc to both have the correct chain for getThres…
holdenk May 26, 2016
53ab790
pep8
holdenk May 26, 2016
c2c7900
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk Jun 6, 2016
a7aadec
Revert doc change
holdenk Jun 6, 2016
e4061f4
minor fix
holdenk Jun 6, 2016
7b634b6
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk Jun 7, 2016
873f6c8
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk Jun 11, 2016
9fb2e41
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk Jun 12, 2016
74636b1
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk Jun 13, 2016
d925f38
Merge branch 'master' into SPARK-15162-SPARK-15164-update-some-pydocs
holdenk Jun 14, 2016
3981612
oook lets try 86ing mathjax but... welll w/e
holdenk Jun 14, 2016
3d13c6c
reenable mathjax
holdenk Jun 14, 2016
2be8cdf
Revert "[SPARK-15745][SQL] Use classloader's getResource() for readin…
holdenk Jun 14, 2016
4431daa
Support both methods
holdenk Jun 14, 2016
d842309
Revert "Support both methods"
holdenk Jun 21, 2016
de63f9f
Revert "Revert "[SPARK-15745][SQL] Use classloader's getResource() fo…
holdenk Jun 21, 2016
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -75,8 +75,8 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams
* @group expertParam
*/
final val solver: Param[String] = new Param[String](this, "solver",
" Allows setting the solver: minibatch gradient descent (gd) or l-bfgs. " +
" l-bfgs is the default one.",
"Allows setting the solver: minibatch gradient descent (gd) or l-bfgs. " +
"(Default l-bfgs)",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

indentation is off here. Also prefer (Default: l-bfgs)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

sure, kept original indentation but will update.

ParamValidators.inArray[String](Array("gd", "l-bfgs")))

/** @group getParam */
Expand All @@ -88,7 +88,7 @@ private[ml] trait MultilayerPerceptronParams extends PredictorParams
* @group expertParam
*/
final val weights: Param[Vector] = new Param[Vector](this, "weights",
" Sets the weights of the model ")
"Weights (either initial if before training or actual on model)")

/** @group getParam */
final def getWeights: Vector = $(weights)
Expand Down Expand Up @@ -181,7 +181,7 @@ class MultilayerPerceptronClassifier @Since("1.5.0") (
def setSeed(value: Long): this.type = set(seed, value)

/**
* Sets the model weights.
* Sets the initial weights used for the optimizer.
*
* @group expertParam
*/
Expand Down
1 change: 1 addition & 0 deletions python/docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
'sphinx.ext.autodoc',
'sphinx.ext.viewcode',
'epytext',
'sphinx.ext.mathjax',
]

# Add any paths that contain templates here, relative to this directory.
Expand Down
118 changes: 109 additions & 9 deletions python/pyspark/ml/classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti
HasElasticNetParam, HasFitIntercept, HasStandardization, HasThresholds,
HasWeightCol, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

Logistic regression.
Currently, this class only supports binary classification.

Expand Down Expand Up @@ -96,7 +98,8 @@ class LogisticRegression(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredicti

threshold = Param(Params._dummy(), "threshold",
"Threshold in binary classification prediction, in range [0, 1]." +
" If threshold and thresholds are both set, they must match.",
" If threshold and thresholds are both set, they must match." +
"e.g. threshold must be equal to [1-p, p].",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps e.g. if threshold is p, then thresholds must be [1-p, p]?

typeConverter=TypeConverters.toFloat)

@keyword_only
Expand Down Expand Up @@ -154,7 +157,10 @@ def setThreshold(self, value):
@since("1.4.0")
def getThreshold(self):
"""
Gets the value of threshold or its default value.
Gets the value of threshold or attempt to convert thresholds to threshold if set, or default
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I just noticed that both this and Scala doc is inaccurate.

Scala side says:

   * Get threshold for binary classification.
   *
   * If [[threshold]] is set, returns that value.
   * Otherwise, if [[thresholds]] is set with length 2 (i.e., binary classification),
   * this returns the equivalent threshold: {{{1 / (1 + thresholds(0) / thresholds(1))}}}.
   * Otherwise, returns [[threshold]] default value.

But actually, the logic is "if thresholds is set and is length 2, return 1 / (1 + t(0) / t(1) ). Otherwise return threshold or its default value."

Seems to me we should update both Scala and Python doc to reflect this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good, I'll go and update both the docs.

value if neither are set.
This conversion is equivalent to:
:math:`\\frac{1}{1 + \\frac{thresholds(0)}{thresholds(1)}}`.
"""
self._checkThresholdConsistency()
if self.isSet(self.thresholds):
Expand Down Expand Up @@ -183,7 +189,7 @@ def getThresholds(self):
If :py:attr:`thresholds` is set, return its value.
Otherwise, if :py:attr:`threshold` is set, return the equivalent thresholds for binary
classification: (1-threshold, threshold).
If neither are set, throw an error.
If neither are set, return the default value.
"""
self._checkThresholdConsistency()
if not self.isSet(self.thresholds) and self.isSet(self.threshold):
Expand All @@ -208,6 +214,8 @@ def _checkThresholdConsistency(self):

class LogisticRegressionModel(JavaModel, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

Model fitted by LogisticRegression.

.. versionadded:: 1.3.0
Expand Down Expand Up @@ -491,6 +499,8 @@ class DecisionTreeClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
TreeClassifierParams, HasCheckpointInterval, HasSeed, JavaMLWritable,
JavaMLReadable):
"""
.. note:: Experimental

`Decision tree <http://en.wikipedia.org/wiki/Decision_tree_learning>`_
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical
Expand Down Expand Up @@ -585,6 +595,8 @@ def _create_model(self, java_model):
@inherit_doc
class DecisionTreeClassificationModel(DecisionTreeModel, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

Model fitted by DecisionTreeClassifier.

.. versionadded:: 1.4.0
Expand Down Expand Up @@ -618,6 +630,8 @@ class RandomForestClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPred
RandomForestParams, TreeClassifierParams, HasCheckpointInterval,
JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

`Random Forest <http://en.wikipedia.org/wiki/Random_forest>`_
learning algorithm for classification.
It supports both binary and multiclass labels, as well as both continuous and categorical
Expand Down Expand Up @@ -710,6 +724,8 @@ def _create_model(self, java_model):

class RandomForestClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

Model fitted by RandomForestClassifier.

.. versionadded:: 1.4.0
Expand All @@ -736,6 +752,8 @@ class GBTClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol
GBTParams, HasCheckpointInterval, HasStepSize, HasSeed, JavaMLWritable,
JavaMLReadable):
"""
.. note:: Experimental

`Gradient-Boosted Trees (GBTs) <http://en.wikipedia.org/wiki/Gradient_boosting>`_
learning algorithm for classification.
It supports binary labels, as well as both continuous and categorical features.
Expand Down Expand Up @@ -849,6 +867,8 @@ def getLossType(self):

class GBTClassificationModel(TreeEnsembleModels, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

Model fitted by GBTClassifier.

.. versionadded:: 1.4.0
Expand All @@ -874,6 +894,8 @@ def featureImportances(self):
class NaiveBayes(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol, HasProbabilityCol,
HasRawPredictionCol, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

Naive Bayes Classifiers.
It supports both Multinomial and Bernoulli NB. `Multinomial NB
<http://nlp.stanford.edu/IR-book/html/htmledition/naive-bayes-text-classification-1.html>`_
Expand Down Expand Up @@ -992,6 +1014,8 @@ def getModelType(self):

class NaiveBayesModel(JavaModel, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

Model fitted by NaiveBayes.

.. versionadded:: 1.5.0
Expand All @@ -1016,8 +1040,11 @@ def theta(self):

@inherit_doc
class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol, HasPredictionCol,
HasMaxIter, HasTol, HasSeed, JavaMLWritable, JavaMLReadable):
HasMaxIter, HasTol, HasSeed, HasStepSize, JavaMLWritable,
JavaMLReadable):
"""
.. note:: Experimental

Classifier trainer based on the Multilayer Perceptron.
Each layer has sigmoid activation function, output layer has softmax.
Number of inputs has to be equal to the size of feature vectors.
Expand Down Expand Up @@ -1058,6 +1085,14 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
True
>>> model.weights == model2.weights
True
>>> mlp2 = mlp2.setWeights([
... 2, 5, 1, -7, -5, -10, 0, 0.6, -1, 2, -2, 1, 2, -7, -1, -2, 2, 1, -1, 9, -9, 3, -3, -3,
... 3.0, 0, -1])
>>> model3 = mlp2.fit(df)
>>> model3.weights != model2.weights
True
>>> model3.layers == model.layers
True

.. versionadded:: 1.6.0
"""
Expand All @@ -1071,28 +1106,37 @@ class MultilayerPerceptronClassifier(JavaEstimator, HasFeaturesCol, HasLabelCol,
"remaining data in a partition then it is adjusted to the size of this " +
"data. Recommended size is between 10 and 1000, default is 128.",
typeConverter=TypeConverters.toInt)
solver = Param(Params._dummy(), "solver", "Allows setting the solver: minibatch gradient " +
"descent (gd) or l-bfgs. (Default l-bfgs)",
typeConverter=TypeConverters.toString)
weights = Param(Params._dummy(), "weights", "Weights (either initial if before training or " +
"actual on model)", typeConverter=TypeConverters.toVector)

@keyword_only
def __init__(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
solver="l-bfgs", weights=None):
"""
__init__(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, \
solver="l-bfgs", weights=None)
"""
super(MultilayerPerceptronClassifier, self).__init__()
self._java_obj = self._new_java_obj(
"org.apache.spark.ml.classification.MultilayerPerceptronClassifier", self.uid)
self._setDefault(maxIter=100, tol=1E-4, blockSize=128)
self._setDefault(maxIter=100, tol=1E-4, blockSize=128, stepSize=0.03, solver="l-bfgs")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
@since("1.6.0")
def setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction",
maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128):
maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03,
solver="l-bfgs", weights=None):
"""
setParams(self, featuresCol="features", labelCol="label", predictionCol="prediction", \
maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128)
maxIter=100, tol=1e-4, seed=None, layers=None, blockSize=128, stepSize=0.03, \
solver="l-bfgs", weights=None)
Sets params for MultilayerPerceptronClassifier.
"""
kwargs = self.setParams._input_kwargs
Expand Down Expand Up @@ -1129,9 +1173,61 @@ def getBlockSize(self):
"""
return self.getOrDefault(self.blockSize)

@since("2.0.0")
def setStepSize(self, value):
    """
    Updates the :py:attr:`stepSize` param with the given value.

    :param value: step size to be used for the optimizer.
    :return: this estimator, so calls can be chained.
    """
    return self._set(**{"stepSize": value})

@since("2.0.0")
def getStepSize(self):
    """
    Returns the configured :py:attr:`stepSize`, falling back to
    its default value when the param has not been set.
    """
    param = self.stepSize
    return self.getOrDefault(param)

@since("2.0.0")
def setSolver(self, value):
    """
    Updates the :py:attr:`solver` param with the given value.

    :param value: solver name, e.g. ``"gd"`` or ``"l-bfgs"``.
    :return: this estimator, so calls can be chained.
    """
    return self._set(**{"solver": value})

@since("2.0.0")
def getSolver(self):
    """
    Returns the configured :py:attr:`solver`, falling back to
    its default value when the param has not been set.
    """
    param = self.solver
    return self.getOrDefault(param)

@property
@since("2.0.0")
def getOptimizer(self):
    """
    Alias for :py:meth:`getSolver`; returns the optimizer (solver) in use.

    NOTE(review): exposed as a property despite the ``get`` prefix, so
    callers read it as ``estimator.getOptimizer`` without parentheses.
    """
    solver_name = self.getSolver()
    return solver_name

@since("2.0.0")
def setWeights(self, value):
    """
    Updates the :py:attr:`weights` param with the given value.

    :param value: initial weights for the model (converted to a Vector
        by the param's type converter).
    :return: this estimator, so calls can be chained.
    """
    return self._set(**{"weights": value})

@since("2.0.0")
def getWeights(self):
    """
    Returns the configured :py:attr:`weights`, falling back to
    the param's default value when it has not been set.
    """
    param = self.weights
    return self.getOrDefault(param)


class MultilayerPerceptronClassificationModel(JavaModel, JavaMLWritable, JavaMLReadable):
"""
.. note:: Experimental

Model fitted by MultilayerPerceptronClassifier.

.. versionadded:: 1.6.0
Expand Down Expand Up @@ -1181,6 +1277,8 @@ def getClassifier(self):
@inherit_doc
class OneVsRest(Estimator, OneVsRestParams, MLReadable, MLWritable):
"""
.. note:: Experimental

Reduction of Multiclass Classification to Binary Classification.
Performs reduction using one against all strategy.
For a multiclass classification with k classes, train k models (one per class).
Expand Down Expand Up @@ -1335,6 +1433,8 @@ def _to_java(self):

class OneVsRestModel(Model, OneVsRestParams, MLReadable, MLWritable):
"""
.. note:: Experimental

Model fitted by OneVsRest.
This stores the models resulting from training k binary classifiers: one for each class.
Each example is scored against all k models, and the model with the highest score
Expand Down