Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
Original file line number Diff line number Diff line change
Expand Up @@ -38,12 +38,12 @@ import org.apache.spark.sql.types.StructType
private[feature] trait IDFBase extends Params with HasInputCol with HasOutputCol {

/**
* The minimum of documents in which a term should appear.
* The minimum number of documents in which a term should appear.
* Default: 0
* @group param
*/
final val minDocFreq = new IntParam(
this, "minDocFreq", "minimum of documents in which a term should appear for filtering")
this, "minDocFreq", "minimum number of documents in which a term should appear for filtering")

setDefault(minDocFreq -> 0)

Expand Down
5 changes: 3 additions & 2 deletions mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ private[feature] trait PCAParams extends Params with HasInputCol with HasOutputC

/**
* :: Experimental ::
* PCA trains a model to project vectors to a low-dimensional space using PCA.
* PCA trains a model to project vectors to a lower dimensional space of the top [[PCA!.k]]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While you're looking, @holdenk: is the "!" in `[[PCA!.k]]` right? I saw this used somewhere else, and without it the link doesn't seem to be able to find `k`, but I couldn't find anything about it in the Scaladoc documentation.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, I took a look - it seems to generate the correct link, and we use it elsewhere, but it doesn't seem to really do much.

* principal components.
*/
@Experimental
class PCA (override val uid: String) extends Estimator[PCAModel] with PCAParams
Expand Down Expand Up @@ -106,7 +107,7 @@ object PCA extends DefaultParamsReadable[PCA] {

/**
* :: Experimental ::
* Model fitted by [[PCA]].
* Model fitted by [[PCA]]. Transforms vectors to a lower dimensional space.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Trivial, but maybe use a single space between `* Model fitted by [[PCA]].` and `Transforms vectors to a lower dimensional space.`)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, you're right, I should have used a single space. Thanks, @HyukjinKwon!

*
* @param pc A principal components Matrix. Each column is one principal component.
* @param explainedVariance A vector of proportions of variance explained by
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,7 +194,9 @@ object RFormula extends DefaultParamsReadable[RFormula] {

/**
* :: Experimental ::
* A fitted RFormula. Fitting is required to determine the factor levels of formula terms.
* Model fitted by [[RFormula]]. Fitting is required to determine the factor levels of
* formula terms.
*
* @param resolvedFormula the fitted R formula.
* @param pipelineModel the fitted feature model, including factor to index mappings.
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,8 @@ object VectorIndexer extends DefaultParamsReadable[VectorIndexer] {

/**
* :: Experimental ::
* Transform categorical features to use 0-based indices instead of their original values.
* Model fitted by [[VectorIndexer]]. Transform categorical features to use 0-based indices
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here too.

* instead of their original values.
* - Categorical features are mapped to indices.
* - Continuous features (columns) are left unchanged.
* This also appends metadata to the output column, marking features as Numeric (continuous),
Expand Down
25 changes: 19 additions & 6 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
"""

minDocFreq = Param(Params._dummy(), "minDocFreq",
"minimum of documents in which a term should appear for filtering",
"minimum number of documents in which a term should appear for filtering",
typeConverter=TypeConverters.toInt)

@keyword_only
Expand Down Expand Up @@ -1302,7 +1302,8 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable,

minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token length (>= 0)",
typeConverter=TypeConverters.toInt)
gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens")
gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) or matches tokens " +
"(False)")
pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) used for tokenizing",
typeConverter=TypeConverters.toString)
toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert all characters to " +
Expand Down Expand Up @@ -1907,7 +1908,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, Ja
"""
.. note:: Experimental

Class for indexing categorical feature columns in a dataset of [[Vector]].
Class for indexing categorical feature columns in a dataset of `Vector`.

This has 2 usage modes:
- Automatically identify categorical features (default behavior)
Expand Down Expand Up @@ -2025,6 +2026,16 @@ class VectorIndexerModel(JavaModel, JavaMLReadable, JavaMLWritable):

Model fitted by VectorIndexer.

Transform categorical features to use 0-based indices instead of their original values.
- Categorical features are mapped to indices.
- Continuous features (columns) are left unchanged.

This also appends metadata to the output column, marking features as Numeric (continuous),
Nominal (categorical), or Binary (either continuous or categorical).
Non-ML metadata is not carried over from the input to the output column.

This maintains vector sparsity.

.. versionadded:: 1.4.0
"""

Expand Down Expand Up @@ -2327,7 +2338,8 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritab
"""
.. note:: Experimental

PCA trains a model to project vectors to a low-dimensional space using PCA.
PCA trains a model to project vectors to a lower dimensional space of the
top :py:attr:`k` principal components.

>>> from pyspark.ml.linalg import Vectors
>>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
Expand Down Expand Up @@ -2401,7 +2413,7 @@ class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental

Model fitted by PCA.
Model fitted by PCA. Transforms vectors to a lower dimensional space.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And here as well.


.. versionadded:: 1.5.0
"""
Expand Down Expand Up @@ -2532,7 +2544,8 @@ class RFormulaModel(JavaModel, JavaMLReadable, JavaMLWritable):
"""
.. note:: Experimental

Model fitted by :py:class:`RFormula`.
Model fitted by :py:class:`RFormula`. Fitting is required to determine the
factor levels of formula terms.

.. versionadded:: 1.5.0
"""
Expand Down