95 changes: 89 additions & 6 deletions python/pyspark/ml/feature.py
@@ -30,10 +30,10 @@
__all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT',
'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'MinMaxScaler',
'MinMaxScalerModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel',
'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer',
'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer',
'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer',
'Word2Vec', 'Word2VecModel']
'PolynomialExpansion', 'QuantileDiscretizer', 'RegexTokenizer', 'RFormula',
'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel',
'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer',
'VectorAssembler', 'VectorIndexer', 'VectorSlicer', 'Word2Vec', 'Word2VecModel']


@inherit_doc
@@ -135,9 +135,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol):
"specified will be treated as errors.")

@keyword_only
def __init__(self, splits=None, inputCol=None, outputCol=None):
def __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None):
Member: Why is _java_model needed? It does not seem to be used.

Contributor Author: Oh yeah, I think the original plan was to avoid the overhead of object creation and sending the params back to the JVM when it is supplied, since we already had a transformer. I'll remove this.
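For context, a minimal sketch of the pattern the author describes: reusing an existing JVM Bucketizer instead of creating a new Java object and re-sending the params over Py4J. This is an illustration of the idea only, not code from the PR:

@keyword_only
def __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None):
    super(Bucketizer, self).__init__()
    if _java_model is not None:
        # Sketch: reuse the JVM Bucketizer that QuantileDiscretizer.fit()
        # already built, skipping object creation and the params round-trip.
        self._java_obj = _java_model
    else:
        self._java_obj = self._new_java_obj(
            "org.apache.spark.ml.feature.Bucketizer", self.uid)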

"""
__init__(self, splits=None, inputCol=None, outputCol=None)
__init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None)
"""
super(Bucketizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
@@ -155,6 +155,7 @@ def __init__(self, splits=None, inputCol=None, outputCol=None):
"provided to cover all Double values; otherwise, values outside the splits " +
"specified will be treated as errors.")
kwargs = self.__init__._input_kwargs
kwargs.pop("_java_model", None)
self.setParams(**kwargs)

@keyword_only
@@ -971,6 +972,88 @@ def getDegree(self):
return self.getOrDefault(self.degree)


@inherit_doc
class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol):
"""
.. note:: Experimental

`QuantileDiscretizer` takes a column with continuous features and outputs a column with binned
categorical features. The bin ranges are chosen by taking a sample of the data and dividing it
into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity,
covering all real values. This attempts to find numBuckets partitions based on a sample of data,
but it may find fewer depending on the data sample values.

>>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
>>> qds = QuantileDiscretizer(numBuckets=2,
... inputCol="values", outputCol="buckets")
>>> bucketizer = qds.fit(df)
>>> splits = bucketizer.getSplits()
>>> splits[0]
-inf
>>> int(splits[1]*10)
Member: This is odd. Can you not just check splits[1]?

Contributor Author: It's a float, so to make the test not flaky and still human-readable for doctests I truncated the split.

Member: How about using round instead?

Contributor Author: Sure :)

Contributor Author: Tried that; it seems to get printed differently, so I'll go back to the int one instead (or just drop it).
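To illustrate the flakiness under discussion (the values here are hypothetical): doctests compare the repr of each result, and float repr rules changed between interpreter versions, so a rounded float can print differently across Pythons while an int always prints the same.

>>> round(0.4, 1)   # repr was '0.40000000000000002' on Python 2.6, '0.4' on 2.7+
0.4
>>> int(0.4 * 10)   # integer repr is identical on every version
4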

4
>>> bucketed = bucketizer.transform(df).collect()
Member: How about head() instead of collect()[0]?
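For comparison, a sketch of the reviewer's suggestion next to the original: both return the first Row, but head() avoids materializing every row on the driver.

>>> bucketizer.transform(df).head().buckets
0.0
>>> bucketizer.transform(df).collect()[0].buckets
0.0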

>>> bucketed[0].buckets
0.0

.. versionadded:: 1.6.0
Contributor: Change it to 2.0.0.

"""

# a placeholder to make it appear in the generated doc
numBuckets = Param(Params._dummy(), "numBuckets",
"Maximum number of buckets (quantiles, or " +
"categories) into which data points are grouped. Must be >= 2.")
Contributor: Should we add a default of 2 here?


@keyword_only
def __init__(self, numBuckets=2, inputCol=None, outputCol=None):
"""
__init__(self, numBuckets=2, inputCol=None, outputCol=None)
"""
super(QuantileDiscretizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
self.uid)
self.numBuckets = Param(self, "numBuckets",
"Maximum number of buckets (quantiles, or " +
"categories) into which data points are grouped. Must be >= 2.")
self._setDefault(numBuckets=2)
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
@since("1.6.0")
def setParams(self, numBuckets=2, inputCol=None, outputCol=None):
"""
setParams(self, numBuckets=2, inputCol=None, outputCol=None)
Set the params for the QuantileDiscertizerBase
Member: typo

"""
kwargs = self.setParams._input_kwargs
return self._set(**kwargs)

@since("1.6.0")
def setNumBuckets(self, value):
"""
Sets the value of :py:attr:`numBuckets`.
"""
self._paramMap[self.numBuckets] = value
return self

@since("1.6.0")
def getNumBuckets(self):
"""
Gets the value of numBuckets or its default value.
"""
return self.getOrDefault(self.numBuckets)

def _create_model(self, java_model):
"""
Private method to convert the java_model to a Python model.
"""
return Bucketizer(splits=list(java_model.getSplits()),
inputCol=self.getInputCol(),
outputCol=self.getOutputCol(),
_java_model=java_model)
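Putting the new estimator together, a minimal end-to-end sketch based on the doctest above (it assumes a SQLContext named sqlContext, and the data and column names are illustrative): fit() computes the splits from a sample of the input column, _create_model() wraps the result as a Bucketizer, and transform() assigns each value to a bucket.

from pyspark.ml.feature import QuantileDiscretizer

df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
qds = QuantileDiscretizer(numBuckets=2, inputCol="values", outputCol="buckets")

bucketizer = qds.fit(df)     # an Estimator: fit() returns a Bucketizer model
bucketizer.getSplits()       # e.g. [-inf, 0.4, inf] for two buckets
bucketizer.transform(df).show()  # adds the "buckets" column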


@inherit_doc
@ignore_unicode_prefix
class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):