-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-11922][PYSPARK][ML] Python api for ml.feature.quantile discretizer #10085
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
dbabade
1cacd76
cfb255f
2540101
1145ec4
2afd197
601a9ea
798798c
b44c74d
27a4098
5e18778
f21ebef
d90339a
f9e3086
194ec6d
463aa37
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -30,10 +30,10 @@ | |
| __all__ = ['Binarizer', 'Bucketizer', 'CountVectorizer', 'CountVectorizerModel', 'DCT', | ||
| 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel', 'IndexToString', 'MinMaxScaler', | ||
| 'MinMaxScalerModel', 'NGram', 'Normalizer', 'OneHotEncoder', 'PCA', 'PCAModel', | ||
| 'PolynomialExpansion', 'RegexTokenizer', 'RFormula', 'RFormulaModel', 'SQLTransformer', | ||
| 'StandardScaler', 'StandardScalerModel', 'StopWordsRemover', 'StringIndexer', | ||
| 'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', | ||
| 'Word2Vec', 'Word2VecModel'] | ||
| 'PolynomialExpansion', 'QuantileDiscretizer', 'RegexTokenizer', 'RFormula', | ||
| 'RFormulaModel', 'SQLTransformer', 'StandardScaler', 'StandardScalerModel', | ||
| 'StopWordsRemover', 'StringIndexer', 'StringIndexerModel', 'Tokenizer', | ||
| 'VectorAssembler', 'VectorIndexer', 'VectorSlicer', 'Word2Vec', 'Word2VecModel'] | ||
|
|
||
|
|
||
| @inherit_doc | ||
|
|
@@ -135,9 +135,9 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol): | |
| "specified will be treated as errors.") | ||
|
|
||
| @keyword_only | ||
| def __init__(self, splits=None, inputCol=None, outputCol=None): | ||
| def __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None): | ||
| """ | ||
| __init__(self, splits=None, inputCol=None, outputCol=None) | ||
| __init__(self, splits=None, inputCol=None, outputCol=None, _java_model=None) | ||
| """ | ||
| super(Bucketizer, self).__init__() | ||
| self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid) | ||
|
|
@@ -155,6 +155,7 @@ def __init__(self, splits=None, inputCol=None, outputCol=None): | |
| "provided to cover all Double values; otherwise, values outside the splits " + | ||
| "specified will be treated as errors.") | ||
| kwargs = self.__init__._input_kwargs | ||
| kwargs.pop("_java_model", None) | ||
| self.setParams(**kwargs) | ||
|
|
||
| @keyword_only | ||
|
|
@@ -971,6 +972,88 @@ def getDegree(self): | |
| return self.getOrDefault(self.degree) | ||
|
|
||
|
|
||
| @inherit_doc | ||
| class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol): | ||
| """ | ||
| .. note:: Experimental | ||
|
|
||
| `QuantileDiscretizer` takes a column with continuous features and outputs a column with binned | ||
| categorical features. The bin ranges are chosen by taking a sample of the data and dividing it | ||
| into roughly equal parts. The lower and upper bin bounds will be -Infinity and +Infinity, | ||
| covering all real values. This attempts to find numBuckets partitions based on a sample of data, | ||
| but it may find fewer depending on the data sample values. | ||
|
|
||
| >>> df = sqlContext.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"]) | ||
| >>> qds = QuantileDiscretizer(numBuckets=2, | ||
| ... inputCol="values", outputCol="buckets") | ||
| >>> bucketizer = qds.fit(df) | ||
| >>> splits = bucketizer.getSplits() | ||
| >>> splits[0] | ||
| -inf | ||
| >>> int(splits[1]*10) | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is odd. Can you not just check
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's a float, so to make the test not flaky and still human readable for doctests I truncated the split.
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about using round instead?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Sure :)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Tried that, seems like it gets printed differently, I'll go back to the int one instead (or just drop it). |
||
| 4 | ||
| >>> bucketed = bucketizer.transform(df).collect() | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. How about |
||
| >>> bucketed[0].buckets | ||
| 0.0 | ||
|
|
||
| .. versionadded:: 1.6.0 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. change it to 2.0.0. |
||
| """ | ||
|
|
||
| # a placeholder to make it appear in the generated doc | ||
| numBuckets = Param(Params._dummy(), "numBuckets", | ||
| "Maximum number of buckets (quantiles, or " + | ||
| "categories) into which data points are grouped. Must be >= 2.") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should we add a |
||
|
|
||
| @keyword_only | ||
| def __init__(self, numBuckets=2, inputCol=None, outputCol=None): | ||
| """ | ||
| __init__(self, numBuckets=2, inputCol=None, outputCol=None) | ||
| """ | ||
| super(QuantileDiscretizer, self).__init__() | ||
| self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer", | ||
| self.uid) | ||
| self.numBuckets = Param(self, "numBuckets", | ||
| "Maximum number of buckets (quantiles, or " + | ||
| "categories) into which data points are grouped. Must be >= 2.") | ||
| self._setDefault(numBuckets=2) | ||
| kwargs = self.__init__._input_kwargs | ||
| self.setParams(**kwargs) | ||
|
|
||
| @keyword_only | ||
| @since("1.6.0") | ||
| def setParams(self, numBuckets=2, inputCol=None, outputCol=None): | ||
| """ | ||
| setParams(self, numBuckets=2, inputCol=None, outputCol=None) | ||
| Set the params for the QuantileDiscertizerBase | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. typo |
||
| """ | ||
| kwargs = self.setParams._input_kwargs | ||
| return self._set(**kwargs) | ||
|
|
||
| @since("1.6.0") | ||
| def setNumBuckets(self, value): | ||
| """ | ||
| Sets the value of :py:attr:`numBuckets`. | ||
| """ | ||
| self._paramMap[self.numBuckets] = value | ||
| return self | ||
|
|
||
| @since("1.6.0") | ||
| def getNumBuckets(self): | ||
| """ | ||
| Gets the value of numBuckets or its default value. | ||
| """ | ||
| return self.getOrDefault(self.numBuckets) | ||
|
|
||
| def _create_model(self, java_model): | ||
| """ | ||
| Private method to convert the java_model to a Python model. | ||
| """ | ||
| return Bucketizer(splits=list(java_model.getSplits()), | ||
| inputCol=self.getInputCol(), | ||
| outputCol=self.getOutputCol(), | ||
| _java_model=java_model) | ||
|
|
||
|
|
||
| @inherit_doc | ||
| @ignore_unicode_prefix | ||
| class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol): | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Why is _java_model needed? It does not seem to be used.
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Oh yah, I think the original plan was to avoid the overhead of object creation and sending the params back to the JVM if it is supplied since we already had a transformer. I'll remove this.