Skip to content
Closed
67 changes: 57 additions & 10 deletions python/pyspark/ml/feature.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,21 +158,28 @@ class Bucketizer(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, Jav
"splits specified will be treated as errors.",
typeConverter=TypeConverters.toListFloat)

handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " +
"Options are skip (filter out rows with invalid values), " +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can we put the options in single quotes, e.g. "Options are 'skip' ..."

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@techaddict I don't think you addressed this comment?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

To be fair we don't have it quoted in the scala param description, so if we want to make this change we should probably also change it in the scala side just for consistencies sake.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah it's pretty minor. Maybe we can do it later in a follow up

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Cool, since we've already cut RC1 and it would be nice to have these params in sooner rather than later and @techaddict seems to be a bit busy I've created a follow up JIRA ( SPARK-18628 ) for this so that we can maybe move ahead with this as is.

"error (throw an error), or keep (keep invalid values in a special " +
"additional bucket).",
typeConverter=TypeConverters.toString)

@keyword_only
def __init__(self, splits=None, inputCol=None, outputCol=None):
def __init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"):
"""
__init__(self, splits=None, inputCol=None, outputCol=None)
__init__(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error")
"""
super(Bucketizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Bucketizer", self.uid)
self._setDefault(handleInvalid="error")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
@since("1.4.0")
def setParams(self, splits=None, inputCol=None, outputCol=None):
def setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Missing handleInvalid in doc string below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

"""
setParams(self, splits=None, inputCol=None, outputCol=None)
setParams(self, splits=None, inputCol=None, outputCol=None, handleInvalid="error")
Sets params for this Bucketizer.
"""
kwargs = self.setParams._input_kwargs
Expand All @@ -192,6 +199,20 @@ def getSplits(self):
"""
return self.getOrDefault(self.splits)

@since("2.1.0")
def setHandleInvalid(self):
"""
Sets the value of :py:attr:`handleInvalid`.
"""
return self._set(handleInvalid=value)

@since("2.1.0")
def getHandleInvalid(self):
"""
Gets the value of :py:attr:`handleInvalid` or its default value.
"""
return self.getOrDefault(self.handleInvalid)


@inherit_doc
class CountVectorizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable):
Expand Down Expand Up @@ -1163,9 +1184,11 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadab

>>> df = spark.createDataFrame([(0.1,), (0.4,), (1.2,), (1.5,)], ["values"])
>>> qds = QuantileDiscretizer(numBuckets=2,
... inputCol="values", outputCol="buckets", relativeError=0.01)
... inputCol="values", outputCol="buckets", relativeError=0.01, handleInvalid="error")
>>> qds.getRelativeError()
0.01
>>> qds.getHandleInvalid()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We didn't add anything to the doctest of bucketizer. Actually, I think it would be nice in both places to set handleInvalid='skip' and then add an invalid value to the example data. That way we can show what we mean by invalid and prove that it works.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

good idea! adding

'error'
>>> bucketizer = qds.fit(df)
>>> splits = bucketizer.getSplits()
>>> splits[0]
Expand Down Expand Up @@ -1194,23 +1217,33 @@ class QuantileDiscretizer(JavaEstimator, HasInputCol, HasOutputCol, JavaMLReadab
"Must be in the range [0, 1].",
typeConverter=TypeConverters.toFloat)

handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. " +
"Options are skip (filter out rows with invalid values), " +
"error (throw an error), or keep (keep invalid values in a special " +
"additional bucket).",
typeConverter=TypeConverters.toString)

@keyword_only
def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
def __init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001,
handleInvalid="error"):
"""
__init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
__init__(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \
handleInvalid="error")
"""
super(QuantileDiscretizer, self).__init__()
self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.QuantileDiscretizer",
self.uid)
self._setDefault(numBuckets=2, relativeError=0.001)
self._setDefault(numBuckets=2, relativeError=0.001, handleInvalid="error")
kwargs = self.__init__._input_kwargs
self.setParams(**kwargs)

@keyword_only
@since("2.0.0")
def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001):
def setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001,
handleInvalid="error"):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

missing handleInvalid in doc string below.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed

"""
setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001)
setParams(self, numBuckets=2, inputCol=None, outputCol=None, relativeError=0.001, \
handleInvalid="error")
Set the params for the QuantileDiscretizer
"""
kwargs = self.setParams._input_kwargs
Expand Down Expand Up @@ -1244,6 +1277,20 @@ def getRelativeError(self):
"""
return self.getOrDefault(self.relativeError)

@since("2.1.0")
def setHandleInvalid(self):
"""
Sets the value of :py:attr:`handleInvalid`.
"""
return self._set(handleInvalid=value)

@since("2.1.0")
def getHandleInvalid(self):
"""
Gets the value of :py:attr:`handleInvalid` or its default value.
"""
return self.getOrDefault(self.handleInvalid)

def _create_model(self, java_model):
"""
Private method to convert the java_model to a Python model.
Expand Down