-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-14489][SPARK-14153][ML][PYSPARK] Support dropping NaN predicted values in RegressionEvaluator #12577
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[SPARK-14489][SPARK-14153][ML][PYSPARK] Support dropping NaN predicted values in RegressionEvaluator #12577
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -18,7 +18,7 @@ | |
| package org.apache.spark.ml.evaluation | ||
|
|
||
| import org.apache.spark.annotation.{Experimental, Since} | ||
| import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} | ||
| import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators} | ||
| import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} | ||
| import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} | ||
| import org.apache.spark.mllib.evaluation.RegressionMetrics | ||
|
|
@@ -69,7 +69,27 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui | |
| @Since("1.4.0") | ||
| def setLabelCol(value: String): this.type = set(labelCol, value) | ||
|
|
||
| setDefault(metricName -> "rmse") | ||
| /** | ||
| * Param for whether to drop rows where 'predictionCol' is NaN. NOTE - only set this to | ||
| * true if you are certain that NaN predictions should be ignored! | ||
| * (default: false) | ||
| * | ||
| * @group expertParam | ||
| */ | ||
| @Since("2.0.0") | ||
| val dropNaN: BooleanParam = new BooleanParam(this, "dropNaN", | ||
| "whether to drop rows where 'predictionCol' is NaN. NOTE - only set this to true if you are " + | ||
| "certain that NaN predictions should be ignored! (default: false)") | ||
|
|
||
| /** @group expertGetParam */ | ||
| @Since("2.0.0") | ||
| def getDropNaN: Boolean = $(dropNaN) | ||
|
|
||
| /** @group expertSetParam */ | ||
| @Since("2.0.0") | ||
| def setDropNaN(value: Boolean): this.type = set(dropNaN, value) | ||
|
|
||
| setDefault(metricName -> "rmse", dropNaN -> false) | ||
|
|
||
| @Since("2.0.0") | ||
| override def evaluate(dataset: Dataset[_]): Double = { | ||
|
|
@@ -86,8 +106,9 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui | |
|
|
||
| val predictionAndLabels = dataset | ||
| .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) | ||
| .rdd. | ||
| map { case Row(prediction: Double, label: Double) => | ||
| .na.drop("any", if ($(dropNaN)) Seq($(predictionCol)) else Seq()) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This also drops
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good point, will add |
||
| .rdd | ||
| .map { case Row(prediction: Double, label: Double) => | ||
| (prediction, label) | ||
| } | ||
| val metrics = new RegressionMetrics(predictionAndLabels) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -187,6 +187,13 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol): | |
| 0.993... | ||
| >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"}) | ||
| 2.649... | ||
| >>> scoreAndLabels = [(4.0, 5.0), (4.0, 1.0), (float('nan'), 2.0), | ||
| ... (1.0, 3.0), (float('nan'), 4.0)] | ||
| >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"]) | ||
| ... | ||
| >>> evaluator = RegressionEvaluator(predictionCol="raw").setDropNaN(True) | ||
| >>> evaluator.evaluate(dataset) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This doesn't quite mirror the Scala test, since the scala test first checks that the result is
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. will do |
||
| 2.160... | ||
|
|
||
| .. versionadded:: 1.4.0 | ||
| """ | ||
|
|
@@ -197,18 +204,24 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol): | |
| "metric name in evaluation (mse|rmse|r2|mae)", | ||
| typeConverter=TypeConverters.toString) | ||
|
|
||
| dropNaN = Param(Params._dummy(), "dropNaN", | ||
| "whether to drop rows where 'predictionCol' is NaN. NOTE - only set this to " + | ||
| "True if you are certain that NaN predictions should be ignored! " + | ||
| "(default: False)", | ||
| typeConverter=TypeConverters.toBoolean) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit: move typeConverter up a line. |
||
|
|
||
| @keyword_only | ||
| def __init__(self, predictionCol="prediction", labelCol="label", | ||
| metricName="rmse"): | ||
| metricName="rmse", dropNaN=False): | ||
| """ | ||
| __init__(self, predictionCol="prediction", labelCol="label", \ | ||
| metricName="rmse") | ||
| metricName="rmse", dropNaN=False) | ||
| """ | ||
| super(RegressionEvaluator, self).__init__() | ||
| self._java_obj = self._new_java_obj( | ||
| "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid) | ||
| self._setDefault(predictionCol="prediction", labelCol="label", | ||
| metricName="rmse") | ||
| metricName="rmse", dropNaN=False) | ||
| kwargs = self.__init__._input_kwargs | ||
| self._set(**kwargs) | ||
|
|
||
|
|
@@ -227,13 +240,28 @@ def getMetricName(self): | |
| """ | ||
| return self.getOrDefault(self.metricName) | ||
|
|
||
| @since("2.0.0") | ||
| def setDropNaN(self, value): | ||
| """ | ||
| Sets the value of :py:attr:`dropNaN`. | ||
| """ | ||
| self._set(dropNaN=value) | ||
| return self | ||
|
|
||
| @since("2.0.0") | ||
| def getDropNaN(self): | ||
| """ | ||
| Gets the value of dropNaN or its default value. | ||
| """ | ||
| return self.getOrDefault(self.dropNaN) | ||
|
|
||
| @keyword_only | ||
| @since("1.4.0") | ||
| def setParams(self, predictionCol="prediction", labelCol="label", | ||
| metricName="rmse"): | ||
| metricName="rmse", dropNaN=False): | ||
| """ | ||
| setParams(self, predictionCol="prediction", labelCol="label", \ | ||
| metricName="rmse") | ||
| metricName="rmse", dropNaN=False) | ||
| Sets params for regression evaluator. | ||
| """ | ||
| kwargs = self.setParams._input_kwargs | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Making this a Boolean parameter called
dropNaNmakes it less extensible in the future if we wish to implement more than just one possible NaN behavior. If we don't envision adding any other behavior then I guess this is good, but otherwise we could make a String param and limit its options to drop or raise an error for now.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let me see how nulls are handled given nullable input columns, and perhaps the possible strategies can be adjusted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Currently if
nullsare presentRegressionEvaluatorthrows aMatchError. I think we should either (a) disallow nullable columns explicitly with a schema check inevaluate- this can then provide a more understandable error message too; (b) allow nulls, but ignore them for both prediction and label col.I think nulls in the input for this case are unlikely and probably a result of bad data or user error somewhere along the line. So I'd prefer option (a). This then means the
dropNaNsetting will only apply to NaNs.