diff --git a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala index ed04b67bcc93..01895541aeab 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/evaluation/RegressionEvaluator.scala @@ -18,7 +18,7 @@ package org.apache.spark.ml.evaluation import org.apache.spark.annotation.{Experimental, Since} -import org.apache.spark.ml.param.{Param, ParamMap, ParamValidators} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, ParamValidators} import org.apache.spark.ml.param.shared.{HasLabelCol, HasPredictionCol} import org.apache.spark.ml.util.{DefaultParamsReadable, DefaultParamsWritable, Identifiable} import org.apache.spark.mllib.evaluation.RegressionMetrics @@ -69,7 +69,27 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui @Since("1.4.0") def setLabelCol(value: String): this.type = set(labelCol, value) - setDefault(metricName -> "rmse") + /** + * Param for whether to drop rows where 'predictionCol' is NaN. NOTE - only set this to + * true if you are certain that NaN predictions should be ignored! + * (default: false) + * + * @group expertParam + */ + @Since("2.0.0") + val dropNaN: BooleanParam = new BooleanParam(this, "dropNaN", + "whether to drop rows where 'predictionCol' is NaN. NOTE - only set this to true if you are " + + "certain that NaN predictions should be ignored! (default: false)") + + /** @group expertGetParam */ + @Since("2.0.0") + def getDropNaN: Boolean = $(dropNaN) + + /** @group expertSetParam */ + @Since("2.0.0") + def setDropNaN(value: Boolean): this.type = set(dropNaN, value) + + setDefault(metricName -> "rmse", dropNaN -> false) @Since("2.0.0") override def evaluate(dataset: Dataset[_]): Double = { @@ -86,8 +106,9 @@ final class RegressionEvaluator @Since("1.4.0") (@Since("1.4.0") override val ui val predictionAndLabels = dataset .select(col($(predictionCol)).cast(DoubleType), col($(labelCol)).cast(DoubleType)) - .rdd. - map { case Row(prediction: Double, label: Double) => + .na.drop("any", if ($(dropNaN)) Seq($(predictionCol)) else Seq()) + .rdd + .map { case Row(prediction: Double, label: Double) => (prediction, label) } val metrics = new RegressionMetrics(predictionAndLabels) diff --git a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala index 954d3bedc14b..467c03933464 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/evaluation/RegressionEvaluatorSuite.scala @@ -76,6 +76,19 @@ class RegressionEvaluatorSuite assert(evaluator.evaluate(predictions) ~== 0.08399089 absTol 0.01) } + test("support dropping NaNs from prediction column") { + val local = this.sqlContext + import local.implicits._ + val dataset = Seq( + (5.0, 4.0), (1.0, 4.0), (2.0, Double.NaN), (3.0, 1.0), (4.0, Double.NaN) + ).toDF("label", "prediction") + + val evaluator = new RegressionEvaluator() + assert(evaluator.evaluate(dataset).isNaN) + evaluator.setDropNaN(true) + assert(evaluator.evaluate(dataset) ~== 2.16024 absTol 1e-2) + } + test("read/write") { val evaluator = new RegressionEvaluator() .setPredictionCol("myPrediction") diff --git a/python/pyspark/ml/evaluation.py b/python/pyspark/ml/evaluation.py index 455795f9a083..1eefe85a0c19 100644 --- a/python/pyspark/ml/evaluation.py +++ b/python/pyspark/ml/evaluation.py @@ -187,6 +187,13 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol): 0.993... >>> evaluator.evaluate(dataset, {evaluator.metricName: "mae"}) 2.649... + >>> scoreAndLabels = [(4.0, 5.0), (4.0, 1.0), (float('nan'), 2.0), + ... (1.0, 3.0), (float('nan'), 4.0)] + >>> dataset = sqlContext.createDataFrame(scoreAndLabels, ["raw", "label"]) + ... + >>> evaluator = RegressionEvaluator(predictionCol="raw").setDropNaN(True) + >>> evaluator.evaluate(dataset) + 2.160... .. versionadded:: 1.4.0 """ @@ -197,18 +204,24 @@ class RegressionEvaluator(JavaEvaluator, HasLabelCol, HasPredictionCol): "metric name in evaluation (mse|rmse|r2|mae)", typeConverter=TypeConverters.toString) + dropNaN = Param(Params._dummy(), "dropNaN", + "whether to drop rows where 'predictionCol' is NaN. NOTE - only set this to " + + "True if you are certain that NaN predictions should be ignored! " + + "(default: False)", + typeConverter=TypeConverters.toBoolean) + @keyword_only def __init__(self, predictionCol="prediction", labelCol="label", - metricName="rmse"): + metricName="rmse", dropNaN=False): """ __init__(self, predictionCol="prediction", labelCol="label", \ - metricName="rmse") + metricName="rmse", dropNaN=False) """ super(RegressionEvaluator, self).__init__() self._java_obj = self._new_java_obj( "org.apache.spark.ml.evaluation.RegressionEvaluator", self.uid) self._setDefault(predictionCol="prediction", labelCol="label", - metricName="rmse") + metricName="rmse", dropNaN=False) kwargs = self.__init__._input_kwargs self._set(**kwargs) @@ -227,13 +240,28 @@ def getMetricName(self): """ return self.getOrDefault(self.metricName) + @since("2.0.0") + def setDropNaN(self, value): + """ + Sets the value of :py:attr:`dropNaN`. + """ + self._set(dropNaN=value) + return self + + @since("2.0.0") + def getDropNaN(self): + """ + Gets the value of dropNaN or its default value. + """ + return self.getOrDefault(self.dropNaN) + @keyword_only @since("1.4.0") def setParams(self, predictionCol="prediction", labelCol="label", - metricName="rmse"): + metricName="rmse", dropNaN=False): """ setParams(self, predictionCol="prediction", labelCol="label", \ - metricName="rmse") + metricName="rmse", dropNaN=False) Sets params for regression evaluator. """ kwargs = self.setParams._input_kwargs