make rawPrediction optional

lu-wang-dl · lu-wang-dl · commit 2a47e2be30d5 · 2018-04-12T17:13:12.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -191,6 +191,7 @@ final class OneVsRestModel private[ml] (
         val updateUDF = udf { (predictions: Map[Int, Double], prediction: Vector) =>
           predictions + ((index, prediction(1)))
         }
+
         model.setFeaturesCol($(featuresCol))
         val transformedDataset = model.transform(df).select(columns: _*)
         val updatedDataset = transformedDataset
@@ -206,18 +207,31 @@ final class OneVsRestModel private[ml] (
     }
 
     // output the RawPrediction as vector
-    val rawPredictionUDF = udf { (predictions: Map[Int, Double]) =>
-      Vectors.sparse(numClasses, predictions.toList )
-    }
+    if (getRawPredictionCol != "") {
+      val rawPredictionUDF = udf { (predictions: Map[Int, Double]) =>
+        val myArray = Array.fill[Double](numClasses)(0.0)
+        predictions.foreach { case (idx, value) => myArray(idx) = value }
+        Vectors.dense(myArray)
+      }
 
-    // output the index of the classifier with highest confidence as prediction
-    val labelUDF = udf { (predictions: Vector) => predictions.argmax.toDouble }
+      // output the index of the classifier with highest confidence as prediction
+      val labelUDF = udf { (predictions: Vector) => predictions.argmax.toDouble }
 
-    // output confidence as rwa prediction, label and label metadata as prediction
-    aggregatedDataset
-      .withColumn(getRawPredictionCol, rawPredictionUDF(col(accColName)))
-      .withColumn(getPredictionCol, labelUDF(col(getRawPredictionCol)), labelMetadata)
-      .drop(accColName)
+      aggregatedDataset
+        .withColumn(getRawPredictionCol, rawPredictionUDF(col(accColName)))
+        .withColumn(getPredictionCol, labelUDF(col(getRawPredictionCol)), labelMetadata)
+        .drop(accColName)
+    }
+    else {
+      // output the index of the classifier with highest confidence as prediction
+      val labelUDF = udf { (predictions: Map[Int, Double]) =>
+        predictions.maxBy(_._2)._1.toDouble
+      }
+      // output label and label metadata as prediction (no raw prediction column in this branch)
+      aggregatedDataset
+        .withColumn(getPredictionCol, labelUDF(col(accColName)), labelMetadata)
+        .drop(accColName)
+    }
   }
 
   @Since("1.4.1")
diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -180,10 +180,10 @@ class OneVsRestSuite extends MLTest with DefaultReadWriteTest {
     val dataset2 = dataset.select(col("label").as("y"), col("features").as("fea"))
     ovaModel.setFeaturesCol("fea")
     ovaModel.setPredictionCol("pred")
-    ovaModel.setRawPredictionCol("rawpred")
+    ovaModel.setRawPredictionCol("")
     val transformedDataset = ovaModel.transform(dataset2)
     val outputFields = transformedDataset.schema.fieldNames.toSet
-    assert(outputFields === Set("y", "fea", "pred", "rawpred"))
+    assert(outputFields === Set("y", "fea", "pred"))
   }
 
   test("SPARK-8049: OneVsRest shouldn't output temp columns") {