apache · facaiy · Jul 6, 2017 · Jul 6, 2017 · Jul 6, 2017 · Jul 7, 2017
diff --git a/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala b/mllib/src/main/scala/org/apache/spark/ml/classification/OneVsRest.scala
@@ -34,6 +34,7 @@ import org.apache.spark.ml._
 import org.apache.spark.ml.attribute._
 import org.apache.spark.ml.linalg.Vector
 import org.apache.spark.ml.param.{Param, ParamMap, ParamPair, Params}
+import org.apache.spark.ml.param.shared.HasWeightCol
 import org.apache.spark.ml.util._
 import org.apache.spark.sql.{DataFrame, Dataset, Row}
 import org.apache.spark.sql.functions._
@@ -317,7 +318,12 @@ final class OneVsRest @Since("1.4.0") (
     val numClasses = MetadataUtils.getNumClasses(labelSchema).fold(computeNumClasses())(identity)
     instr.logNumClasses(numClasses)
 
-    val multiclassLabeled = dataset.select($(labelCol), $(featuresCol))
+    val multiclassLabeled = getClassifier match {
+      // SPARK-21306: cache weightCol if necessary
+      case c: HasWeightCol if c.isDefined(c.weightCol) && c.getWeightCol.nonEmpty =>
+        dataset.select($(labelCol), $(featuresCol), c.getWeightCol)
+      case _ => dataset.select($(labelCol), $(featuresCol))
+    }
 
     // persist if underlying dataset is not persistent.
     val handlePersistence = dataset.rdd.getStorageLevel == StorageLevel.NONE

diff --git a/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/classification/OneVsRestSuite.scala
@@ -156,6 +156,14 @@ class OneVsRestSuite extends SparkFunSuite with MLlibTestSparkContext with Defau
     assert(output.schema.fieldNames.toSet === Set("label", "features", "prediction"))
   }
 
+  test("SPARK-21306: OneVsRest should cache weightCol if necessary") {
+    // Regression check: fit must succeed when the base classifier reads a weight column.
+    val weightedLr = new LogisticRegression().setWeightCol("weight")
+    val weighted = dataset.withColumn("weight", lit(1))
+    val fitted = new OneVsRest().setClassifier(weightedLr).fit(weighted)
+    assert(fitted !== null)
+  }
+
   test("OneVsRest.copy and OneVsRestModel.copy") {
     val lr = new LogisticRegression()
       .setMaxIter(1)

diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
@@ -1546,7 +1546,12 @@ def _fit(self, dataset):
 
         numClasses = int(dataset.agg({labelCol: "max"}).head()["max("+labelCol+")"]) + 1
 
-        multiclassLabeled = dataset.select(labelCol, featuresCol)
+        if (isinstance(classifier, HasWeightCol)
+                and classifier.isDefined(classifier.weightCol)
+                and classifier.getWeightCol()):
+            multiclassLabeled = dataset.select(labelCol, featuresCol, classifier.getWeightCol())
+        else:
+            multiclassLabeled = dataset.select(labelCol, featuresCol)
 
         # persist if underlying dataset is not persistent.
         handlePersistence = \

diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py
@@ -1255,6 +1255,16 @@ def test_output_columns(self):
         output = model.transform(df)
         self.assertEqual(output.columns, ["label", "features", "prediction"])
 
+    def test_cache_weightCol_if_necessary(self):
+        # Regression check: fit must succeed when the base classifier reads a weight column.
+        rows = [(0.0, Vectors.dense(1.0, 0.8), 1.0),
+                (1.0, Vectors.sparse(2, [], []), 1.0),
+                (2.0, Vectors.dense(0.5, 0.5), 1.0)]
+        df = self.spark.createDataFrame(rows, ["label", "features", "weight"])
+        lr = LogisticRegression(maxIter=5, regParam=0.01, weightCol="weight")
+        model = OneVsRest(classifier=lr).fit(df)
+        self.assertIsNotNone(model)
+
 
 class HashingTFTest(SparkSessionTestCase):