[SPARK-18608][ML][FOLLOWUP] Fix double caching for PySpark OneVsRest.

yanboliang · MatthewRBruce · commit ca3e8924dfef · 2018-07-31T10:57:17.000-04:00
## What changes were proposed in this pull request?

apache#19197 fixed double caching for the MLlib algorithms but missed PySpark's `OneVsRest`; this PR fixes it.

## How was this patch tested?

Existing tests.

Author: Yanbo Liang <ybliang8@gmail.com>

Closes apache#19220 from yanboliang/SPARK-18608.

(cherry picked from commit c76153c)
Signed-off-by: Yanbo Liang <ybliang8@gmail.com>
diff --git a/python/pyspark/ml/classification.py b/python/pyspark/ml/classification.py
@@ -1576,8 +1576,7 @@ def _fit(self, dataset):
             multiclassLabeled = dataset.select(labelCol, featuresCol)
 
         # persist if underlying dataset is not persistent.
-        handlePersistence = \
-            dataset.rdd.getStorageLevel() == StorageLevel(False, False, False, False)
+        handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False)
         if handlePersistence:
             multiclassLabeled.persist(StorageLevel.MEMORY_AND_DISK)
 
@@ -1690,8 +1689,7 @@ def _transform(self, dataset):
         newDataset = dataset.withColumn(accColName, initUDF(dataset[origCols[0]]))
 
         # persist if underlying dataset is not persistent.
-        handlePersistence = \
-            dataset.rdd.getStorageLevel() == StorageLevel(False, False, False, False)
+        handlePersistence = dataset.storageLevel == StorageLevel(False, False, False, False)
         if handlePersistence:
             newDataset.persist(StorageLevel.MEMORY_AND_DISK)