@@ -33,6 +33,7 @@ import org.apache.spark.mllib.util.{Loader, Saveable}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.{Row, SparkSession}
 import org.apache.spark.util.BoundedPriorityQueue
+import org.apache.spark.util.Utils
 
 /**
  * Latent Dirichlet Allocation (LDA) model.
@@ -468,7 +469,16 @@ object LocalLDAModel extends Loader[LocalLDAModel] {
       val topics = Range(0, k).map { topicInd =>
         Data(Vectors.dense(topicsDenseMatrix(::, topicInd).toArray), topicInd)
       }
-      spark.createDataFrame(topics).repartition(1).write.parquet(Loader.dataPath(path))
+
+      val bufferSize = Utils.byteStringAsBytes(
+        spark.conf.get("spark.kryoserializer.buffer.max", "64m"))
+      // We calculate the approximate size of the model.
+      // We only count the array size; assuming an
+      // average string size of 15 bytes, the formula is:
+      // (floatSize * vectorSize + 15) * numWords
+      val approxSize = (4L * k + 15) * topicsMatrix.numRows
+      val nPartitions = ((approxSize / bufferSize) + 1).toInt
+      spark.createDataFrame(topics).repartition(nPartitions).write.parquet(Loader.dataPath(path))
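
For a sense of scale, here is the arithmetic this formula produces for a 500000-word, 300-topic model (the same dimensions as the reproduction snippet further down). The numbers are illustrative only and assume the default 64m buffer:

// Illustrative only: dimensions match the simulation snippet below.
val k = 300
val vocabSize = 500000L                    // topicsMatrix.numRows
val bufferSize = 64L * 1024 * 1024         // default spark.kryoserializer.buffer.max
val approxSize = (4L * k + 15) * vocabSize // 607,500,000 bytes, ~580 MB
val nPartitions = ((approxSize / bufferSize) + 1).toInt // = 10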
Member:
The problem is that this writes multiple files. I don't think you can do that.

Author:
Why not? I think it does work. The multiple parquet files may be written in random order, but each row saves its topic index. When you call load, parquet restores the DataFrame; you can check LocalLDAModel's load method, which scans all of the DataFrame's rows and uses the topic indices to rebuild the (topic x vocab) breeze matrix.
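
For reference, the load-side reconstruction being described looks roughly like this (a paraphrase of LocalLDAModel's private load path; exact names and signatures vary across Spark versions):

import breeze.linalg.{DenseMatrix => BDM, DenseVector => BDV}
import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.sql.Row

// Each row carries its topic index, so the order of the parquet files
// (and of the rows within them) does not matter.
val rows = spark.read.parquet(Loader.dataPath(path)).collect()
val vocabSize = rows.head.getAs[Vector](0).size
val brzTopics = BDM.zeros[Double](vocabSize, k)
rows.foreach { case Row(vec: Vector, ind: Int) =>
  brzTopics(::, ind) := new BDV(vec.toArray) // column placed by its saved index
}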

Member:
I believe the point of repartition(1) is to export this to a single file for external consumption. Of course writing it succeeds, but I don't think this is what it is intended to do.

Author (d0evi1, May 20, 2017):

I've read mllib's online LDA implementation and the paper.

I found a simple way to test it. You can try this code snippet, which simulates a matrix to save:

import scala.util.Random
import breeze.linalg.DenseMatrix

val vocabSize = 500000
val k = 300
val random = new Random()
val topicsDenseMatrix = DenseMatrix.fill[Double](vocabSize, k)(random.nextDouble())

Member:
@d0evi1 I'm not sure how that's related to my comment.
@hhbyyh what do you think -- am I wrong about the issue?

Contributor:
I think the PR is trying to do something similar to https://github.com/apache/spark/pull/9989/files.

Contributor:
@srowen what's your concern with saving multiple files instead of one?

We've encountered crippling errors saving large LDA models in the ml API as well. The problem there is even worse, since the entire matrix is treated as a single datum.

Contributor:
Can you add a test case that saves a model into multiple files, loads it back, and checks correctness?
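
A rough sketch of such a round-trip test (a sketch only, not the final test: the LocalLDAModel constructor is private[spark], so this assumes placement in Spark's own LDASuite with an existing sc, and the constructor argument order is approximate; the tiny dimensions here would need to grow, or spark.kryoserializer.buffer.max would need to shrink, before more than one file is actually written):

test("LocalLDAModel save/load across multiple partition files") {
  val k = 3
  val vocabSize = 10
  // Hypothetical model; values are random since only the round trip matters.
  val topics = Matrices.dense(vocabSize, k,
    Array.fill(vocabSize * k)(scala.util.Random.nextDouble()))
  val model = new LocalLDAModel(topics,
    Vectors.dense(Array.fill(k)(1.0 / k)), 1.0, 100.0)

  val tempDir = Utils.createTempDir()
  try {
    val path = tempDir.toURI.toString
    model.save(sc, path)
    val sameModel = LocalLDAModel.load(sc, path)
    // Parquet row order across files is arbitrary; the saved topic
    // indices must still reassemble an identical matrix.
    assert(model.topicsMatrix.toArray === sameModel.topicsMatrix.toArray)
    assert(model.k === sameModel.k)
    assert(model.vocabSize === sameModel.vocabSize)
  } finally {
    Utils.deleteRecursively(tempDir)
  }
}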

}

def load(