1717
1818package org .apache .spark .ml .clustering
1919
20- import org .apache .hadoop .fs .Path
20+ import org .apache .hadoop .fs .{ FileSystem , Path }
2121
22- import org .apache .spark .annotation .{Experimental , Since }
22+ import org .apache .spark .annotation .{DeveloperApi , Experimental , Since }
2323import org .apache .spark .internal .Logging
2424import org .apache .spark .ml .{Estimator , Model }
2525import org .apache .spark .ml .param ._
2626import org .apache .spark .ml .param .shared .{HasCheckpointInterval , HasFeaturesCol , HasMaxIter , HasSeed }
2727import org .apache .spark .ml .util ._
2828import org .apache .spark .mllib .clustering .{DistributedLDAModel => OldDistributedLDAModel ,
29- EMLDAOptimizer => OldEMLDAOptimizer , LDA => OldLDA , LDAModel => OldLDAModel ,
30- LDAOptimizer => OldLDAOptimizer , LocalLDAModel => OldLocalLDAModel ,
31- OnlineLDAOptimizer => OldOnlineLDAOptimizer }
29+ EMLDAOptimizer => OldEMLDAOptimizer , LDA => OldLDA , LDAModel => OldLDAModel ,
30+ LDAOptimizer => OldLDAOptimizer , LocalLDAModel => OldLocalLDAModel ,
31+ OnlineLDAOptimizer => OldOnlineLDAOptimizer }
32+ import org .apache .spark .mllib .impl .PeriodicCheckpointer
3233import org .apache .spark .mllib .linalg .{Matrix , Vector , Vectors , VectorUDT }
3334import org .apache .spark .rdd .RDD
3435import org .apache .spark .sql .{DataFrame , Row , SQLContext }
@@ -41,6 +42,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
4142
4243 /**
4344 * Param for the number of topics (clusters) to infer. Must be > 1. Default: 10.
45+ *
4446 * @group param
4547 */
4648 @ Since (" 1.6.0" )
@@ -173,6 +175,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
173175 * This uses a variational approximation following Hoffman et al. (2010), where the approximate
174176 * distribution is called "gamma." Technically, this method returns this approximation "gamma"
175177 * for each document.
178+ *
176179 * @group param
177180 */
178181 @ Since (" 1.6.0" )
@@ -191,6 +194,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
191194 * iterations count less.
192195 * This is called "tau0" in the Online LDA paper (Hoffman et al., 2010)
193196 * Default: 1024, following Hoffman et al.
197+ *
194198 * @group expertParam
195199 */
196200 @ Since (" 1.6.0" )
@@ -207,6 +211,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
207211 * This should be between (0.5, 1.0] to guarantee asymptotic convergence.
208212 * This is called "kappa" in the Online LDA paper (Hoffman et al., 2010).
209213 * Default: 0.51, based on Hoffman et al.
214+ *
210215 * @group expertParam
211216 */
212217 @ Since (" 1.6.0" )
@@ -230,6 +235,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
230235 * [[org.apache.spark.mllib.clustering.OnlineLDAOptimizer ]].
231236 *
232237 * Default: 0.05, i.e., 5% of total documents.
238+ *
233239 * @group param
234240 */
235241 @ Since (" 1.6.0" )
@@ -246,6 +252,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
246252 * document-topic distribution) will be optimized during training.
247253 * Setting this to true will make the model more expressive and fit the training data better.
248254 * Default: true
255+ *
249256 * @group expertParam
250257 */
251258 @ Since (" 1.6.0" )
@@ -257,8 +264,32 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
257264 @ Since (" 1.6.0" )
258265 def getOptimizeDocConcentration : Boolean = $(optimizeDocConcentration)
259266
/**
 * Expert param for the EM optimizer: when checkpointing is in use, controls whether the last
 * checkpoint is kept. If false, the checkpoint is deleted. A deleted checkpoint can cause
 * failures if a data partition is lost, so change this setting with care.
 * Checkpoints are cleaned up via reference counting regardless of this setting.
 *
 * Remaining checkpoints can be listed with [[DistributedLDAModel.getCheckpointFiles]] and
 * removed with [[DistributedLDAModel.deleteCheckpointFiles]].
 *
 * Default: true
 *
 * @group expertParam
 */
@Since("2.0.0")
final val keepLastCheckpoint: BooleanParam =
  new BooleanParam(
    this,
    "keepLastCheckpoint",
    "For EM optimizer, if using checkpointing, this indicates whether to keep the last" +
      " checkpoint. If false, then the checkpoint will be deleted. Deleting the checkpoint can" +
      " cause failures if a data partition is lost, so set this bit with care.")
285+
/**
 * Returns the current value of [[keepLastCheckpoint]], falling back to its default.
 *
 * @group expertGetParam
 */
@Since("2.0.0")
def getKeepLastCheckpoint: Boolean = getOrDefault(keepLastCheckpoint)
289+
260290 /**
261291 * Validates and transforms the input schema.
292+ *
262293 * @param schema input schema
263294 * @return output schema
264295 */
@@ -303,6 +334,7 @@ private[clustering] trait LDAParams extends Params with HasFeaturesCol with HasM
303334 .setOptimizeDocConcentration($(optimizeDocConcentration))
304335 case " em" =>
305336 new OldEMLDAOptimizer ()
337+ .setKeepLastCheckpoint($(keepLastCheckpoint))
306338 }
307339}
308340
@@ -341,6 +373,7 @@ sealed abstract class LDAModel private[ml] (
341373 /**
342374 * The features for LDA should be a [[Vector ]] representing the word counts in a document.
343375 * The vector should be of length vocabSize, with counts for each term (word).
376+ *
344377 * @group setParam
345378 */
346379 @ Since (" 1.6.0" )
@@ -619,6 +652,35 @@ class DistributedLDAModel private[ml] (
619652 @ Since (" 1.6.0" )
620653 lazy val logPrior : Double = oldDistributedModel.logPrior
621654
// Checkpoint files reported by the underlying mllib model. This is a var because
// deleteCheckpointFiles() replaces it with an empty array once the files are removed.
private var _checkpointFiles: Array[String] = oldDistributedModel.checkpointFiles

/**
 * If using checkpointing and [[LDA.keepLastCheckpoint]] is set to true, then there may be
 * saved checkpoint files. This method is provided so that users can manage those files.
 *
 * Note that removing the checkpoints can cause failures if a partition is lost and is needed
 * by certain [[DistributedLDAModel]] methods. Reference counting will clean up the checkpoints
 * when this model and derivative data go out of scope.
 *
 * @return Checkpoint files from training; empty after [[deleteCheckpointFiles]] has been called
 */
@DeveloperApi
@Since("2.0.0")
def getCheckpointFiles: Array[String] = _checkpointFiles

/**
 * Remove any remaining checkpoint files from training.
 *
 * The filesystem used for deletion is resolved from each checkpoint path rather than taken
 * from the Hadoop configuration's default filesystem, so files are still removed when the
 * checkpoint directory lives on a non-default filesystem.
 *
 * @see [[getCheckpointFiles]]
 */
@DeveloperApi
@Since("2.0.0")
def deleteCheckpointFiles(): Unit = {
  val hadoopConf = sqlContext.sparkContext.hadoopConfiguration
  _checkpointFiles.foreach { checkpointFile =>
    // FileSystem.get(conf) would return the default filesystem, which may not own this path.
    val fs: FileSystem = new Path(checkpointFile).getFileSystem(hadoopConf)
    PeriodicCheckpointer.removeCheckpointFile(checkpointFile, fs)
  }
  // Forget the files so later calls neither report nor try to delete them again.
  _checkpointFiles = Array.empty[String]
}
683+
/**
 * Returns the [[MLWriter]] for this model: a new `DistributedLDAModel.DistributedWriter`
 * constructed around this instance.
 */
@Since("1.6.0")
override def write: MLWriter = {
  val distributedWriter = new DistributedLDAModel.DistributedWriter(this)
  distributedWriter
}
624686}
@@ -696,11 +758,12 @@ class LDA @Since("1.6.0") (
696758
697759 setDefault(maxIter -> 20 , k -> 10 , optimizer -> " online" , checkpointInterval -> 10 ,
698760 learningOffset -> 1024 , learningDecay -> 0.51 , subsamplingRate -> 0.05 ,
699- optimizeDocConcentration -> true )
761+ optimizeDocConcentration -> true , keepLastCheckpoint -> true )
700762
701763 /**
702764 * The features for LDA should be a [[Vector ]] representing the word counts in a document.
703765 * The vector should be of length vocabSize, with counts for each term (word).
766+ *
704767 * @group setParam
705768 */
706769 @ Since (" 1.6.0" )
@@ -758,6 +821,10 @@ class LDA @Since("1.6.0") (
758821 @ Since (" 1.6.0" )
759822 def setOptimizeDocConcentration (value : Boolean ): this .type = set(optimizeDocConcentration, value)
760823
/**
 * Sets the value of [[keepLastCheckpoint]].
 *
 * @group expertSetParam
 */
@Since("2.0.0")
def setKeepLastCheckpoint(value: Boolean): this.type = {
  set(keepLastCheckpoint, value)
}
827+
/**
 * Creates a copy of this [[LDA]] instance by delegating to `defaultCopy` with the given
 * extra params.
 */
@Since("1.6.0")
override def copy(extra: ParamMap): LDA = {
  defaultCopy(extra)
}
763830
0 commit comments