@@ -19,12 +19,10 @@ package org.apache.spark.mllib.clustering
1919
2020import scala .collection .mutable .IndexedSeq
2121
22- import breeze .linalg .{diag , DenseMatrix => BreezeMatrix , DenseVector => BDV , SparseVector => BSV ,
23- Transpose , Vector => BV }
22+ import breeze .linalg .{diag , DenseMatrix => BreezeMatrix , DenseVector => BDV , Vector => BV }
2423
2524import org .apache .spark .annotation .Experimental
26- import org .apache .spark .mllib .linalg .{BLAS , DenseVector , DenseMatrix , Matrices ,
27- SparseVector , Vector , Vectors }
25+ import org .apache .spark .mllib .linalg .{BLAS , DenseMatrix , Matrices , Vector , Vectors }
2826import org .apache .spark .mllib .stat .distribution .MultivariateGaussian
2927import org .apache .spark .mllib .util .MLUtils
3028import org .apache .spark .rdd .RDD
@@ -43,7 +41,11 @@ import org.apache.spark.util.Utils
4341 * less than convergenceTol, or until it has reached the max number of iterations.
4442 * While this process is generally guaranteed to converge, it is not guaranteed
4543 * to find a global optimum.
46- *
44+ *
45+ * Note: For high-dimensional data (with many features), this algorithm may perform poorly.
46+ * This is because high-dimensional data (a) is difficult to cluster at all (based on
47+ * statistical/theoretical arguments) and (b) causes numerical issues with Gaussian distributions.
48+ *
4749 * @param k The number of independent Gaussians in the mixture model
4850 * @param convergenceTol The maximum change in log-likelihood at which convergence
4951 * is considered to have occurred.
0 commit comments