[SPARK-28159][ML] Make the transform natively in ml framework to avoid extra conversion #24963
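For context, the conversion this PR avoids looks roughly like the following. This is an illustrative sketch, not code from the diff: `ConversionExample` and the variable names are hypothetical, while `Vectors.fromML` is the real mllib conversion helper.

```scala
import org.apache.spark.ml.linalg.{Vectors => MLVectors}
import org.apache.spark.mllib.linalg.{Vectors => OldVectors}

object ConversionExample {
  def main(args: Array[String]): Unit = {
    // A feature vector as produced by the .ml (DataFrame-based) API.
    val mlVec = MLVectors.sparse(5, Array(1, 3), Array(2.0, 4.0))
    // Before this change, reaching an .mllib implementation from .ml code
    // meant converting every vector across the two APIs first:
    val oldVec = OldVectors.fromML(mlVec)
    println(oldVec)
    // This PR adds overloads that accept ml.linalg.Vector directly
    // (aliased as NewVector in the diff below), skipping that conversion.
  }
}
```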
File: mllib/src/main/scala/org/apache/spark/mllib/clustering/LDAOptimizer.scala

@@ -27,6 +27,8 @@ import org.apache.spark.annotation.{DeveloperApi, Since}
 import org.apache.spark.graphx._
 import org.apache.spark.graphx.util.PeriodicGraphCheckpointer
 import org.apache.spark.internal.Logging
+import org.apache.spark.ml.linalg.{DenseVector => NewDenseVector,
+  SparseVector => NewSparseVector, Vector => NewVector}
 import org.apache.spark.mllib.linalg.{DenseVector, Matrices, SparseVector, Vector, Vectors}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.storage.StorageLevel
@@ -609,26 +611,23 @@ private[clustering] object OnlineLDAOptimizer {
    * statistics for updating lambda and `ids` - list of termCounts vector indices.
    */
   private[clustering] def variationalTopicInference(
-      termCounts: Vector,
+      indices: List[Int],
+      values: Array[Double],
       expElogbeta: BDM[Double],
       alpha: breeze.linalg.Vector[Double],
       gammaShape: Double,
       k: Int,
       seed: Long): (BDV[Double], BDM[Double], List[Int]) = {
-    val (ids: List[Int], cts: Array[Double]) = termCounts match {
-      case v: DenseVector => ((0 until v.size).toList, v.values)
-      case v: SparseVector => (v.indices.toList, v.values)
-    }
     // Initialize the variational distribution q(theta|gamma) for the mini-batch
     val randBasis = new RandBasis(new org.apache.commons.math3.random.MersenneTwister(seed))
     val gammad: BDV[Double] =
       new Gamma(gammaShape, 1.0 / gammaShape)(randBasis).samplesVector(k) // K
     val expElogthetad: BDV[Double] = exp(LDAUtils.dirichletExpectation(gammad)) // K
-    val expElogbetad = expElogbeta(ids, ::).toDenseMatrix // ids * K
+    val expElogbetad = expElogbeta(indices, ::).toDenseMatrix // ids * K

     val phiNorm: BDV[Double] = expElogbetad * expElogthetad +:+ 1e-100 // ids
     var meanGammaChange = 1D
-    val ctsVector = new BDV[Double](cts) // ids
+    val ctsVector = new BDV[Double](values) // ids

     // Iterate between gamma and phi until convergence
     while (meanGammaChange > 1e-3) {
@@ -642,6 +641,34 @@ private[clustering] object OnlineLDAOptimizer {
     }

     val sstatsd = expElogthetad.asDenseMatrix.t * (ctsVector /:/ phiNorm).asDenseMatrix
-    (gammad, sstatsd, ids)
+    (gammad, sstatsd, indices)
   }
+
+  private[clustering] def variationalTopicInference(
+      termCounts: Vector,
+      expElogbeta: BDM[Double],
+      alpha: breeze.linalg.Vector[Double],
+      gammaShape: Double,
+      k: Int,
+      seed: Long): (BDV[Double], BDM[Double], List[Int]) = {
+    val (ids: List[Int], cts: Array[Double]) = termCounts match {
+      case v: DenseVector => ((0 until v.size).toList, v.values)
+      case v: SparseVector => (v.indices.toList, v.values)
+    }
+    variationalTopicInference(ids, cts, expElogbeta, alpha, gammaShape, k, seed)
+  }
+
+  private[clustering] def variationalTopicInference(
+      termCounts: NewVector,
+      expElogbeta: BDM[Double],
+      alpha: breeze.linalg.Vector[Double],
+      gammaShape: Double,
+      k: Int,
+      seed: Long): (BDV[Double], BDM[Double], List[Int]) = {
+    val (ids: List[Int], cts: Array[Double]) = termCounts match {
+      case v: NewDenseVector => ((0 until v.size).toList, v.values)
+      case v: NewSparseVector => (v.indices.toList, v.values)
+    }
+    variationalTopicInference(ids, cts, expElogbeta, alpha, gammaShape, k, seed)
+  }
 }
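For illustration, the (indices, values) decomposition that the new `NewVector` overload performs can be exercised on its own. A minimal sketch: `MlVectorDecompose` and `decompose` are hypothetical names; only the match logic mirrors the diff above.

```scala
import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vector, Vectors}

object MlVectorDecompose {
  // Same decomposition as the NewVector overload: a term-count vector
  // becomes (active indices, active values).
  def decompose(termCounts: Vector): (List[Int], Array[Double]) = termCounts match {
    case v: DenseVector => ((0 until v.size).toList, v.values)
    case v: SparseVector => (v.indices.toList, v.values)
  }

  def main(args: Array[String]): Unit = {
    val (ids, cts) = decompose(Vectors.sparse(4, Array(0, 2), Array(1.0, 3.0)))
    println(s"ids=$ids cts=${cts.toList}") // ids=List(0, 2) cts=List(1.0, 3.0)
  }
}
```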
File: mllib/src/main/scala/org/apache/spark/mllib/feature/ChiSqSelector.scala

@@ -25,6 +25,8 @@ import org.json4s.jackson.JsonMethods._

 import org.apache.spark.SparkContext
 import org.apache.spark.annotation.Since
+import org.apache.spark.ml.linalg.{DenseVector => NewDenseVector,
+  SparseVector => NewSparseVector, Vector => NewVector, Vectors => NewVectors}
 import org.apache.spark.mllib.linalg.{DenseVector, SparseVector, Vector, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
 import org.apache.spark.mllib.stat.Statistics
@@ -75,40 +77,61 @@ class ChiSqSelectorModel @Since("1.3.0") (
   private def compress(features: Vector): Vector = {
     features match {
       case SparseVector(size, indices, values) =>
-        val newSize = filterIndices.length
-        val newValues = new ArrayBuilder.ofDouble
-        val newIndices = new ArrayBuilder.ofInt
-        var i = 0
-        var j = 0
-        var indicesIdx = 0
-        var filterIndicesIdx = 0
-        while (i < indices.length && j < filterIndices.length) {
-          indicesIdx = indices(i)
-          filterIndicesIdx = filterIndices(j)
-          if (indicesIdx == filterIndicesIdx) {
-            newIndices += j
-            newValues += values(i)
-            j += 1
-            i += 1
-          } else {
-            if (indicesIdx > filterIndicesIdx) {
-              j += 1
-            } else {
-              i += 1
-            }
-          }
-        }
-        // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
-        Vectors.sparse(newSize, newIndices.result(), newValues.result())
+        val (newIndices, newValues) = compressSparse(indices, values)
+        Vectors.sparse(filterIndices.length, newIndices, newValues)
       case DenseVector(values) =>
-        val values = features.toArray
-        Vectors.dense(filterIndices.map(i => values(i)))
+        Vectors.dense(compressDense(values))
       case other =>
         throw new UnsupportedOperationException(
           s"Only sparse and dense vectors are supported but got ${other.getClass}.")
     }
   }
+
+  private[spark] def compress(features: NewVector): NewVector = {
+    features match {
+      case NewSparseVector(size, indices, values) =>
+        val (newIndices, newValues) = compressSparse(indices, values)
+        NewVectors.sparse(filterIndices.length, newIndices, newValues)
+      case NewDenseVector(values) =>
+        NewVectors.dense(compressDense(values))
+      case other =>
+        throw new UnsupportedOperationException(
+          s"Only sparse and dense vectors are supported but got ${other.getClass}.")
+    }
+  }
+
+  private def compressSparse(indices: Array[Int],
+      values: Array[Double]): (Array[Int], Array[Double]) = {
+    val newValues = new ArrayBuilder.ofDouble
+    val newIndices = new ArrayBuilder.ofInt
+    var i = 0
+    var j = 0
+    var indicesIdx = 0
+    var filterIndicesIdx = 0
+    while (i < indices.length && j < filterIndices.length) {
+      indicesIdx = indices(i)
+      filterIndicesIdx = filterIndices(j)
+      if (indicesIdx == filterIndicesIdx) {
+        newIndices += j
+        newValues += values(i)
+        j += 1
+        i += 1
+      } else {
+        if (indicesIdx > filterIndicesIdx) {
+          j += 1
+        } else {
+          i += 1
+        }
+      }
+    }
+    // TODO: Sparse representation might be ineffective if (newSize ~= newValues.size)
+    (newIndices.result(), newValues.result())
+  }
+
+  private def compressDense(values: Array[Double]): Array[Double] = {
+    filterIndices.map(i => values(i))
+  }

   @Since("1.6.0")
   override def save(sc: SparkContext, path: String): Unit = {
     ChiSqSelectorModel.SaveLoadV1_0.save(sc, this, path)
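The extracted `compressSparse` is a two-pointer merge over the vector's active indices and the sorted `filterIndices`: values whose index survives the filter are kept and re-indexed into the compressed space. A standalone rendering of that merge, runnable without the enclosing model class (the object and parameter names are illustrative):

```scala
import scala.collection.mutable.ArrayBuilder

object CompressSparseExample {
  // Walk the vector's active indices and the sorted filterIndices in
  // lock-step; keep each value whose index is selected, recorded at its
  // position j within the filtered (compressed) vector.
  def compressSparse(filterIndices: Array[Int], indices: Array[Int],
      values: Array[Double]): (Array[Int], Array[Double]) = {
    val newIndices = new ArrayBuilder.ofInt
    val newValues = new ArrayBuilder.ofDouble
    var i = 0
    var j = 0
    while (i < indices.length && j < filterIndices.length) {
      if (indices(i) == filterIndices(j)) {
        newIndices += j // position within the compressed vector
        newValues += values(i)
        i += 1
        j += 1
      } else if (indices(i) > filterIndices(j)) {
        j += 1 // selected feature is absent (implicitly zero)
      } else {
        i += 1 // active feature was filtered out
      }
    }
    (newIndices.result(), newValues.result())
  }

  def main(args: Array[String]): Unit = {
    // Keep features 1 and 3 of a vector with active entries at 1, 2, 3.
    val (idx, vals) =
      compressSparse(Array(1, 3), Array(1, 2, 3), Array(10.0, 20.0, 30.0))
    println((idx.toList, vals.toList)) // (List(0, 1), List(10.0, 30.0))
  }
}
```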
Review comment:

Ah OK, I think we don't want to import .ml vectors in .mllib here. But the method below is only used in .ml now. Just move it to .ml.clustering.LDAModel with your changes?