Skip to content

Commit c42fbc7

Browse files
zhengruifeng authored and srowen committed
[SPARK-30398][ML] PCA/RegressionMetrics/RowMatrix avoid unnecessary computation
### What changes were proposed in this pull request?
Use `.ml.Summarizer` instead of `.mllib.MultivariateOnlineSummarizer` to avoid computation of unused metrics.

### Why are the changes needed?
To avoid computation of unused metrics.

### Does this PR introduce any user-facing change?
No.

### How was this patch tested?
Existing test suites.

Closes #27059 from zhengruifeng/pac_summarizer.

Authored-by: zhengruifeng <[email protected]>
Signed-off-by: Sean Owen <[email protected]>
1 parent 4a234dd commit c42fbc7

10 files changed

Lines changed: 384 additions & 345 deletions

File tree

mllib/src/main/scala/org/apache/spark/ml/classification/LinearSVC.scala

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -32,7 +32,7 @@ import org.apache.spark.ml.optim.aggregator.HingeAggregator
3232
import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction}
3333
import org.apache.spark.ml.param._
3434
import org.apache.spark.ml.param.shared._
35-
import org.apache.spark.ml.stat.SummaryBuilderImpl._
35+
import org.apache.spark.ml.stat._
3636
import org.apache.spark.ml.util._
3737
import org.apache.spark.ml.util.Instrumentation.instrumented
3838
import org.apache.spark.sql.{Dataset, Row}
@@ -170,7 +170,7 @@ class LinearSVC @Since("2.2.0") (
170170
regParam, maxIter, fitIntercept, tol, standardization, threshold, aggregationDepth)
171171

172172
val (summarizer, labelSummarizer) = instances.treeAggregate(
173-
(createSummarizerBuffer("mean", "std", "count"), new MultiClassSummarizer))(
173+
(Summarizer.createSummarizerBuffer("mean", "std", "count"), new MultiClassSummarizer))(
174174
seqOp = (c: (SummarizerBuffer, MultiClassSummarizer), instance: Instance) =>
175175
(c._1.add(instance.features, instance.weight), c._2.add(instance.label, instance.weight)),
176176
combOp = (c1: (SummarizerBuffer, MultiClassSummarizer),

mllib/src/main/scala/org/apache/spark/ml/classification/LogisticRegression.scala

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ import org.apache.spark.ml.optim.aggregator.LogisticAggregator
3434
import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction}
3535
import org.apache.spark.ml.param._
3636
import org.apache.spark.ml.param.shared._
37-
import org.apache.spark.ml.stat.SummaryBuilderImpl._
37+
import org.apache.spark.ml.stat._
3838
import org.apache.spark.ml.util._
3939
import org.apache.spark.ml.util.Instrumentation.instrumented
4040
import org.apache.spark.mllib.evaluation.{BinaryClassificationMetrics, MulticlassMetrics}
@@ -501,7 +501,7 @@ class LogisticRegression @Since("1.2.0") (
501501
fitIntercept)
502502

503503
val (summarizer, labelSummarizer) = instances.treeAggregate(
504-
(createSummarizerBuffer("mean", "std", "count"), new MultiClassSummarizer))(
504+
(Summarizer.createSummarizerBuffer("mean", "std", "count"), new MultiClassSummarizer))(
505505
seqOp = (c: (SummarizerBuffer, MultiClassSummarizer), instance: Instance) =>
506506
(c._1.add(instance.features, instance.weight), c._2.add(instance.label, instance.weight)),
507507
combOp = (c1: (SummarizerBuffer, MultiClassSummarizer),

mllib/src/main/scala/org/apache/spark/ml/regression/AFTSurvivalRegression.scala

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,7 +31,7 @@ import org.apache.spark.ml.{Estimator, Model}
3131
import org.apache.spark.ml.linalg.{BLAS, Vector, Vectors, VectorUDT}
3232
import org.apache.spark.ml.param._
3333
import org.apache.spark.ml.param.shared._
34-
import org.apache.spark.ml.stat.SummaryBuilderImpl._
34+
import org.apache.spark.ml.stat._
3535
import org.apache.spark.ml.util._
3636
import org.apache.spark.ml.util.Instrumentation.instrumented
3737
import org.apache.spark.mllib.util.MLUtils
@@ -215,7 +215,7 @@ class AFTSurvivalRegression @Since("1.6.0") (@Since("1.6.0") override val uid: S
215215
if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
216216

217217
val featuresSummarizer = instances.treeAggregate(
218-
createSummarizerBuffer("mean", "std", "count"))(
218+
Summarizer.createSummarizerBuffer("mean", "std", "count"))(
219219
seqOp = (c: SummarizerBuffer, v: AFTPoint) => c.add(v.features),
220220
combOp = (c1: SummarizerBuffer, c2: SummarizerBuffer) => c1.merge(c2),
221221
depth = $(aggregationDepth)

mllib/src/main/scala/org/apache/spark/ml/regression/LinearRegression.scala

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -36,7 +36,7 @@ import org.apache.spark.ml.optim.aggregator.{HuberAggregator, LeastSquaresAggreg
3636
import org.apache.spark.ml.optim.loss.{L2Regularization, RDDLossFunction}
3737
import org.apache.spark.ml.param.{DoubleParam, Param, ParamMap, ParamValidators}
3838
import org.apache.spark.ml.param.shared._
39-
import org.apache.spark.ml.stat.SummaryBuilderImpl._
39+
import org.apache.spark.ml.stat._
4040
import org.apache.spark.ml.util._
4141
import org.apache.spark.ml.util.Instrumentation.instrumented
4242
import org.apache.spark.mllib.evaluation.RegressionMetrics
@@ -358,8 +358,8 @@ class LinearRegression @Since("1.3.0") (@Since("1.3.0") override val uid: String
358358
if (handlePersistence) instances.persist(StorageLevel.MEMORY_AND_DISK)
359359

360360
val (featuresSummarizer, ySummarizer) = instances.treeAggregate(
361-
(createSummarizerBuffer("mean", "std"),
362-
createSummarizerBuffer("mean", "std", "count")))(
361+
(Summarizer.createSummarizerBuffer("mean", "std"),
362+
Summarizer.createSummarizerBuffer("mean", "std", "count")))(
363363
seqOp = (c: (SummarizerBuffer, SummarizerBuffer), instance: Instance) =>
364364
(c._1.add(instance.features, instance.weight),
365365
c._2.add(Vectors.dense(instance.label), instance.weight)),

0 commit comments

Comments (0)