Skip to content

Commit 16ba96e

Browse files
author
jose.cambronero
committed
further PR feedback from @sryza. Modified tests now that certain critical functions are package private and can be tested more directly
1 parent 748e9d3 commit 16ba96e

File tree

2 files changed

+70
-38
lines changed

2 files changed

+70
-38
lines changed

mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala

Lines changed: 27 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,25 @@ import org.apache.spark.rdd.RDD
4646
* partition, we can collect and operate locally. Locally, we can now adjust each distance by the
4747
* appropriate constant (the cumulative sum of number of elements in the prior partitions divided by
4848
* thedata set size). Finally, we take the maximum absolute value, and this is the statistic.
49+
*
50+
* In the case of the 2-sample variant, the approach is slightly different. We calculate 2
51+
* empirical CDFs corresponding to the distribution under sample 1 and under sample 2. Within each
52+
* partition, we can calculate the maximum difference of the local empirical CDFs, which is off from
53+
* the global value by some constant. Similarly to the 1-sample variant, we can simply adjust this
54+
* difference once we have collected the possible candidate extrema. However, in this case we don't
55+
* collect the number of elements in a partition, but rather an adjustment constant, that we can
56+
* cumulatively sum once we've collected results on the driver, and that when divided by
57+
* |sample 1| * |sample 2| provides the adjustment necessary to the difference between the 2
58+
* empirical CDFs in a given partition and thus the adjustment necessary to the potential extrema
59+
* candidates. The constant that we collect per partition thus corresponds to
60+
* |sample 2| * |sample 1 in partition| - |sample 1| * |sample 2 in partition|.
4961
*/
5062
private[stat] object KolmogorovSmirnovTest extends Logging {
5163

5264
// Null hypothesis for the type of KS test to be included in the result.
5365
object NullHypothesis extends Enumeration {
5466
type NullHypothesis = Value
55-
val OneSampleTwoSided = Value("Sample follows theoretical distribution")
67+
val OneSampleTwoSided = Value("Sample follows the theoretical distribution")
5668
val TwoSampleTwoSided = Value("Both samples follow the same distribution")
5769
}
5870

@@ -218,7 +230,8 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
218230

219231
/**
220232
* Calculates maximum distance candidates and counts of elements from each sample within one
221-
* partition for the two-sample, two-sided Kolmogorov-Smirnov test implementation
233+
* partition for the two-sample, two-sided Kolmogorov-Smirnov test implementation. Function
234+
* is package private for testing convenience.
222235
* @param partData `Iterator[(Double, Boolean)]` the data in 1 partition of the co-sorted RDDs,
223236
* each element is additionally tagged with a boolean flag for sample 1 membership
224237
* @param n1 `Double` sample 1 size
@@ -235,24 +248,27 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
235248
* portion that is attributable to each partition so that following partitions can
236249
* use it to cumulatively adjust their values.
237250
*/
238-
private def searchTwoSampleCandidates(
251+
private[stat] def searchTwoSampleCandidates(
239252
partData: Iterator[(Double, Boolean)],
240253
n1: Double,
241254
n2: Double): Iterator[(Double, Double, Double)] = {
242255
// fold accumulator: local minimum, local maximum, index for sample 1, index for sample2
243-
case class ExtremaAndIndices(min: Double, max: Double, ix1: Int, ix2: Int)
244-
val initAcc = ExtremaAndIndices(Double.MaxValue, Double.MinValue, 0, 0)
256+
case class ExtremaAndRunningIndices(min: Double, max: Double, ix1: Int, ix2: Int)
257+
val initAcc = ExtremaAndRunningIndices(Double.MaxValue, Double.MinValue, 0, 0)
245258
// traverse the data in the partition and calculate distances and counts
246259
val pResults = partData.foldLeft(initAcc) { case (acc, (v, isSample1)) =>
247260
val (add1, add2) = if (isSample1) (1, 0) else (0, 1)
248261
val cdf1 = (acc.ix1 + add1) / n1
249262
val cdf2 = (acc.ix2 + add2) / n2
250263
val dist = cdf1 - cdf2
251-
ExtremaAndIndices(
264+
ExtremaAndRunningIndices(
252265
math.min(acc.min, dist),
253266
math.max(acc.max, dist),
254-
acc.ix1 + add1, acc.ix2 + add2)
267+
acc.ix1 + add1, acc.ix2 + add2
268+
)
255269
}
270+
// If partition has no data, then pResults will match the fold accumulator
271+
// we must filter this out to avoid having the statistic spoiled by the accumulation values
256272
val results = if (pResults == initAcc) {
257273
Array[(Double, Double, Double)]()
258274
} else {
@@ -263,14 +279,15 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
263279

264280
/**
265281
* Adjust candidate extremes by the appropriate constant. The resulting maximum corresponds to
266-
* the two-sample, two-sided Kolmogorov-Smirnov test
282+
* the two-sample, two-sided Kolmogorov-Smirnov test. Function is package private for testing
283+
* convenience.
267284
* @param localData `Array[(Double, Double, Double)]` contains the candidate extremes from each
268285
* partition, along with the numerator for the necessary constant adjustments
269286
* @param n `Double` The denominator in the constant adjustment (i.e. (size of sample 1 ) * (size
270287
* of sample 2))
271288
* @return The two-sample, two-sided Kolmogorov-Smirnov statistic
272289
*/
273-
private def searchTwoSampleStatistic(localData: Array[(Double, Double, Double)], n: Double)
290+
private[stat] def searchTwoSampleStatistic(localData: Array[(Double, Double, Double)], n: Double)
274291
: Double = {
275292
// maximum distance and numerator for constant adjustment
276293
val initAcc = (Double.MinValue, 0.0)
@@ -282,7 +299,7 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
282299
val dist2 = math.abs(maxCand + adjConst)
283300
val maxVal = Array(prevMax, dist1, dist2).max
284301
(maxVal, prevCt + ct)
285-
}
302+
}
286303
results._1
287304
}
288305

mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala

Lines changed: 43 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ import org.apache.commons.math3.stat.inference.{KolmogorovSmirnovTest => CommonM
2626
import org.apache.spark.{SparkException, SparkFunSuite}
2727
import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Vectors}
2828
import org.apache.spark.mllib.regression.LabeledPoint
29-
import org.apache.spark.mllib.stat.test.ChiSqTest
29+
import org.apache.spark.mllib.stat.test.{ChiSqTest, KolmogorovSmirnovTest}
3030
import org.apache.spark.mllib.util.MLlibTestSparkContext
3131
import org.apache.spark.mllib.util.TestingUtils._
3232

@@ -352,52 +352,67 @@ class HypothesisTestSuite extends SparkFunSuite with MLlibTestSparkContext {
352352
assert(kSCompResult.pValue ~== rKSPval relTol 1e-2)
353353
}
354354

355-
test("2 sample Kolmogorov-Smirnov test: partitions with no data") {
355+
test("2 sample Kolmogorov-Smirnov test: helper functions in case partitions have no data") {
356356
// we use the R data provided in the prior test
357-
// We request a number of partitions larger than the number of elements in the data sets
358-
// wich
359-
val rData1 = sc.parallelize(
360-
Array(
357+
// Once we have combined and sorted we partitino with a larger number than
358+
// the number of elements to guarantee we have empty partitions.
359+
// We test various critical package private functions in this circumstance.
360+
val rData1 = Array(
361361
1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
362362
-0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
363363
-0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
364364
-0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
365365
0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
366-
), 40)
366+
)
367367

368-
val rData2 = sc.parallelize(
369-
Array(
368+
val rData2 = Array(
370369
0.236687367712904, -0.144440226694072, 0.722229700806146, 0.369906857410192,
371370
-0.242066314481781, -1.47206331842053, -0.596159545765696, -1.1467001312186,
372371
-2.47463643305885, -0.613508578410268, -0.216311514038102, 1.5901457684867,
373372
1.55614327565194, 1.10845089348356, -1.09734184488477, -1.86060571637755,
374373
-0.913578847977252, 1.24556891198713, 0.0878547183607045, 0.423481895050245
375-
), 40)
374+
)
375+
376+
377+
val n1 = rData1.length
378+
val n2 = rData2.length
379+
val unioned = (rData1.map((_, true)) ++ rData2.map((_, false))).sortBy(_._1)
380+
val parallel = sc.parallelize(unioned, 100)
381+
// verify that there are empty partitions
382+
assert(parallel.mapPartitions(x => Array(x.size).iterator).collect().contains(0))
383+
val localExtrema = parallel.mapPartitions(
384+
KolmogorovSmirnovTest.searchTwoSampleCandidates(_, n1, n2)
385+
).collect()
386+
val ksCompStat = KolmogorovSmirnovTest.searchTwoSampleStatistic(localExtrema, n1 * n2)
376387

377388
val rKSStat = 0.15
378-
val rKSPval = 0.9831
379-
val kSCompResult = Statistics.kolmogorovSmirnovTest2Sample(rData1, rData2)
380-
assert(kSCompResult.statistic ~== rKSStat relTol 1e-4)
389+
assert(ksCompStat ~== rKSStat relTol 1e-4)
381390
}
382391

383-
test("2 sample Kolmogorov-Smirnov test: partitions with just data from one sample") {
384-
// Creating 2 samples that don't overlap, so we are guaranteed to have some partitions
385-
// that only include values from sample 1 and some that only include values from sample 2
386-
val n = 1000
387-
val nonOverlap1L = (1 to n).toArray.map(_.toDouble)
388-
val nonOverlap2L = (n + 1 to 2 * n).toArray.map(_.toDouble)
389-
val nonOverlap1P = sc.parallelize(nonOverlap1L, 20)
390-
val nonOverlap2P = sc.parallelize(nonOverlap2L, 20)
392+
test("2 sample Kolmogorov-Smirnov test: helper functions in case partitions have only 1 sample") {
393+
// Creating 2 samples that don't overlap and request a large number of partitions to guarantee
394+
// that there will be partitions with only data from 1 sample. We test critical helper
395+
// functions in these circumstances.
396+
val n = 100
397+
val lower = (1 to n).toArray.map(_.toDouble)
398+
val upper = (1 to n).toArray.map(n + _.toDouble * 100)
399+
400+
val unioned = (lower.map((_, true)) ++ upper.map((_, false))).sortBy(_._1)
401+
val parallel = sc.parallelize(unioned, 200)
402+
// verify that there is at least 1 partition with only 1 sample
403+
assert(parallel.mapPartitions(x =>
404+
Array(x.toArray.map(_._1).distinct.length).iterator
405+
).collect().contains(1)
406+
)
407+
val localExtrema = parallel.mapPartitions(
408+
KolmogorovSmirnovTest.searchTwoSampleCandidates(_, n, n)
409+
).collect()
410+
val ksCompStat = KolmogorovSmirnovTest.searchTwoSampleStatistic(localExtrema, n * n)
391411

392412
// Use apache math commons local KS test to verify calculations
393413
val ksTest = new CommonMathKolmogorovSmirnovTest()
394-
val pThreshold = 0.05
395414

396-
val result4 = Statistics.kolmogorovSmirnovTest2Sample(nonOverlap1P, nonOverlap2P)
397-
val refStat4 = ksTest.kolmogorovSmirnovStatistic(nonOverlap1L, nonOverlap2L)
398-
val refP4 = ksTest.kolmogorovSmirnovTest(nonOverlap1L, nonOverlap2L)
399-
assert(result4.statistic ~== refStat4 relTol 1e-3)
400-
assert(result4.pValue ~== refP4 relTol 1e-3)
401-
assert(result4.pValue < pThreshold) // reject H0
415+
val refStat4 = ksTest.kolmogorovSmirnovStatistic(lower, upper)
416+
assert(ksCompStat ~== refStat4 relTol 1e-3)
402417
}
403418
}

0 commit comments

Comments
 (0)