further PR feedback from @sryza. Modified tests now that certain critical functions are package private and can be tested more directly

jose.cambronero · jose.cambronero · commit 16ba96e00c58 · 2015-08-03T19:10:00.000-07:00
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala b/mllib/src/main/scala/org/apache/spark/mllib/stat/test/KolmogorovSmirnovTest.scala
@@ -46,13 +46,25 @@ import org.apache.spark.rdd.RDD
  * partition, we can collect and operate locally. Locally, we can now adjust each distance by the
  * appropriate constant (the cumulative sum of number of elements in the prior partitions divided by
  * thedata set size). Finally, we take the maximum absolute value, and this is the statistic.
+ *
+ * In the case of the 2-sample variant, the approach is slightly different. We calculate 2
+ * empirical CDFs corresponding to the distribution under sample 1 and under sample 2. Within each
+ * partition, we can calculate the maximum difference of the local empirical CDFs, which is off from
+ * the global value by some constant. Similarly to the 1-sample variant, we can simply adjust this
+ * difference once we have collected the possible candidate extrema. However, in this case we don't
+ * collect the number of elements in a partition, but rather an adjustment constant, that we can
+ * cumulatively sum once we've collected results on the driver, and that when divided by
+ * |sample 1| * |sample 2| provides the adjustment necessary to the difference between the 2
+ * empirical CDFs in a given partition and thus the adjustment necessary to the potential extrema
+ * candidates. The constant that we collect per partition thus corresponds to
+ * |sample 2| * |sample 1 in partition| - |sample 1| * |sample 2 in partition|.
  */
 private[stat] object KolmogorovSmirnovTest extends Logging {
 
   // Null hypothesis for the type of KS test to be included in the result.
   object NullHypothesis extends Enumeration {
     type NullHypothesis = Value
-    val OneSampleTwoSided = Value("Sample follows theoretical distribution")
+    val OneSampleTwoSided = Value("Sample follows the theoretical distribution")
     val TwoSampleTwoSided = Value("Both samples follow the same distribution")
   }
 
@@ -218,7 +230,8 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
 
   /**
    * Calculates maximum distance candidates and counts of elements from each sample within one
-   * partition for the two-sample, two-sided Kolmogorov-Smirnov test implementation
+   * partition for the two-sample, two-sided Kolmogorov-Smirnov test implementation. Function
+   * is package private for testing convenience.
    * @param partData `Iterator[(Double, Boolean)]` the data in 1 partition of the co-sorted RDDs,
    *                each element is additionally tagged with a boolean flag for sample 1 membership
    * @param n1 `Double` sample 1 size
@@ -235,24 +248,27 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
    *        portion that is attributable to each partition so that following partitions can
    *        use it to cumulatively adjust their values.
    */
-  private def searchTwoSampleCandidates(
+  private[stat] def searchTwoSampleCandidates(
       partData: Iterator[(Double, Boolean)],
       n1: Double,
       n2: Double): Iterator[(Double, Double, Double)] = {
     // fold accumulator: local minimum, local maximum, index for sample 1, index for sample2
-    case class ExtremaAndIndices(min: Double, max: Double, ix1: Int, ix2: Int)
-    val initAcc = ExtremaAndIndices(Double.MaxValue, Double.MinValue, 0, 0)
+    case class ExtremaAndRunningIndices(min: Double, max: Double, ix1: Int, ix2: Int)
+    val initAcc = ExtremaAndRunningIndices(Double.MaxValue, Double.MinValue, 0, 0)
     // traverse the data in the partition and calculate distances and counts
     val pResults = partData.foldLeft(initAcc) { case (acc, (v, isSample1)) =>
       val (add1, add2) = if (isSample1) (1, 0) else (0, 1)
       val cdf1 = (acc.ix1 + add1) / n1
       val cdf2 = (acc.ix2 + add2) / n2
       val dist = cdf1 - cdf2
-      ExtremaAndIndices(
+      ExtremaAndRunningIndices(
         math.min(acc.min, dist),
         math.max(acc.max, dist),
-        acc.ix1 + add1, acc.ix2 + add2)
+        acc.ix1 + add1, acc.ix2 + add2
+      )
     }
+    // If the partition has no data, then pResults will match the fold accumulator, so
+    // we must filter this out to avoid having the statistic spoiled by the accumulation values
     val results = if (pResults == initAcc) {
       Array[(Double, Double, Double)]()
     } else {
@@ -263,14 +279,15 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
 
   /**
    * Adjust candidate extremes by the appropriate constant. The resulting maximum corresponds to
-   * the two-sample, two-sided Kolmogorov-Smirnov test
+   * the two-sample, two-sided Kolmogorov-Smirnov test. Function is package private for testing
+   * convenience.
    * @param localData `Array[(Double, Double, Double)]` contains the candidate extremes from each
    *                 partition, along with the numerator for the necessary constant adjustments
    * @param n `Double` The denominator in the constant adjustment (i.e. (size of sample 1 ) * (size
    *         of sample 2))
    * @return The two-sample, two-sided Kolmogorov-Smirnov statistic
    */
-  private def searchTwoSampleStatistic(localData: Array[(Double, Double, Double)], n: Double)
+  private[stat] def searchTwoSampleStatistic(localData: Array[(Double, Double, Double)], n: Double)
     : Double = {
     // maximum distance and numerator for constant adjustment
     val initAcc = (Double.MinValue, 0.0)
@@ -282,7 +299,7 @@ private[stat] object KolmogorovSmirnovTest extends Logging {
       val dist2 = math.abs(maxCand + adjConst)
       val maxVal = Array(prevMax, dist1, dist2).max
       (maxVal, prevCt + ct)
-      }
+    }
     results._1
   }
 
diff --git a/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/stat/HypothesisTestSuite.scala
@@ -26,7 +26,7 @@ import org.apache.commons.math3.stat.inference.{KolmogorovSmirnovTest => CommonM
 import org.apache.spark.{SparkException, SparkFunSuite}
 import org.apache.spark.mllib.linalg.{DenseVector, Matrices, Vectors}
 import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.mllib.stat.test.ChiSqTest
+import org.apache.spark.mllib.stat.test.{ChiSqTest, KolmogorovSmirnovTest}
 import org.apache.spark.mllib.util.MLlibTestSparkContext
 import org.apache.spark.mllib.util.TestingUtils._
 
@@ -352,52 +352,67 @@ class HypothesisTestSuite extends SparkFunSuite with MLlibTestSparkContext {
     assert(kSCompResult.pValue ~== rKSPval relTol 1e-2)
   }
 
-  test("2 sample Kolmogorov-Smirnov test: partitions with no data") {
+  test("2 sample Kolmogorov-Smirnov test: helper functions in case partitions have no data") {
     // we use the R data provided in the prior test
-    // We request a number of partitions larger than the number of elements in the data sets
-    // wich
-    val rData1 = sc.parallelize(
-      Array(
+    // Once we have combined and sorted, we partition with a number larger than
+    // the number of elements to guarantee we have empty partitions.
+    // We test various critical package private functions in this circumstance.
+    val rData1 = Array(
         1.1626852897838, -0.585924465893051, 1.78546500331661, -1.33259371048501,
         -0.446566766553219, 0.569606122374976, -2.88971761441412, -0.869018343326555,
         -0.461702683149641, -0.555540910137444, -0.0201353678515895, -0.150382224136063,
         -0.628126755843964, 1.32322085193283, -1.52135057001199, -0.437427868856691,
         0.970577579543399, 0.0282226444247749, -0.0857821886527593, 0.389214404984942
-      ), 40)
+      )
 
-    val rData2 = sc.parallelize(
-      Array(
+    val rData2 = Array(
         0.236687367712904, -0.144440226694072, 0.722229700806146, 0.369906857410192,
         -0.242066314481781, -1.47206331842053, -0.596159545765696, -1.1467001312186,
         -2.47463643305885, -0.613508578410268, -0.216311514038102, 1.5901457684867,
         1.55614327565194, 1.10845089348356, -1.09734184488477, -1.86060571637755,
         -0.913578847977252, 1.24556891198713, 0.0878547183607045, 0.423481895050245
-      ), 40)
+      )
+
+
+    val n1 = rData1.length
+    val n2 = rData2.length
+    val unioned = (rData1.map((_, true)) ++ rData2.map((_, false))).sortBy(_._1)
+    val parallel = sc.parallelize(unioned, 100)
+    // verify that there are empty partitions
+    assert(parallel.mapPartitions(x => Array(x.size).iterator).collect().contains(0))
+    val localExtrema = parallel.mapPartitions(
+      KolmogorovSmirnovTest.searchTwoSampleCandidates(_, n1, n2)
+    ).collect()
+    val ksCompStat = KolmogorovSmirnovTest.searchTwoSampleStatistic(localExtrema, n1 * n2)
 
     val rKSStat = 0.15
-    val rKSPval = 0.9831
-    val kSCompResult = Statistics.kolmogorovSmirnovTest2Sample(rData1, rData2)
-    assert(kSCompResult.statistic ~== rKSStat relTol 1e-4)
+    assert(ksCompStat ~== rKSStat relTol 1e-4)
   }
 
-  test("2 sample Kolmogorov-Smirnov test: partitions with just data from one sample") {
-    // Creating 2 samples that don't overlap, so we are guaranteed to have some partitions
-    // that only include values from sample 1 and some that only include values from sample 2
-    val n = 1000
-    val nonOverlap1L = (1 to n).toArray.map(_.toDouble)
-    val nonOverlap2L = (n + 1 to 2 * n).toArray.map(_.toDouble)
-    val nonOverlap1P = sc.parallelize(nonOverlap1L, 20)
-    val nonOverlap2P = sc.parallelize(nonOverlap2L, 20)
+  test("2 sample Kolmogorov-Smirnov test: helper functions in case partitions have only 1 sample") {
+    // Create 2 samples that don't overlap and request a large number of partitions to guarantee
+    // that there will be partitions with only data from 1 sample. We test critical helper
+    // functions in these circumstances.
+    val n = 100
+    val lower = (1 to n).toArray.map(_.toDouble)
+    val upper = (1 to n).toArray.map(n + _.toDouble * 100)
+
+    val unioned = (lower.map((_, true)) ++ upper.map((_, false))).sortBy(_._1)
+    val parallel = sc.parallelize(unioned, 200)
+    // verify that some partition has 1 distinct value (values are unique, so it holds 1 sample)
+    assert(parallel.mapPartitions(x =>
+      Array(x.toArray.map(_._1).distinct.length).iterator
+      ).collect().contains(1)
+    )
+    val localExtrema = parallel.mapPartitions(
+      KolmogorovSmirnovTest.searchTwoSampleCandidates(_, n, n)
+    ).collect()
+    val ksCompStat = KolmogorovSmirnovTest.searchTwoSampleStatistic(localExtrema, n * n)
 
     // Use apache math commons local KS test to verify calculations
     val ksTest = new CommonMathKolmogorovSmirnovTest()
-    val pThreshold = 0.05
 
-    val result4 = Statistics.kolmogorovSmirnovTest2Sample(nonOverlap1P, nonOverlap2P)
-    val refStat4 = ksTest.kolmogorovSmirnovStatistic(nonOverlap1L, nonOverlap2L)
-    val refP4 = ksTest.kolmogorovSmirnovTest(nonOverlap1L, nonOverlap2L)
-    assert(result4.statistic ~== refStat4 relTol 1e-3)
-    assert(result4.pValue ~== refP4 relTol 1e-3)
-    assert(result4.pValue < pThreshold) // reject H0
+    val refStat4 = ksTest.kolmogorovSmirnovStatistic(lower, upper)
+    assert(ksCompStat ~== refStat4 relTol 1e-3)
   }
 }