34 changes: 28 additions & 6 deletions python/pyspark/statcounter.py
@@ -20,6 +20,14 @@
import copy
import math

+_have_numpy = False
+try:
+    from numpy import maximum, minimum, sqrt
+    _have_numpy = True
+except:
Contributor
It's better to have ImportError here.

Contributor
How about doing it this way:

try:
    from numpy import maximum, minimum, sqrt
except ImportError:
    maximum = max
    minimum = min
    sqrt = math.sqrt

This will simplify the later code.

Contributor Author
Nice! This is much better, updating the PR now...
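
For readers skimming the thread, a minimal sketch of what the suggested fallback looks like in isolation (plain Python; the sample values below are illustrative and not part of the patch). The point is that maximum, minimum, and sqrt are always bound to something callable, so the statistics code can call them unconditionally instead of branching on a _have_numpy flag:

import math

try:
    from numpy import maximum, minimum, sqrt
except ImportError:
    # No NumPy: fall back to scalar equivalents with the same call shape
    # as the NumPy functions used here.
    maximum = max
    minimum = min
    sqrt = math.sqrt

running_max = 3.0
running_max = maximum(running_max, 5.0)   # numpy.maximum if available, builtin max otherwise
print(sqrt(running_max))                  # ~2.236 either way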

+    # no NumPy, so fall back on scalar operators
+    pass
+

class StatCounter(object):

@@ -39,10 +47,14 @@ def merge(self, value):
        self.n += 1
        self.mu += delta / self.n
        self.m2 += delta * (value - self.mu)
-        if self.maxValue < value:
-            self.maxValue = value
-        if self.minValue > value:
-            self.minValue = value
+        if not _have_numpy:
+            if self.maxValue < value:
+                self.maxValue = value
+            if self.minValue > value:
+                self.minValue = value
+        else:
+            self.maxValue = maximum(self.maxValue, value)
+            self.minValue = minimum(self.minValue, value)

        return self

@@ -70,8 +82,12 @@ def mergeStats(self, other):
                else:
                    self.mu = (self.mu * self.n + other.mu * other.n) / (self.n + other.n)

+                if not _have_numpy:
+                    self.maxValue = max(self.maxValue, other.maxValue)
+                    self.minValue = min(self.minValue, other.minValue)
+                else:
+                    self.maxValue = maximum(self.maxValue, other.maxValue)
+                    self.minValue = minimum(self.minValue, other.minValue)

                self.m2 += other.m2 + (delta * delta * self.n * other.n) / (self.n + other.n)
                self.n += other.n
@@ -115,14 +131,20 @@ def sampleVariance(self):

    # Return the standard deviation of the values.
    def stdev(self):
-        return math.sqrt(self.variance())
+        if not _have_numpy:
+            return math.sqrt(self.variance())
+        else:
+            return sqrt(self.variance())

    #
    # Return the sample standard deviation of the values, which corrects for bias in estimating the
    # variance by dividing by N-1 instead of N.
    #
    def sampleStdev(self):
-        return math.sqrt(self.sampleVariance())
+        if not _have_numpy:
+            return math.sqrt(self.sampleVariance())
+        else:
+            return sqrt(self.sampleVariance())
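
As a quick sanity check of the N versus N-1 distinction the comment above describes (plain Python, independent of the patch; the sample values are arbitrary):

import math

values = [1.0, 2.0, 3.0]
n = len(values)
mu = sum(values) / n
m2 = sum((v - mu) ** 2 for v in values)   # sum of squared deviations, like StatCounter's m2

variance = m2 / n                 # population variance -> 2/3
sample_variance = m2 / (n - 1)    # unbiased sample variance -> 1.0

print(math.sqrt(variance))          # stdev()       ~ 0.816
print(math.sqrt(sample_variance))   # sampleStdev() = 1.0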

    def __repr__(self):
        return ("(count: %s, mean: %s, stdev: %s, max: %s, min: %s)" %
25 changes: 25 additions & 0 deletions python/pyspark/tests.py
@@ -38,12 +38,19 @@
from pyspark.shuffle import Aggregator, InMemoryMerger, ExternalMerger

_have_scipy = False
+_have_numpy = False
try:
    import scipy.sparse
    _have_scipy = True
except:
    # No SciPy, but that's okay, we'll skip those tests
    pass
+try:
+    from numpy import array
Contributor
Just import numpy here instead; this array will overwrite array.array and make other unit tests fail.

Contributor Author
@davies thanks, good catch, should be fixed now!
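
For readers unfamiliar with the failure mode @davies points out, here is a small self-contained sketch (separate from tests.py; the helper name is illustrative) of why a module-level from numpy import array is risky, and one way to probe for NumPy without rebinding the name:

from array import array            # stdlib array type, which other tests rely on

try:
    import numpy                    # detect availability without shadowing 'array'
    _have_numpy = True
except ImportError:
    _have_numpy = False

def test_statcounter_array():
    from numpy import array         # local import: shadows 'array' only inside this test
    return array([1.0, 2.0]).tolist()

# The module-level 'array' is still the stdlib type:
assert array('d', [1.0, 2.0])[0] == 1.0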

+    _have_numpy = True
+except:
+    # No NumPy, but that's okay, we'll skip those tests
+    pass


SPARK_HOME = os.environ["SPARK_HOME"]
@@ -914,9 +921,27 @@ def test_serialize(self):
        self.assertEqual(expected, observed)


+@unittest.skipIf(not _have_numpy, "NumPy not installed")
+class NumPyTests(PySparkTestCase):
+    """General PySpark tests that depend on numpy """
+
+    def test_statcounter_array(self):
+        from numpy import array
+        x = self.sc.parallelize([array([1.0,1.0]), array([2.0,2.0]), array([3.0,3.0])])
+        s = x.stats()
+        self.assertSequenceEqual([2.0,2.0], s.mean().tolist())
+        self.assertSequenceEqual([1.0,1.0], s.min().tolist())
+        self.assertSequenceEqual([3.0,3.0], s.max().tolist())
+        self.assertSequenceEqual([1.0,1.0], s.sampleStdev().tolist())
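
To see what this test exercises without a SparkContext, StatCounter can also be driven directly (a rough sketch assuming NumPy is installed; the exact printed formatting may differ by NumPy version):

from numpy import array
from pyspark.statcounter import StatCounter

s = StatCounter()
for v in [array([1.0, 1.0]), array([2.0, 2.0]), array([3.0, 3.0])]:
    s.merge(v)                 # element-wise running count/mean/min/max/variance

print(s.mean())                # [ 2.  2.]
print(s.min())                 # [ 1.  1.]
print(s.max())                 # [ 3.  3.]
print(s.sampleStdev())         # [ 1.  1.]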


if __name__ == "__main__":
    if not _have_scipy:
        print "NOTE: Skipping SciPy tests as it does not seem to be installed"
+    if not _have_numpy:
+        print "NOTE: Skipping NumPy tests as it does not seem to be installed"
    unittest.main()
    if not _have_scipy:
        print "NOTE: SciPy tests were skipped as it does not seem to be installed"
+    if not _have_numpy:
+        print "NOTE: NumPy tests were skipped as it does not seem to be installed"