apache · davies · Nov 10, 2014 · Nov 12, 2014 · Nov 12, 2014 · Nov 13, 2014
diff --git a/python/pyspark/rdd.py b/python/pyspark/rdd.py
@@ -28,7 +28,7 @@
 import warnings
 import heapq
 import bisect
-from random import Random
+import random
 from math import sqrt, log, isinf, isnan
 
 from pyspark.accumulators import PStatsParam
@@ -316,6 +316,30 @@ def sample(self, withReplacement, fraction, seed=None):
         assert fraction >= 0.0, "Negative fraction value: %s" % fraction
         return self.mapPartitionsWithIndex(RDDSampler(withReplacement, fraction, seed).func, True)
 
+    def randomSplit(self, weights, seed=None):
+        """
+        Randomly splits this RDD with the provided weights.
+
+        :param weights: weights for splits, will be normalized if they don't sum to 1
+        :param seed: random seed; if None, one seed is drawn and shared by all splits
+        :return: split RDDs in a list
+
+        >>> rdd = sc.parallelize(range(5), 1)
+        >>> rdd1, rdd2 = rdd.randomSplit([2.0, 3.0], 101)
+        >>> rdd1.collect()
+        [2, 3]
+        >>> rdd2.collect()
+        [0, 1, 4]
+        """
+        s = float(sum(weights))  # force true division for int weights (Python 2)
+        cweights = [0.0]
+        for w in weights:
+            cweights.append(cweights[-1] + w / s)
+        if seed is None:
+            seed = random.randint(0, 2 ** 32 - 1)
+        return [self.mapPartitionsWithIndex(RDDSampler(False, ub, seed, lb).func, True)
+                for lb, ub in zip(cweights, cweights[1:])]
+
     # this is ported from scala/spark/RDD.scala
     def takeSample(self, withReplacement, num, seed=None):
         """
@@ -341,7 +365,7 @@ def takeSample(self, withReplacement, num, seed=None):
         if initialCount == 0:
             return []
 
-        rand = Random(seed)
+        rand = random.Random(seed)
 
         if (not withReplacement) and num >= initialCount:
             # shuffle current RDD and return

diff --git a/python/pyspark/rddsampler.py b/python/pyspark/rddsampler.py
@@ -96,9 +96,10 @@ def shuffle(self, vals):
 
 class RDDSampler(RDDSamplerBase):
 
-    def __init__(self, withReplacement, fraction, seed=None):
+    def __init__(self, withReplacement, fraction, seed=None, lowbound=0.0):
         RDDSamplerBase.__init__(self, withReplacement, seed)
         self._fraction = fraction
+        self._lowbound = lowbound  # non-replacement draws kept in [lowbound, fraction)
 
     def func(self, split, iterator):
         if self._withReplacement:
@@ -111,7 +112,7 @@ def func(self, split, iterator):
                     yield obj
         else:
             for obj in iterator:
-                if self.getUniformSample(split) <= self._fraction:
+                if self._lowbound <= self.getUniformSample(split) < self._fraction:
                     yield obj