Skip to content

Commit 77d046e

Browse files
megaserg authored and srowen committed
[SPARK-21782][CORE] Repartition creates skews when numPartitions is a power of 2
## Problem When an RDD (particularly with a low item-per-partition ratio) is repartitioned to numPartitions = power of 2, the resulting partitions are very uneven-sized, due to using fixed seed to initialize PRNG, and using the PRNG only once. See details in https://issues.apache.org/jira/browse/SPARK-21782 ## What changes were proposed in this pull request? Instead of directly using `0, 1, 2,...` seeds to initialize `Random`, hash them with `scala.util.hashing.byteswap32()`. ## How was this patch tested? `build/mvn -Dtest=none -DwildcardSuites=org.apache.spark.rdd.RDDSuite test` Author: Sergey Serebryakov <sserebryakov@tesla.com> Closes #18990 from megaserg/repartition-skew.
1 parent 28a6cca commit 77d046e

2 files changed

Lines changed: 6 additions & 3 deletions

File tree

core/src/main/scala/org/apache/spark/rdd/RDD.scala

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,6 +24,7 @@ import scala.collection.mutable.ArrayBuffer
2424
import scala.io.Codec
2525
import scala.language.implicitConversions
2626
import scala.reflect.{classTag, ClassTag}
27+
import scala.util.hashing
2728

2829
import com.clearspring.analytics.stream.cardinality.HyperLogLogPlus
2930
import org.apache.hadoop.io.{BytesWritable, NullWritable, Text}
@@ -448,7 +449,7 @@ abstract class RDD[T: ClassTag](
448449
if (shuffle) {
449450
/** Distributes elements evenly across output partitions, starting from a random partition. */
450451
val distributePartition = (index: Int, items: Iterator[T]) => {
451-
var position = (new Random(index)).nextInt(numPartitions)
452+
var position = (new Random(hashing.byteswap32(index))).nextInt(numPartitions)
452453
items.map { t =>
453454
// Note that the hash code of the key will just be the key itself. The HashPartitioner
454455
// will mod it with the number of total partitions.

core/src/test/scala/org/apache/spark/rdd/RDDSuite.scala

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -347,16 +347,18 @@ class RDDSuite extends SparkFunSuite with SharedSparkContext {
347347
val partitions = repartitioned.glom().collect()
348348
// assert all elements are present
349349
assert(repartitioned.collect().sortWith(_ > _).toSeq === input.toSeq.sortWith(_ > _).toSeq)
350-
// assert no bucket is overloaded
350+
// assert no bucket is overloaded or empty
351351
for (partition <- partitions) {
352352
val avg = input.size / finalPartitions
353353
val maxPossible = avg + initialPartitions
354-
assert(partition.length <= maxPossible)
354+
assert(partition.length <= maxPossible)
355+
assert(!partition.isEmpty)
355356
}
356357
}
357358

358359
testSplitPartitions(Array.fill(100)(1), 10, 20)
359360
testSplitPartitions(Array.fill(10000)(1) ++ Array.fill(10000)(2), 20, 100)
361+
testSplitPartitions(Array.fill(1000)(1), 250, 128)
360362
}
361363

362364
test("coalesced RDDs") {

0 commit comments

Comments
 (0)