apache · mengxr · Apr 18, 2014 · Apr 22, 2014 · Apr 23, 2014 · Apr 25, 2014
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/ButterflyReducedRDD.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/ButterflyReducedRDD.scala
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.rdd
+
+import scala.reflect.ClassTag
+
+import org.apache.spark.{TaskContext, Partition}
+import org.apache.spark.rdd.RDD
+
+/**
+ * A partition in a butterfly-reduced RDD, pairing two partitions of the parent RDD.
+ *
+ * @param index index of this partition
+ * @param source parent partition whose records are read first
+ * @param target parent partition whose records are read after those of `source`
+ */
+private case class ButterflyReducedRDDPartition(
+    override val index: Int,
+    source: Partition,
+    target: Partition)
+  extends Partition
+
+/**
+ * Butterfly-reduced RDD: one stage of a butterfly reduction over the partitions of `rdd`.
+ *
+ * It has the same number of partitions as `rdd`. Partition `i` holds at most one record: the
+ * result of folding `reducer` over the records of parent partition `i` followed by the records
+ * of parent partition `(i + offset) % rdd.partitions.size`. A partition is empty only when both
+ * of those parent partitions are empty.
+ *
+ * NOTE(review): the RDD is constructed with `RDD[T](rdd)` although every partition also reads a
+ * second parent partition; confirm the declared dependency is adequate for the scheduler.
+ *
+ * @param rdd parent RDD
+ * @param reducer function used to combine two records into one
+ * @param offset distance, in partition indices, between a partition and the one it is paired
+ *               with; expected to be non-negative
+ */
+private[mllib] class ButterflyReducedRDD[T: ClassTag](
+    @transient rdd: RDD[T],
+    reducer: (T, T) => T,
+    @transient offset: Int) extends RDD[T](rdd) {
+
+  /** Returns the parent partition paired with parent partition `i`, wrapping around the end. */
+  private def targetPartition(i: Int): Partition = {
+    val parentPartitions = rdd.partitions
+    parentPartitions((i + offset) % parentPartitions.size)
+  }
+
+  /** Creates one partition per parent partition, recording both parent partitions it reads. */
+  override def getPartitions: Array[Partition] = {
+    rdd.partitions.zipWithIndex.map { case (part, i) =>
+      ButterflyReducedRDDPartition(i, part, targetPartition(i))
+    }
+  }
+
+  /**
+   * Reduces the records of the source partition followed by those of the target partition.
+   *
+   * @return an iterator with the single reduced record, or an empty iterator when neither
+   *         parent partition has any record
+   */
+  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
+    val pair = s.asInstanceOf[ButterflyReducedRDDPartition]
+    val parent = firstParent[T]
+    val records = parent.iterator(pair.source, context) ++ parent.iterator(pair.target, context)
+    // reduceOption instead of reduce: reduce throws on an empty iterator, which happens when
+    // both paired partitions are empty.
+    records.reduceOption(reducer).iterator
+  }
+
+  /** Uses the preferred locations of the source partition in the parent RDD. */
+  override def getPreferredLocations(s: Partition): Seq[String] = {
+    rdd.preferredLocations(s.asInstanceOf[ButterflyReducedRDDPartition].source)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/PartitionSlicingRDD.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/PartitionSlicingRDD.scala
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.mllib.rdd
+
+import scala.reflect.ClassTag
+
+import org.apache.spark.rdd.RDD
+import org.apache.spark.{TaskContext, Partition}
+
+/**
+ * A partition of a [[PartitionSlicingRDD]].
+ *
+ * @param index position of this partition within the sliced RDD
+ * @param parent partition of the parent RDD whose records this partition exposes
+ */
+private case class PartitionSlicingRDDPartition(
+    override val index: Int,
+    parent: Partition) extends Partition
+
+/**
+ * Narrow dependency of a [[PartitionSlicingRDD]] on its parent: sliced partition `k` depends
+ * only on parent partition `slice(k)`.
+ */
+private class PartitionSlicingDependency[T](parentRdd: RDD[T], slice: Array[Int])
+  extends org.apache.spark.NarrowDependency[T](parentRdd) {
+
+  override def getParents(partitionId: Int): Seq[Int] = List(slice(partitionId))
+}
+
+/**
+ * Represents an RDD obtained from partition slicing of its parent RDD.
+ *
+ * Partition `k` of this RDD exposes the records of parent partition `slice(k)`. Each selected
+ * parent partition is wrapped so that its `index` is its position in this RDD rather than its
+ * index in the parent, and the dependency on the parent maps `k` to `slice(k)`.
+ *
+ * @param rdd parent RDD
+ * @param slice indices of the parent partitions to expose, in output order
+ */
+private[mllib] class PartitionSlicingRDD[T: ClassTag](
+    @transient rdd: RDD[T],
+    @transient slice: Seq[Int])
+  extends RDD[T](rdd.context, List(new PartitionSlicingDependency[T](rdd, slice.toArray))) {
+
+  /** Wraps each selected parent partition with its position in `slice` as the index. */
+  override def getPartitions: Array[Partition] = {
+    val parentPartitions = rdd.partitions
+    slice.zipWithIndex.map { case (parentIndex, i) =>
+      PartitionSlicingRDDPartition(i, parentPartitions(parentIndex)): Partition
+    }.toArray
+  }
+
+  /** Returns the records of the wrapped parent partition unchanged. */
+  override def compute(s: Partition, context: TaskContext): Iterator[T] = {
+    firstParent[T].iterator(s.asInstanceOf[PartitionSlicingRDDPartition].parent, context)
+  }
+
+  /** Uses the preferred locations of the wrapped parent partition. */
+  override def getPreferredLocations(s: Partition): Seq[String] = {
+    rdd.preferredLocations(s.asInstanceOf[PartitionSlicingRDDPartition].parent)
+  }
+}
diff --git a/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala b/mllib/src/main/scala/org/apache/spark/mllib/rdd/RDDFunctions.scala
@@ -44,6 +44,54 @@ class RDDFunctions[T: ClassTag](self: RDD[T]) {
       new SlidingRDD[T](self, windowSize)
     }
   }
+
+  /**
+   * Builds an RDD that exposes only the selected partitions of this RDD.
+   *
+   * @param slice indices of the partitions to keep, in the order they should appear
+   * @return a [[PartitionSlicingRDD]] over this RDD restricted to `slice`
+   */
+  def slicePartitions(slice: Seq[Int]): RDD[T] = new PartitionSlicingRDD[T](self, slice)
+
+  /**
+   * Computes the all-reduced RDD of the parent RDD, which has the same number of partitions as
+   * its parent RDD. Each partition contains only one record, which is the same as calling
+   * `RDD#reduce` on its parent RDD.
+   *
+   * Every partition is first reduced locally. The partial results are then combined in
+   * `log2(n)` butterfly stages, where `n` is the smallest power of two that is not less than
+   * the number of partitions. Empty padding partitions make up the difference and are sliced
+   * off again at the end. Partitions combine the partial results in different orders, so `f`
+   * should be commutative and associative for all partitions to agree.
+   *
+   * The local reduction yields no record for an empty parent partition instead of failing.
+   *
+   * Every intermediate RDD is marked with `cache()` and is not unpersisted by this method.
+   *
+   * @param f reducer
+   * @return all-reduced RDD
+   */
+  def allReduce(f: (T, T) => T): RDD[T] = {
+    val numPartitions = self.partitions.size
+    require(numPartitions > 0, "Parent RDD does not have any partitions.")
+    // Smallest power of two >= numPartitions. An exact power of two is kept as is, so such
+    // inputs need neither padding nor an additional butterfly stage.
+    val paddedSize =
+      if (numPartitions == 1) 1 else Integer.highestOneBit(numPartitions - 1) << 1
+    // Guards against Int overflow of the shift above for more than 2^30 partitions.
+    require(paddedSize >= numPartitions, s"Too many partitions for allReduce: $numPartitions")
+    val needsPadding = paddedSize > numPartitions
+
+    // Local reduction. reduceOption keeps an empty partition empty, where reduce would throw.
+    val locallyReduced: RDD[T] = self.mapPartitions[T](
+      iter => iter.reduceOption(f).iterator,
+      preservesPartitioning = true
+    ).cache()
+
+    val padded: RDD[T] =
+      if (needsPadding) {
+        locallyReduced.union(self.context.parallelize(Seq.empty[T], paddedSize - numPartitions))
+      } else {
+        locallyReduced
+      }
+
+    // Offsets paddedSize / 2, paddedSize / 4, ..., 1: one butterfly stage per offset.
+    val offsets = Iterator.iterate(paddedSize >> 1)(_ >> 1).takeWhile(_ > 0)
+    val butterfly = offsets.foldLeft(padded) { (previous, offset) =>
+      new ButterflyReducedRDD[T](previous, f, offset).cache()
+    }
+
+    // Drop the padding partitions so the result lines up with the parent's partitions.
+    if (needsPadding) new PartitionSlicingRDD(butterfly, 0 until numPartitions) else butterfly
+  }
 }
 
 private[mllib]

diff --git a/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala b/mllib/src/test/scala/org/apache/spark/mllib/rdd/RDDFunctionsSuite.scala
@@ -46,4 +46,21 @@ class RDDFunctionsSuite extends FunSuite with LocalSparkContext {
     val expected = data.flatMap(x => x).sliding(3).toList
     assert(sliding.collect().toList === expected)
   }
+
+  test("slicePartitions") {
+    val indices = Seq(0, 2, 4, 7)
+    // Ten elements over ten partitions, then keep only the partitions listed in `indices`.
+    val parent = sc.parallelize(0 until 10, 10)
+    val picked = parent.slicePartitions(indices).collect().toSeq
+    assert(picked === indices)
+  }
+
+  test("allReduce") {
+    // Covers partition counts on both sides of several powers of two.
+    (1 to 10).foreach { numPartitions =>
+      val rdd = sc.parallelize(0 until 1000, numPartitions)
+      val expectedSum = rdd.reduce(_ + _)
+      val reduced = rdd.allReduce(_ + _)
+      assert(reduced.partitions.size === numPartitions)
+      // Every partition must hold exactly the global sum.
+      assert(reduced.collect().toSeq === Seq.fill(numPartitions)(expectedSum))
+    }
+  }
 }