-
Notifications
You must be signed in to change notification settings - Fork 51
[SPARK-25299] shuffle reader API #523
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 40 commits
864d1cd
c88751c
9af216f
a35b826
14c47ae
5bb4c32
584e6c8
0292fe2
71c2cc7
43c377c
9fc6a60
45172a5
4e5652b
a35d8fe
1a09ebe
c149d24
672d473
e0a3289
1e89b3f
495c7bd
88a03cb
741deed
76c0381
c7c52b0
897c0bf
34eaaf6
0548800
0637e70
f069dc1
0bba677
a82a725
ac392a1
53dd94b
4c0c791
84f7931
b59efb5
aba8a94
5ef59b6
49a1901
8c6c09c
6370b41
c442b63
2c1272a
2758a5c
bd349ca
653f67c
9f53839
94275fd
26e97c1
91db776
f0fa7b8
50c8fc3
4aa4b6e
7d23f47
363d4ab
bb7fa4c
711109b
04a135c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.api.shuffle; | ||
|
|
||
| import java.util.Objects; | ||
|
|
||
| public class ShuffleBlockInfo { | ||
| private final int shuffleId; | ||
| private final int mapId; | ||
| private final int reduceId; | ||
| private final long length; | ||
|
|
||
| public ShuffleBlockInfo(int shuffleId, int mapId, int reduceId, long length) { | ||
| this.shuffleId = shuffleId; | ||
| this.mapId = mapId; | ||
| this.reduceId = reduceId; | ||
| this.length = length; | ||
| } | ||
|
|
||
| public int getShuffleId() { | ||
| return shuffleId; | ||
| } | ||
|
|
||
| public int getMapId() { | ||
| return mapId; | ||
| } | ||
|
|
||
| public int getReduceId() { | ||
| return reduceId; | ||
| } | ||
|
|
||
| public long getLength() { | ||
| return length; | ||
| } | ||
|
|
||
| @Override | ||
| public boolean equals(Object other) { | ||
| return other instanceof ShuffleBlockInfo | ||
| && shuffleId == ((ShuffleBlockInfo) other).shuffleId | ||
| && mapId == ((ShuffleBlockInfo) other).mapId | ||
| && reduceId == ((ShuffleBlockInfo) other).reduceId | ||
| && length == ((ShuffleBlockInfo) other).length; | ||
|
||
| } | ||
|
|
||
| @Override | ||
| public int hashCode() { | ||
| return Objects.hash(shuffleId, mapId, reduceId, length); | ||
|
||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,37 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.api.shuffle; | ||
|
|
||
| import org.apache.spark.annotation.Experimental; | ||
|
|
||
| import java.io.IOException; | ||
|
|
||
| /** | ||
| * :: Experimental :: | ||
| * An interface for reading shuffle records. | ||
| * @since 3.0.0 | ||
| */ | ||
yifeih marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| @Experimental | ||
| public interface ShuffleReadSupport { | ||
| /** | ||
| * Returns an underlying {@link ShuffleReaderIterable} that will iterate through shuffle data, | ||
| * given an iterable for the shuffle blocks to fetch. | ||
| */ | ||
| ShuffleReaderIterable getPartitionReaders(Iterable<ShuffleBlockInfo> blockMetadata) | ||
| throws IOException; | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,47 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.api.shuffle; | ||
|
|
||
| import org.apache.spark.annotation.Experimental; | ||
|
|
||
| import java.io.InputStream; | ||
|
|
||
| /** | ||
| * :: Experimental :: | ||
| * An interface for reading shuffle records. | ||
|
||
| * @since 3.0.0 | ||
| */ | ||
| @Experimental | ||
| public class ShuffleReaderInputStream { | ||
|
|
||
| private final ShuffleBlockInfo shuffleBlockInfo; | ||
|
||
| private final InputStream inputStream; | ||
|
|
||
| public ShuffleReaderInputStream(ShuffleBlockInfo shuffleBlockInfo, InputStream inputStream) { | ||
| this.shuffleBlockInfo = shuffleBlockInfo; | ||
| this.inputStream = inputStream; | ||
| } | ||
|
|
||
| public ShuffleBlockInfo getShuffleBlockInfo() { | ||
| return shuffleBlockInfo; | ||
| } | ||
|
|
||
| public InputStream getInputStream() { | ||
| return inputStream; | ||
| } | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,44 @@ | ||
| /* | ||
| * Licensed to the Apache Software Foundation (ASF) under one or more | ||
| * contributor license agreements. See the NOTICE file distributed with | ||
| * this work for additional information regarding copyright ownership. | ||
| * The ASF licenses this file to You under the Apache License, Version 2.0 | ||
| * (the "License"); you may not use this file except in compliance with | ||
| * the License. You may obtain a copy of the License at | ||
| * | ||
| * http://www.apache.org/licenses/LICENSE-2.0 | ||
| * | ||
| * Unless required by applicable law or agreed to in writing, software | ||
| * distributed under the License is distributed on an "AS IS" BASIS, | ||
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
| * See the License for the specific language governing permissions and | ||
| * limitations under the License. | ||
| */ | ||
|
|
||
| package org.apache.spark.api.shuffle; | ||
|
|
||
| import org.apache.spark.annotation.Experimental; | ||
|
|
||
| import java.util.Iterator; | ||
|
|
||
| /** | ||
| * :: Experimental :: | ||
| * An interface for iterating through shuffle blocks to read. | ||
| * @since 3.0.0 | ||
| */ | ||
| @Experimental | ||
| public interface ShuffleReaderIterable extends Iterable<ShuffleReaderInputStream> { | ||
|
|
||
| interface ShuffleReaderIterator extends Iterator<ShuffleReaderInputStream> { | ||
| /** | ||
| * Instructs the shuffle iterator to fetch the last block again. This is useful | ||
| * if the block is determined to be corrupt after decryption or decompression. | ||
| */ | ||
| default void retryLastBlock(Throwable t) { | ||
| throw new UnsupportedOperationException("Cannot retry fetching bad blocks", t); | ||
| } | ||
| } | ||
|
|
||
| @Override | ||
| ShuffleReaderIterator iterator(); | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,12 +17,19 @@ | |
|
|
||
| package org.apache.spark.shuffle | ||
|
|
||
| import java.io.{InputStream, IOException} | ||
| import java.nio.ByteBuffer | ||
|
|
||
| import scala.collection.JavaConverters._ | ||
|
|
||
| import org.apache.spark._ | ||
| import org.apache.spark.internal.{config, Logging} | ||
| import org.apache.spark.api.shuffle.{ShuffleBlockInfo, ShuffleReadSupport} | ||
| import org.apache.spark.internal.Logging | ||
| import org.apache.spark.serializer.SerializerManager | ||
| import org.apache.spark.storage.{BlockManager, ShuffleBlockFetcherIterator} | ||
| import org.apache.spark.util.CompletionIterator | ||
| import org.apache.spark.storage.ShuffleBlockId | ||
| import org.apache.spark.util.{CompletionIterator, Utils} | ||
| import org.apache.spark.util.collection.ExternalSorter | ||
| import org.apache.spark.util.io.ChunkedByteBufferOutputStream | ||
|
|
||
| /** | ||
| * Fetches and reads the partitions in range [startPartition, endPartition) from a shuffle by | ||
|
|
@@ -34,33 +41,64 @@ private[spark] class BlockStoreShuffleReader[K, C]( | |
| endPartition: Int, | ||
| context: TaskContext, | ||
| readMetrics: ShuffleReadMetricsReporter, | ||
| serializerManager: SerializerManager = SparkEnv.get.serializerManager, | ||
| blockManager: BlockManager = SparkEnv.get.blockManager, | ||
| serializerManager: SerializerManager, | ||
| shuffleReadSupport: ShuffleReadSupport, | ||
| mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker) | ||
| extends ShuffleReader[K, C] with Logging { | ||
|
|
||
| private val dep = handle.dependency | ||
|
|
||
| /** Read the combined key-values for this reduce task */ | ||
| override def read(): Iterator[Product2[K, C]] = { | ||
| val wrappedStreams = new ShuffleBlockFetcherIterator( | ||
| context, | ||
| blockManager.shuffleClient, | ||
| blockManager, | ||
| mapOutputTracker.getMapSizesByExecutorId(handle.shuffleId, startPartition, endPartition), | ||
| serializerManager.wrapStream, | ||
| // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility | ||
| SparkEnv.get.conf.get(config.REDUCER_MAX_SIZE_IN_FLIGHT) * 1024 * 1024, | ||
| SparkEnv.get.conf.get(config.REDUCER_MAX_REQS_IN_FLIGHT), | ||
| SparkEnv.get.conf.get(config.REDUCER_MAX_BLOCKS_IN_FLIGHT_PER_ADDRESS), | ||
| SparkEnv.get.conf.get(config.MAX_REMOTE_BLOCK_SIZE_FETCH_TO_MEM), | ||
| SparkEnv.get.conf.get(config.SHUFFLE_DETECT_CORRUPT), | ||
| readMetrics).toCompletionIterator | ||
| val wrappedStreams = | ||
|
||
| shuffleReadSupport.getPartitionReaders(new Iterable[ShuffleBlockInfo] { | ||
| override def iterator: Iterator[ShuffleBlockInfo] = { | ||
| /** Read the combined key-values for this reduce task */ | ||
|
||
| mapOutputTracker.getMapSizesByExecutorId(handle.shuffleId, startPartition, endPartition) | ||
| .flatMap(blockManagerIdInfo => { | ||
|
||
| blockManagerIdInfo._2.map( | ||
|
||
| blockInfo => { | ||
| val block = blockInfo._1.asInstanceOf[ShuffleBlockId] | ||
| new ShuffleBlockInfo(block.shuffleId, block.mapId, block.reduceId, blockInfo._2) | ||
| } | ||
| ) | ||
| }) | ||
| } | ||
| }.asJava).iterator() | ||
|
|
||
| val serializerInstance = dep.serializer.newInstance() | ||
| val retryingWrappedStreams = new Iterator[InputStream] { | ||
| override def hasNext: Boolean = wrappedStreams.hasNext | ||
|
|
||
| // Create a key/value iterator for each stream | ||
| val recordIter = wrappedStreams.flatMap { case (blockId, wrappedStream) => | ||
| override def next(): InputStream = { | ||
| var returnStream: InputStream = null | ||
| while (wrappedStreams.hasNext && returnStream == null) { | ||
| val nextStream = wrappedStreams.next() | ||
| val blockInfo = nextStream.getShuffleBlockInfo | ||
| val blockId = ShuffleBlockId( | ||
| blockInfo.getShuffleId, | ||
| blockInfo.getMapId, | ||
| blockInfo.getReduceId) | ||
| try { | ||
| val in = serializerManager.wrapStream(blockId, nextStream.getInputStream) | ||
| val out = new ChunkedByteBufferOutputStream(64 * 1024, ByteBuffer.allocate) | ||
| // Decompress the whole block at once to detect any corruption, which could increase | ||
| // the memory usage and potentially increase the chance of OOM. | ||
|
||
| // TODO: manage the memory used here, and spill it into disk in case of OOM. | ||
|
||
| Utils.copyStream(in, out, closeStreams = true) | ||
|
||
| returnStream = out.toChunkedByteBuffer.toInputStream(dispose = true) | ||
| } catch { | ||
| case e: IOException => | ||
|
||
| wrappedStreams.retryLastBlock(e) | ||
| } | ||
| } | ||
| if (returnStream == null) { | ||
| throw new IllegalStateException("Expected shuffle reader iterator to return a stream") | ||
| } | ||
| returnStream | ||
|
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Defensive programming: if
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hmm so the retrying not working properly would result in either There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ping here? Think the last push doesn't address this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Never mind I was looking at a stale diff |
||
| } | ||
| } | ||
|
|
||
| val serializerInstance = dep.serializer.newInstance() | ||
| val recordIter = retryingWrappedStreams.flatMap { wrappedStream => | ||
| // Note: the asKeyValueIterator below wraps a key/value iterator inside of a | ||
| // NextIterator. The NextIterator makes sure that close() is called on the | ||
| // underlying InputStream when all records have been read. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.