Commits (changes shown from 26 of 33 commits)
1957e82  [SPARK-25299] Introduce the new shuffle writer API (#5) (#520)  (mccheah, Mar 20, 2019)
857552a  [SPARK-25299] Local shuffle implementation of the shuffle writer API …  (mccheah, Apr 3, 2019)
d13037f  [SPARK-25299] Make UnsafeShuffleWriter use the new API (#536)  (mccheah, Apr 17, 2019)
8f5fb60  [SPARK-25299] Use the shuffle writer plugin for the SortShuffleWriter…  (mccheah, Apr 15, 2019)
e17c7ea  [SPARK-25299] Shuffle locations api (#517)  (mccheah, Apr 19, 2019)
3f0c131  [SPARK-25299] Move shuffle writers back to being given specific parti…  (mccheah, Apr 19, 2019)
f982df7  [SPARK-25299] Don't set map status twice in bypass merge sort shuffle…  (mccheah, Apr 19, 2019)
6891197  [SPARK-25299] Propose a new NIO transfer API for partition writing. (…  (mccheah, May 24, 2019)
7b44ed2  Remove shuffle location support.  (mccheah, Jun 27, 2019)
df75f1f  Remove changes to UnsafeShuffleWriter  (mccheah, Jun 27, 2019)
a8558af  Revert changes for SortShuffleWriter  (mccheah, Jun 27, 2019)
806d7bb  Revert a bunch of other stuff  (mccheah, Jun 27, 2019)
3167030  More reverts  (mccheah, Jun 27, 2019)
70f59db  Set task contexts in failing test  (mccheah, Jun 28, 2019)
3083d86  Fix style  (mccheah, Jun 28, 2019)
4c3d692  Check for null on the block manager as well.  (mccheah, Jun 28, 2019)
2421c92  Add task attempt id in the APIs  (mccheah, Jul 1, 2019)
982f207  Address comments  (mccheah, Jul 8, 2019)
594d1e2  Fix style  (mccheah, Jul 8, 2019)
66aae91  Address comments.  (mccheah, Jul 12, 2019)
8b432f9  Merge remote-tracking branch 'origin/master' into spark-shuffle-write…  (mccheah, Jul 17, 2019)
9f597dd  Address comments.  (mccheah, Jul 18, 2019)
86c1829  Restructure test  (mccheah, Jul 18, 2019)
a7885ae  Add ShuffleWriteMetricsReporter to the createMapOutputWriter API.  (mccheah, Jul 19, 2019)
9893c6c  Add more documentation  (mccheah, Jul 19, 2019)
cd897e7  REfactor reading records from file in test  (mccheah, Jul 19, 2019)
9f17b9b  Address comments  (mccheah, Jul 24, 2019)
e53a001  Code tags  (mccheah, Jul 24, 2019)
56fa450  Add some docs  (mccheah, Jul 24, 2019)
b8b7b8d  Change mockito format in BypassMergeSortShuffleWriterSuite  (mccheah, Jul 25, 2019)
2d29404  Remove metrics from the API.  (mccheah, Jul 29, 2019)
06ea01a  Address more comments.  (mccheah, Jul 29, 2019)
7dceec9  Args per line  (mccheah, Jul 30, 2019)
52 changes: 52 additions & 0 deletions core/src/main/java/org/apache/spark/shuffle/api/ShuffleDataIO.java
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.api;
Contributor:
I saw it was recommended that this package be used instead of o.a.s.api. The problem is that org.apache.spark.shuffle is explicitly removed from the documentation, and we want this to (eventually) be documented. So we either need to go back to the old package, or tweak SparkBuild.scala to not filter this sub-package...

Contributor Author:
I'm concerned about conflicts with the other kinds of APIs in the org.apache.spark.api.* namespace, particularly because these are all related to other language bindings, e.g. org.apache.spark.api.java.function.Function, org.apache.spark.api.r.RRDD. Let's modify SparkBuild.scala instead - I'll look into what that would require. Can that be done in a follow-up PR?

Contributor:
Sure. Better to be proactive and file a bug to make these interfaces non-Private, and at the same time make sure they're showing up properly in the documentation.

import org.apache.spark.annotation.Private;

/**
* :: Private ::
* An interface for plugging in modules for storing and reading temporary shuffle data.
* <p>
* This is the root of a plugin system for storing shuffle bytes to arbitrary storage
* backends in the sort-based shuffle algorithm implemented by the
* {@link org.apache.spark.shuffle.sort.SortShuffleManager}. If another shuffle algorithm is
Contributor:
It would be good to check how these links render in the final documentation, since as I mentioned that package is removed from public docs.

Member:
Seems like the link isn't made.
[Screenshot of the rendered Javadoc (2019-08-01) showing the unlinked class reference]

* needed instead of sort-based shuffle, one should implement
* {@link org.apache.spark.shuffle.ShuffleManager} instead.
* <p>
* A single instance of this module is loaded per process in the Spark application.
* The default implementation reads and writes shuffle data from the local disks of
* the executor, and is the implementation of shuffle file storage that has remained
* consistent throughout most of Spark's history.
* <p>
* Alternative implementations of shuffle data storage can be loaded via setting
* spark.shuffle.io.plugin.class.
Contributor:
Nit: wrap this in <code></code>.

* @since 3.0.0
*/
@Private
Member:
Question from SPARK-28568. Is it an API or not? Looks so given the PR description. @Private is:

> This should be used only when the standard Scala / Java means of protecting classes are insufficient. In particular, Java has no equivalent of private[spark], so we use this annotation in its place.

So @Private doesn't look like it's meant for APIs. Shall we change it to @Unstable (maybe with an explicit warning)?

Contributor:
@HyukjinKwon it'll all eventually be @Experimental, but we decided to start by making it @Private just in case spark 3.0 gets released in the middle. (discussed here: #25007 (comment))

Looks like we forgot to file a follow-up JIRA about that; I just filed https://issues.apache.org/jira/browse/SPARK-28592

Member:
Ah, okie. That's good.
My impression was that @Unstable guarantees less than @Experimental. Maybe we can consider this point as well later.

public interface ShuffleDataIO {

Contributor Author:
I'll add some JavaDoc explaining the difference between the ShuffleManager plugin and this plugin system.

Comment:
A question that may be naive: why do we choose Java over Scala? I see that Spark classes, except the ones dealing with underlying memory, are written in Scala...

Contributor:
As a public interface, it is better to use Java, so that other users can implement it in Java, Scala, or other JVM languages.

If we defined the APIs using Scala, users could mostly only implement them in Scala, unless the APIs were carefully designed to avoid Scala-specific features so that they could also be implemented from Java.


/**
* Called once on executor processes to bootstrap the shuffle data storage modules that
* are only invoked on the executors.
* <p>
* At this point, this module is responsible for reading and writing shuffle data bytes
Contributor:
Remove this paragraph.

* from the backing store.
*/
ShuffleExecutorComponents executor();
}
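
To sketch how a plugin would hook in end to end (purely illustrative: the package, class names, and wiring below are made up, and only the interfaces added in this PR are real):

package com.example.shuffle;  // hypothetical package

import org.apache.spark.shuffle.api.ShuffleDataIO;
import org.apache.spark.shuffle.api.ShuffleExecutorComponents;

/**
 * Hypothetical plugin entry point. Spark would instantiate this class on each
 * executor process when the class name is configured as the shuffle IO plugin.
 */
public class MyShuffleDataIO implements ShuffleDataIO {

  @Override
  public ShuffleExecutorComponents executor() {
    // Return the executor-side components; MyShuffleExecutorComponents is a
    // made-up class sketched alongside ShuffleExecutorComponents below.
    return new MyShuffleExecutorComponents();
  }
}

Assuming the configuration key lands as documented above, such a plugin would be enabled by setting spark.shuffle.io.plugin.class=com.example.shuffle.MyShuffleDataIO.
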
45 changes: 45 additions & 0 deletions core/src/main/java/org/apache/spark/shuffle/api/ShuffleExecutorComponents.java
@@ -0,0 +1,45 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.api;

import org.apache.spark.annotation.Private;

/**
* :: Private ::
* An interface for building shuffle support for Executors.
*
* @since 3.0.0
*/
@Private
public interface ShuffleExecutorComponents {

/**
* Called once per executor to bootstrap this module with state that is specific to
* that executor, specifically the application ID and executor ID.
*/
void initializeExecutor(String appId, String execId);

/**
* Returns the modules that are responsible for persisting shuffle data to the backing
* store.
* <p>
* This may be called multiple times on each executor. Implementations should not make
* any assumptions about the lifetime of the returned module.
*/
ShuffleWriteSupport writes();
Contributor:
Is it better to name this writes or write?

Contributor Author:
Don't have a strong preference either way.

}
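
Continuing the hypothetical sketch from ShuffleDataIO above, an executor-components implementation might simply record the IDs it is initialized with and hand out a write-support instance. MyShuffleWriteSupport is another made-up class, sketched alongside ShuffleWriteSupport further below.

package com.example.shuffle;  // hypothetical package

import org.apache.spark.shuffle.api.ShuffleExecutorComponents;
import org.apache.spark.shuffle.api.ShuffleWriteSupport;

public class MyShuffleExecutorComponents implements ShuffleExecutorComponents {

  // Captured once per executor; a real plugin might use these for logging or
  // to build storage paths.
  private String appId;
  private String execId;

  @Override
  public void initializeExecutor(String appId, String execId) {
    this.appId = appId;
    this.execId = execId;
  }

  @Override
  public ShuffleWriteSupport writes() {
    // May be called multiple times, and no assumptions are made about how long
    // the returned module lives, so hand out a cheap instance each time.
    return new MyShuffleWriteSupport(appId);
  }
}
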
73 changes: 73 additions & 0 deletions core/src/main/java/org/apache/spark/shuffle/api/ShuffleMapOutputWriter.java
@@ -0,0 +1,73 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.api;

import java.io.IOException;

import org.apache.spark.annotation.Private;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;

/**
* :: Private ::
* A top-level writer that returns child writers for persisting the output of a map task,
* and then commits all of the writes as one atomic operation.
*
* @since 3.0.0
*/
@Private
public interface ShuffleMapOutputWriter {

/**
* Creates a writer that can open an output stream to persist bytes targeted for a given reduce
* partition id.
* <p>

Contributor Author:
I think we want these to have line breaks in the generated HTML. But I'm not sure what the stance is across the rest of the codebase - we can remove these if pretty-formatting with line breaks isn't necessary.

Contributor:
Yeah, I think it is needed for Javadoc, though it's not needed for Scaladoc. IMO it's worth keeping them.

https://www.oracle.com/technetwork/java/javase/documentation/index-137868.html#format

* The chunk corresponds to bytes in the given reduce partition. This will not be called twice
* for the same partition within any given map task. The partition identifier will be in the
* range of precisely 0 (inclusive) to numPartitions (exclusive), where numPartitions was
Comment:
Should we mention that these are called in order?

Contributor Author:
I made the docs more thorough, indicating ordering and also indicating how there's no guarantee that this will be called for an empty partition.

* provided upon the creation of this map output writer via
* {@link ShuffleWriteSupport#createMapOutputWriter(
* int, int, long, int, ShuffleWriteMetricsReporter)}.
* <p>
* Calls to this method will be invoked with monotonically increasing reducePartitionIds; each
Contributor:
How useful is this? I think we can make Spark shuffle more flexible if we don't guarantee this. Do you have a concrete example of how an implementation can leverage this guarantee?

Contributor:
Spark's existing implementation makes this assumption: the index and data files assume the partitions are written in sequential order.

Though it would be really easy to change the index format to allow the order to be random (just include a start and an end offset, rather than having the end be implicit).

* call to this method will be called with a reducePartitionId that is strictly greater than
* the reducePartitionIds given to any previous call to this method. This method is not
* guaranteed to be called for every partition id in the above described range. In particular,
* no guarantees are made as to whether or not this method will be called for empty partitions.
*/
ShufflePartitionWriter getPartitionWriter(int reducePartitionId) throws IOException;

Comment:
Why "calls to this method will be invoked with monotonically increasing reducePartitionIds"? This may cause potential issues in future and cause burden on implementation. for example, if people want to implement multiple partition writers and write shuffle data in parallel. It cannot guarantee monotonically increasing reducePartitionIds.

Contributor Author:
People using this will be using it with SortShuffleManager which has a specific algorithm that won't open streams in parallel. If these invariants are broken, it implies the algorithm has changed, in which case we'd need to reconsider these APIs.
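
As a concrete illustration of the assumption discussed above: the local-disk index file is essentially a list of cumulative byte offsets into the data file, so each partition's end offset is implied by the next partition's start, which only works if partitions are written in increasing order. A rough sketch of that layout (not the actual Spark code; the reader below is an assumption based on the description in this thread):

import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

final class IndexFileSketch {

  // Reads numPartitions + 1 cumulative offsets; offsets[0] is expected to be 0.
  static long[] readOffsets(String indexFile, int numPartitions) throws IOException {
    long[] offsets = new long[numPartitions + 1];
    try (DataInputStream in = new DataInputStream(new FileInputStream(indexFile))) {
      for (int i = 0; i <= numPartitions; i++) {
        offsets[i] = in.readLong();
      }
    }
    return offsets;
  }

  // The end of partition i is implicit in the start of partition i + 1.
  static long partitionLength(long[] offsets, int reduceId) {
    return offsets[reduceId + 1] - offsets[reduceId];
  }
}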


/**
* Commits the writes done by all partition writers returned by all calls to this object's
* {@link #getPartitionWriter(int)}.
* <p>
* This should ensure that the writes conducted by this module's partition writers are
* available to downstream reduce tasks. If this method throws any exception, this module's
* {@link #abort(Throwable)} method will be invoked before propagating the exception.
* <p>
* This can also close any resources and clean up temporary state if necessary.
*/
void commitAllPartitions() throws IOException;
Comment:
Shouldn't this return Optional<MapShuffleLocations>?

Contributor:
@gczsjdy any reason to return Optional<MapShuffleLocations>?

Contributor Author:
We ended up adjusting the API for shuffle locations. This will come later.

Contributor Author:
I believe the SPIP has the latest API.

Comment:
@jerryshao, @mccheah has explained it well: Optional<MapShuffleLocations> lets implementers customize the locations recorded in the driver. @mccheah, will this be in the driver lifecycle sub-issue?

Contributor Author:
Something to that effect yeah - it also has implications on the reader API, but these are concerns to be addressed in subsequent patches.

Comment:
Got it. : )


/**
* Abort all of the writes done by any writers returned by {@link #getPartitionWriter(int)}.
* <p>
* This should invalidate the results of writing bytes. This can also close any resources and
* clean up temporary state if necessary.
*/
void abort(Throwable error) throws IOException;
}
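
To make the calling contract concrete, here is a rough sketch of how a map task could drive a ShuffleMapOutputWriter. This is illustrative only - the real call sites are the shuffle writers changed elsewhere in this PR - and the class, method, and parameter names below are stand-ins.

import java.io.OutputStream;

import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
import org.apache.spark.shuffle.api.ShuffleMapOutputWriter;
import org.apache.spark.shuffle.api.ShufflePartitionWriter;
import org.apache.spark.shuffle.api.ShuffleWriteSupport;

final class MapOutputWriterUsageSketch {

  static void writeMapOutput(
      ShuffleWriteSupport writeSupport,
      int shuffleId,
      int mapId,
      long mapTaskAttemptId,
      byte[][] partitionedData,
      ShuffleWriteMetricsReporter metrics) throws Exception {
    int numPartitions = partitionedData.length;
    ShuffleMapOutputWriter mapWriter = writeSupport.createMapOutputWriter(
        shuffleId, mapId, mapTaskAttemptId, numPartitions, metrics);
    try {
      // Partition ids are requested in strictly increasing order, and empty
      // partitions may be skipped entirely, per the Javadoc above.
      for (int reduceId = 0; reduceId < numPartitions; reduceId++) {
        if (partitionedData[reduceId].length == 0) {
          continue;
        }
        ShufflePartitionWriter partWriter = mapWriter.getPartitionWriter(reduceId);
        // The caller, not the plugin, closes the per-partition stream.
        try (OutputStream out = partWriter.openStream()) {
          out.write(partitionedData[reduceId]);
        }
      }
      mapWriter.commitAllPartitions();
    } catch (Exception e) {
      mapWriter.abort(e);
      throw e;
    }
  }
}
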
95 changes: 95 additions & 0 deletions core/src/main/java/org/apache/spark/shuffle/api/ShufflePartitionWriter.java
@@ -0,0 +1,95 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.api;

import java.io.IOException;
import java.io.OutputStream;

import java.nio.channels.Channels;
import org.apache.spark.annotation.Private;
import org.apache.spark.shuffle.sort.DefaultTransferrableWritableByteChannel;

/**
* :: Private ::
* An interface for opening streams to persist partition bytes to a backing data store.
Contributor:
I'd add that this stores bytes for one (mapper, reducer) pair, which corresponds to one ShuffleBlock.

* <p>
* This writer stores bytes for one (mapper, reducer) pair, corresponding to one shuffle
* block.
*
* @since 3.0.0
*/
@Private
public interface ShufflePartitionWriter {

/**
* Open and return an {@link OutputStream} that can write bytes to the underlying
* data store.
* <p>
* This method will only be called once on this partition writer in the map task, to write the
* bytes to the partition. The output stream will only be used to write the bytes for this
* partition. The map task closes this output stream upon writing all the bytes for this
* block, or if the write fails for any reason.
* <p>
* Implementations that intend on combining the bytes for all the partitions written by this
* map task should reuse the same OutputStream instance across all the partition writers provided
* by the parent {@link ShuffleMapOutputWriter}. If one does so, ensure that
* {@link OutputStream#close()} does not close the resource, since it will be reused across
* partition writes. The underlying resources should be cleaned up in
* {@link ShuffleMapOutputWriter#commitAllPartitions()} and
* {@link ShuffleMapOutputWriter#abort(Throwable)}.
*/
OutputStream openStream() throws IOException;
Contributor:
I think we need to say more here about the lifecycle of this OutputStream. In particular: (a) the framework will only keep one of these OutputStreams open at a time per map task; (b) the framework ensures that the OutputStreams are closed, even if there are any exceptions; and (c) if an individual implementation wants to keep all the output for one map task together (like the index / data file organization of local shuffle output), then they may want to reuse the real underlying OutputStream across all ShufflePartitionWriters of one ShuffleMapOutputWriter.

Contributor Author:
I added more docs.


/**
* Opens and returns a {@link TransferrableWritableByteChannel} for transferring bytes from
* input byte channels to the underlying shuffle data store.
* <p>
* This method will only be called once on this partition writer in the map task, to write the
* bytes to the partition. The channel will only be used to write the bytes for this
* partition. The map task closes this channel upon writing all the bytes for this
* block, or if the write fails for any reason.
* <p>
* Implementations that intend on combining the bytes for all the partitions written by this
* map task should reuse the same channel instance across all the partition writers provided
* by the parent {@link ShuffleMapOutputWriter}. If one does so, ensure that
* {@link TransferrableWritableByteChannel#close()} does not close the resource, since it
* will be reused across partition writes. The underlying resources should be cleaned up in
* {@link ShuffleMapOutputWriter#commitAllPartitions()} and
* {@link ShuffleMapOutputWriter#abort(Throwable)}.
* <p>
* This method is primarily for advanced optimizations where bytes can be copied from the input
* spill files to the output channel without copying data into memory.
* <p>
* The default implementation should be sufficient for most situations. Only override this
Contributor:
Actually I'm not sure this is true, if the goal is to actually provide an optimization. In that case, the default implementation is only sufficient if your stream is a FileInputStream (just checked what Channels.newChannel() does).

Otherwise, the wrapper created will copy data into user memory, basically negating the optimization.

(Which maybe is an argument for returning null here and falling back to the normal IO path when that happens.)

Contributor Author:
What we're saying is that this kind of low-level optimization isn't the first place to look to improve performance most of the time, so to speak. So if one has to do the optimization, they should provide the proper override, but the specific optimization isn't a critical factor to consider outside of the local disk implementation.

Contributor:
So why not follow my suggestion and return null here by default? It makes it much clearer that this implementation is not needed, and that by default the non-NIO path is used.

Contributor Author:
It's primarily to avoid returning null from the API - in that case I'd rather return Optional, with the default returning Optional.empty.

Contributor:
Sure. The main thing is returning something that indicates that this feature is not supported, instead of by default wrapping things in a way that might actually hurt performance.

* method if there is a very specific optimization that needs to be built.
*/
default TransferrableWritableByteChannel openTransferrableChannel() throws IOException {
  return new DefaultTransferrableWritableByteChannel(
      Channels.newChannel(openStream()));
}

/**
* Returns the number of bytes written either by this writer's output stream opened by
* {@link #openStream()} or the byte channel opened by {@link #openTransferrableChannel()}.
* <p>
* This can be different from the number of bytes given by the caller. For example, the
* stream might compress or encrypt the bytes before persisting the data to the backing
* data store.
*/
long getNumBytesWritten();
@hiboyang (Jul 29, 2019):
This class delegates writing to the OutputStream returned by openStream(). Will getNumBytesWritten() in this class access internal state inside that OutputStream? How about letting the OutputStream track the number of bytes written, so this class does not need to access the OutputStream? One possible solution is to add a subclass of OutputStream that tracks the number of bytes written, something like the existing TimeTrackingOutputStream class in Spark, which extends OutputStream.

Contributor Author:
The idea is that if the implementation also supports creating a custom WritableByteChannel, then the number of bytes written would be from that of the channel, not the output stream. One could see us having both a custom output stream and an added method on WritableByteChannelWrapper.

Contributor Author:
Ah I also remember why we didn't attach it to the output stream - it's particularly because of the lifecycle. If we have an output stream for the partition that pads bytes upon closing the stream, it's unclear that one will continue to call methods on the output stream object after it has been closed. That's why we have the contract:

  1. Open stream for writing bytes.
  2. Write bytes
  3. Close stream
  4. Get written bytes for that partition, accounting for the fact that the above step closed the stream.

Comment:
In this case, the OutputStream returned by openStream() is tightly coupled with ShufflePartitionWriter. Could we merge them together into one class, e.g.

ShufflePartitionWriterStream extends OutputStream {
  open();
  getNumBytesWritten();
}

Contributor Author:
An OutputStream instance is considered opened as soon as the object exists, which is why OutputStream extends Closeable. As soon as I have a reference to the OutputStream object I can call write on it to push bytes to the sink. So having a separate open method doesn't make sense.

The open method belongs in the ShufflePartitionWriter API, which is effectively what we have with openStream and openChannel.

Comment:
Oh, I mean that the OutputStream returned by openStream() is tightly coupled with ShufflePartitionWriter, so I suggest merging them together. For example, rename ShufflePartitionWriter to ShufflePartitionWriterStream, which extends OutputStream:

ShufflePartitionWriterStream extends OutputStream {
void open();
long getNumBytesWritten();
}

In this case, the user does not need to create a ShufflePartitionWriter and then call its openStream() method to get an OutputStream. Instead, the user creates a ShufflePartitionWriterStream, which is already an OutputStream.

Contributor Author:
But again, do we call getNumBytesWritten before or after calling close on this object? If before, does it include the bytes that might be padded when closing the stream? If after, are we going to be invoking methods on a closed resource, and is that reasonable?

}
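
Tying the thread above together, here is one hypothetical way a plugin that concatenates all partitions into a single shared stream could satisfy the documented contract: close() on the per-partition stream does not close the shared resource, and the byte count remains readable after the stream is closed. All class names here are made up for illustration.

import java.io.FilterOutputStream;
import java.io.IOException;
import java.io.OutputStream;

import org.apache.spark.shuffle.api.ShufflePartitionWriter;

final class ConcatenatingPartitionWriter implements ShufflePartitionWriter {

  // Owned and eventually closed by the parent ShuffleMapOutputWriter in
  // commitAllPartitions() or abort(), not by this partition writer.
  private final OutputStream sharedStream;
  private long bytesWritten;

  ConcatenatingPartitionWriter(OutputStream sharedStream) {
    this.sharedStream = sharedStream;
  }

  @Override
  public OutputStream openStream() throws IOException {
    // Count bytes and swallow close() so the shared stream stays open for the
    // next partition; openTransferrableChannel() keeps its default behavior.
    return new FilterOutputStream(sharedStream) {
      @Override
      public void write(int b) throws IOException {
        out.write(b);
        bytesWritten++;
      }

      @Override
      public void write(byte[] b, int off, int len) throws IOException {
        out.write(b, off, len);
        bytesWritten += len;
      }

      @Override
      public void close() throws IOException {
        flush();  // deliberately do not close the shared stream
      }
    };
  }

  @Override
  public long getNumBytesWritten() {
    // Safe to call after the per-partition stream returned above was closed.
    return bytesWritten;
  }
}
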
57 changes: 57 additions & 0 deletions core/src/main/java/org/apache/spark/shuffle/api/ShuffleWriteSupport.java
@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.api;

import java.io.IOException;

import org.apache.spark.annotation.Private;
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;

/**
* :: Private ::
* A module that returns shuffle writers to persist data that is written by shuffle map tasks.
*
* @since 3.0.0
*/
@Private
public interface ShuffleWriteSupport {
Comment:
This layer has already been removed. : )


/**
* Called once per map task to create a writer that will be responsible for persisting all the
* partitioned bytes written by that map task.
*
* @param shuffleId Unique identifier for the shuffle stage of the map task
Contributor:
Actually an identifier of the shuffle itself, not the stage, right? If you reuse a shuffle, you get the same shuffle id, but different stage id.

* @param mapId Within the shuffle stage, the identifier of the map task
* @param mapTaskAttemptId Identifier of the task attempt. Multiple attempts of the same map task
* with the same (shuffleId, mapId) pair can be distinguished by the
* different values of mapTaskAttemptId.
* @param numPartitions The number of partitions that will be written by the map task. Some of
* these partitions may be empty.
* @param mapTaskWriteMetrics The map task's write metrics, which can be updated by the returned
* writer. The updates that are posted to this reporter are listed in
* the Spark UI. Note that the caller will update the total write time
* at the end of the map task, so implementations should not call
* {@link ShuffleWriteMetricsReporter#incWriteTime(long)}.
*/
ShuffleMapOutputWriter createMapOutputWriter(
    int shuffleId,
    int mapId,
    long mapTaskAttemptId,
    int numPartitions,
    ShuffleWriteMetricsReporter mapTaskWriteMetrics) throws IOException;
Contributor:
There are two problems here:

  • ShuffleWriteMetricsReporter is private[spark]
  • as I mentioned, the package it's on is not publicly documented, so is not considered a public API.

If this will be exposed, it needs to be moved inside the api package.

But on a separate note, is this needed? I remember some discussion about this but don't remember what metrics the plugin is expected to update... it seems to me all metrics are already updated by the shuffle code itself (e.g. BypassMergeSortShuffleWriter, ShuffleExternalSorter, etc).

Contributor Author:
This is in response to this thread: #25007 (comment). I'd much rather not have this be part of the API and would rather have implementations call TaskContext.get() to get the metrics reporter for the task.

Contributor Author:
@squito for further thoughts.

Contributor:
Imran, IIRC, will only be back next week, so unless you're OK with waiting, we probably should remove this and re-add it later, after we figure out exactly what's needed.

Contributor Author:
I was also out Thursday and Friday of last week, so now we can coordinate on this together.

ping @squito for thoughts on this matter.

Contributor:
Sorry for the delays from me. After a closer look, I actually am pretty sure we should remove this from the API, and also any use of it from LocalDiskShuffleMapOutputWriter. That also means the test change I was originally commenting on, which sets the TaskContext, could be removed as well.

I think the current code in this patch is wrong; it's double counting the write time for the final merged file. The original code did not create a TimeTrackingOutputStream for the merged file -- it just counted the time for the total creation of that file.

https://github.com/apache/spark/blob/master/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java#L190-L213

And it seems like we'd have to do that, as we might be using a channel there, and then we wouldn't have an equivalent way of doing it for the channel.

The current code in this PR does something similar here in BypassMergeSortShuffleWriter.writePartitionedData(): https://github.com/apache/spark/pull/25007/files#diff-8b6b7a5dadc0d8e97307d0f8e8378d8fR247

But it's also passing that to the LocalDiskShuffleMapOutputWriter in a TimeTrackingOutputStream: https://github.com/apache/spark/pull/25007/files#diff-17636cf695d4c63ea3e15c3d71d63707R133

Sorry, I should have looked at the use of those metrics more closely in the first place. But I think this means we can remove that metrics object from the API entirely.

}
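
Continuing the hypothetical plugin sketch from the earlier files, a write-support implementation mostly just decides where a given map task attempt's output should live. MyMapOutputWriter is a made-up class standing in for a full ShuffleMapOutputWriter implementation.

import java.io.IOException;

import org.apache.spark.shuffle.ShuffleWriteMetricsReporter;
import org.apache.spark.shuffle.api.ShuffleMapOutputWriter;
import org.apache.spark.shuffle.api.ShuffleWriteSupport;

final class MyShuffleWriteSupport implements ShuffleWriteSupport {

  private final String appId;

  MyShuffleWriteSupport(String appId) {
    this.appId = appId;
  }

  @Override
  public ShuffleMapOutputWriter createMapOutputWriter(
      int shuffleId,
      int mapId,
      long mapTaskAttemptId,
      int numPartitions,
      ShuffleWriteMetricsReporter metrics) throws IOException {
    // (shuffleId, mapId, mapTaskAttemptId) uniquely identifies this attempt's
    // output in the backing store.
    String outputKey = appId + "/" + shuffleId + "/" + mapId + "/" + mapTaskAttemptId;
    return new MyMapOutputWriter(outputKey, numPartitions);
  }
}
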
59 changes: 59 additions & 0 deletions core/src/main/java/org/apache/spark/shuffle/api/TransferrableWritableByteChannel.java
@@ -0,0 +1,59 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.shuffle.api;

import java.io.Closeable;
import java.io.IOException;

import java.nio.channels.FileChannel;
import java.nio.channels.WritableByteChannel;
import org.apache.spark.annotation.Private;
Contributor:
nit: wrong import grouping.


/**
* :: Private ::
* Represents an output byte channel that can copy bytes from input file channels to some
* arbitrary storage system.
* <p>
* This API is provided for advanced users who can transfer bytes from a file channel to
* some output sink without copying data into memory. Most users should not need to use
* this functionality; this is primarily provided for the built-in shuffle storage backends
* that persist shuffle files on local disk.
* <p>
* For a simpler alternative, see {@link ShufflePartitionWriter}.
*
* @since 3.0.0
*/
@Private
public interface TransferrableWritableByteChannel extends Closeable {

/**
* Copy all bytes from the source readable byte channel into this byte channel.
* <p>
* This method should block until all of the bytes from the source (that is, up until
* numBytesToTransfer) are available in the output storage layer.
*
* @param source File to transfer bytes from. Do not call anything on this channel other than
Contributor:
This description is a bit weird. Why the restriction on what method can be called on the input?

That sounds to me like the abstraction is wrong, and this interface maybe shouldn't exist, and instead plugins should be returning a WritableByteChannel.

That way the input is never exposed to the plugin and whatever problem that restriction is expected to solve would not exist.

Contributor Author:
We discussed why this can't return a WritableByteChannel in this thread: palantir#535 (comment)

Contributor Author (@mccheah, Jul 24, 2019):
Let me try to summarize the discussion from that thread here so that we have some record of the decision in the mainline repository.

Basically this comes from a need to block the close method from closing the underlying channel, but still keeping the optimization of FileChannel#transferTo(otherFileChannel).

We can't return any custom implementation of WritableByteChannel here that shields closing, because FileChannel#transferTo(writableByteChannel) needs to specifically receive a WritableByteChannel that is an instance of FileChannel. But if the WritableByteChannel is indeed an instance of FileChannel, then FileChannel#close() will force the underlying channel to close, which is not what we want to do particularly for the local disk storage implementation given here.

I agree that the abstraction here is convoluted. I proposed having an abstraction on top of the input FileChannel to prevent the plugin system from calling arbitrary methods on the input, but we decided that wasn't necessary in palantir#535 (comment).

Contributor:
> Basically this comes from a need to block the close method from closing the underlying channel

I see. Still this is a really convoluted interface, which makes me more strongly prefer that this be hidden as an optimization in the internal, local file implementation, instead of being exposed in the public API.

If we're really exposing this in the public API, I think it would be better to have an interface that makes it clear what the intent is here: provide a FileChannel (and not any other kind of channel) that, depending on the plugin, should not be closed. So something like:

interface FileChannelWrapper {

  FileChannel openChannel();

  boolean shouldClose();

}

Plugins by default would just return a null wrapper, and then the code calling this could do:

FileChannelWrapper wrapper = transferToEnabled ? writer.fileChannelWrapper() : null;
if (wrapper != null) {
  FileChannel channel = wrapper.openChannel();
  Utils.copyBlahBlahBlah();
  if (wrapper.shouldClose()) channel.close();
} else {
  // non-nio code path.
}

It's not great, but it's (i) less code than you have here and (ii) much clearer about the actual intent of this API, without the restrictions about what methods should and should not be called.

Contributor:
Another option is to just return the FileChannel from ShufflePartitionWriter, and have the shuffle code never close the returned channel. Instead, the ShuffleMapOutputWriter would do it when it's safe to do so.

That's a little awkward though since it would basically codify into the interface what is just a suggestion in openStream() (where it says you should override close() if you don't want the underlying stream to be closed).

But that kinda gives me an idea for tweaking the above interface a bit; have this in ShufflePartitionWriter instead:

FileChannel openChannel();

default void closeChannel(FileChannel channel) { channel.close(); }

If you do the same for the openStream() method (i.e. have a closeStream()), then the API is symmetrical, and you avoid the use of stream wrappers to block the call to close().

Contributor Author:
I went with a different sort of abstraction that captures the spirit of this idea in the latest patch. Please take a look and let me know what you think.

Contributor Author:
Ah, I didn't see your follow-up comment. I'm not as fond of passing the resource back to the plugin to ask the plugin to close it - that isn't a very common programming idiom with respect to streams and resources; usually the close method is attached to the closeable resource itself.

I liked the channel wrapper abstraction. We used to have the close method attached to the ShufflePartitionWriter and only closed the partition writer itself and not the channel, but that seemed prone to error on the plugin writer's side - since the partition writer returns a closeable resource itself, should the plugin expect the closeable resource it returns to be closed, or should it expect ShufflePartitionWriter#close to close the resource? Making openStream and openChannel return closeable resources is clearer with respect to the lifecycle of the returned resource.

Contributor:
Commenting here to keep context, even though this code doesn't exist anymore.

I see you implemented basically my first suggestion. I think that if you instead make ShufflePartitionWriter implement Closeable, you can achieve the same goal with fewer things for people to implement. (openChannel() would just return a WritableByteChannel, and close() would close whatever was opened by either open call.)

It could even simplify some code in BypassMergeSortShuffleWriter since now the close() call would be the same regardless of whether it's a stream or a channel.

But that's minor now, and I'm just trying to reduce the amount of code needed for this optimization to work. Current version seems fine otherwise.

Contributor Author:
I prefer this approach over making ShufflePartitionWriter implement Closeable, because ShufflePartitionWriter would return a Closeable resource (OutputStream or WritableByteChannel) that itself is not closed by the caller. That's why we originally went with TransferrableWritableByteChannel which itself is Closeable and doesn't return any Closeable objects. I think this is an in-between version, but it's unfortunate that WritableByteChannelWrapper itself returns a WritableByteChannel that itself isn't closed.

* {@link FileChannel#transferTo(long, long, WritableByteChannel)}.
* @param transferStartPosition Start position of the input file to transfer from.
* @param numBytesToTransfer Number of bytes to transfer from the given source.
*/
void transferFrom(
    FileChannel source,
    long transferStartPosition,
    long numBytesToTransfer) throws IOException;
}
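
For reference, a transferFrom implementation that copies into an arbitrary WritableByteChannel might look roughly like the loop below. This is a sketch of what a default implementation could do, not necessarily the code in DefaultTransferrableWritableByteChannel.

import java.io.IOException;
import java.nio.channels.FileChannel;
import java.nio.channels.WritableByteChannel;

import org.apache.spark.shuffle.api.TransferrableWritableByteChannel;

final class DelegatingTransferrableChannel implements TransferrableWritableByteChannel {

  private final WritableByteChannel delegate;

  DelegatingTransferrableChannel(WritableByteChannel delegate) {
    this.delegate = delegate;
  }

  @Override
  public void transferFrom(
      FileChannel source,
      long transferStartPosition,
      long numBytesToTransfer) throws IOException {
    // FileChannel#transferTo may move fewer bytes than requested, so loop
    // until the full range has been handed to the delegate channel.
    long transferred = 0;
    while (transferred < numBytesToTransfer) {
      long count = source.transferTo(
          transferStartPosition + transferred,
          numBytesToTransfer - transferred,
          delegate);
      if (count <= 0) {
        break;
      }
      transferred += count;
    }
    if (transferred < numBytesToTransfer) {
      throw new IOException("Expected to transfer " + numBytesToTransfer
          + " bytes but only transferred " + transferred);
    }
  }

  @Override
  public void close() throws IOException {
    delegate.close();
  }
}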