Commits
61 commits
fd93e59
add small text files input API
yinxusen Mar 18, 2014
9bf87d4
Merge branch 'master' into small-files-input
yinxusen Mar 18, 2014
e3681f2
Spark 1246 add min max to stat counter
dwmclary Mar 18, 2014
e7423d4
Revert "SPARK-1236 - Upgrade Jetty to 9.1.3.v20140225."
pwendell Mar 18, 2014
c27a7ab
fix errors and refine code
yinxusen Mar 18, 2014
2fa26ec
SPARK-1102: Create a saveAsNewAPIHadoopDataset method
CodingCat Mar 18, 2014
79e547f
Update copyright year in NOTICE to 2014
mateiz Mar 18, 2014
e108b9a
[SPARK-1260]: faster construction of features with intercept
mengxr Mar 18, 2014
f9d8a83
[SPARK-1266] persist factors in implicit ALS
mengxr Mar 19, 2014
cc2655a
Fix SPARK-1256: Master web UI and Worker web UI returns a 404 error
witgo Mar 19, 2014
a18ea00
Bundle tachyon: SPARK-1269
nicklan Mar 19, 2014
d55ec86
bugfix: Wrong "Duration" in "Active Stages" in stages page
BlackNiuza Mar 19, 2014
6112270
SPARK-1203 fix saving to hdfs from yarn
tgravescs Mar 19, 2014
ab747d3
Bugfixes/improvements to scheduler
mridulm Mar 19, 2014
79d07d6
[SPARK-1132] Persisting Web UI through refactoring the SparkListener …
andrewor14 Mar 19, 2014
67fa71c
Added doctest for map function in rdd.py
jyotiska Mar 19, 2014
1678931
SPARK-1099:Spark's local mode should probably respect spark.cores.max…
Mar 19, 2014
ffe272d
Revert "SPARK-1099:Spark's local mode should probably respect spark.c…
aarondav Mar 20, 2014
7d22941
remove merge process from smallTextFiles interface
yinxusen Mar 20, 2014
66a03e5
Principal Component Analysis
rezazadeh Mar 20, 2014
ca76423
[Hot Fix #42] Do not stop SparkUI if bind() is not called
andrewor14 Mar 20, 2014
9aadcff
SPARK-1251 Support for optimizing and executing structured queries
marmbrus Mar 21, 2014
e09139d
Fix maven jenkins: Add explicit init for required tables in SQLQueryS…
marmbrus Mar 21, 2014
78c0f25
remove useless code and consideration, neaten the code style
yinxusen Mar 21, 2014
7e17fe6
Add hive test files to repository. Remove download script.
marmbrus Mar 21, 2014
2c0aa22
SPARK-1279: Fix improper use of SimpleDateFormat
zsxwing Mar 21, 2014
dab5439
Make SQL keywords case-insensitive
mateiz Mar 21, 2014
d780983
Add asCode function for dumping raw tree representations.
marmbrus Mar 21, 2014
646e554
Fix to Stage UI to display numbers on progress bar
emtiazahmed Mar 22, 2014
d348362
fix code style problem and rewrite the testsuite for simplicity.
yinxusen Mar 22, 2014
abf6714
SPARK-1254. Supplemental fix for HTTPS on Maven Central
srowen Mar 23, 2014
57a4379
[SPARK-1292] In-memory columnar representation for Spark SQL
liancheng Mar 23, 2014
8265dc7
Fixed coding style issues in Spark SQL
liancheng Mar 23, 2014
80c2968
[SPARK-1212] Adding sparse data support and update KMeans
mengxr Mar 24, 2014
eae90e4
refine code documents
yinxusen Mar 24, 2014
839bd3f
remove the use of commons-io
yinxusen Mar 24, 2014
21109fb
SPARK-1144 Added license and RAT to check licenses.
ScrapCodes Mar 24, 2014
56db8a2
HOT FIX: Exclude test files from RAT
pwendell Mar 24, 2014
8043b7b
SPARK-1294 Fix resolution of uppercase field names using a HiveContext.
marmbrus Mar 25, 2014
dc126f2
SPARK-1094 Support MiMa for reporting binary compatibility accross ve…
pwendell Mar 25, 2014
5140598
SPARK-1128: set hadoop task properties when constructing HadoopRDD
CodingCat Mar 25, 2014
b637f2d
Unify the logic for column pruning, projection, and filtering of tabl…
marmbrus Mar 25, 2014
007a733
SPARK-1286: Make usage of spark-env.sh idempotent
aarondav Mar 25, 2014
05ed628
move wholefile interface from MLUtils to MLContext
yinxusen Mar 25, 2014
134ace7
Add more hive compatability tests to whitelist
marmbrus Mar 25, 2014
71d4ed2
SPARK-1316. Remove use of Commons IO
srowen Mar 25, 2014
f8111ea
SPARK-1319: Fix scheduler to account for tasks using > 1 CPUs.
shivaram Mar 25, 2014
8237df8
Avoid Option while generating call site
witgo Mar 25, 2014
f87dab8
fix logic error
yinxusen Mar 26, 2014
4f7d547
Initial experimentation with Travis CI configuration
marmbrus Mar 26, 2014
b859853
SPARK-1321 Use Guava's top k implementation rather than our BoundedPr…
rxin Mar 26, 2014
3b69987
change from Java code to Scala
yinxusen Mar 26, 2014
a0853a3
SPARK-1322, top in pyspark should sort result in descending order.
ScrapCodes Mar 26, 2014
345825d
Unified package definition format in Spark SQL
liancheng Mar 26, 2014
b0ea02a
modify scala doc, and add space after 'if'
yinxusen Mar 26, 2014
32cbdfd
[SQL] Un-ignore a test that is now passing.
marmbrus Mar 27, 2014
e15e574
[SQL] Add a custom serializer for maps since they do not have a no-ar…
marmbrus Mar 27, 2014
be6d96c
SPARK-1324: SparkUI Should Not Bind to SPARK_PUBLIC_DNS
pwendell Mar 27, 2014
3e63d98
Spark 1095 : Adding explicit return types to all public methods
NirmalReddy Mar 27, 2014
1fa48d9
SPARK-1325. The maven build error for Spark Tools
srowen Mar 27, 2014
4ed60d1
rebase to the latest trunk to merge
yinxusen Mar 27, 2014
1 change: 1 addition & 0 deletions .gitignore
@@ -45,3 +45,4 @@ dist/
spark-*-bin.tar.gz
unit-tests.log
/lib/
mllib/build/
@@ -0,0 +1,52 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.mllib.util;
Contributor
Instead of util, how about io or input?

Contributor Author
I think input is better, because the input to a machine learning algorithm is usually a dataset (batch or streaming files in various formats), while the output is usually models or prediction results, which have a different semantic meaning.


import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
* An InputFormat that reads files from HDFS or local disk. It will be called by
* HadoopRDD to generate new BatchFileRecordReader instances.
*/
public class BatchFileInputFormat
Contributor
This is basically the WholeFileInputFormat for Text:

https://github.com/tomwhite/hadoop-book/blob/master/ch07/src/main/java/WholeFileInputFormat.java

Shall we call it WholeTextFileInputFormat?

Contributor Author
I think not; the meaning here is different. A WholeTextFileInputFormat extending FileInputFormat would be used to read an entire, possibly huge, file from HDFS or local disk, but I called this one BatchFileInputFormat because it reads a bunch of files, not just a single file, and it extends CombineFileInputFormat. I think I should rename it BatchFilesInputFormat.

Contributor
What is the value here? Is it a line from a text file or the whole file? If the value is an arbitrary Text converted from bytes, how do you split a file?

Contributor Author
It depends. If the file length is smaller than MAX_BYTES_ALLOCATION, it reads the whole file out in one pass. Otherwise, it reads the file out in several chunks. Fragments of a file are then merged back together in the smallTextFiles() interface.

extends CombineFileInputFormat<String, Text> {
Contributor
The indentation you used is not consistent with the Spark code style. This line may fit on the line above.

Contributor Author
Sorry for my carelessness. I'll fix it.


@Override
protected boolean isSplitable(JobContext context, Path file) {
return false;
}
@Override
public RecordReader<String, Text> createRecordReader(
InputSplit split,
TaskAttemptContext context) throws IOException {
return new CombineFileRecordReader<String, Text>(
(CombineFileSplit)split,
Contributor
put a space after )

Contributor Author
Yep, I will add it.

context,
(Class)BatchFileRecordReader.class);
}
}
@@ -0,0 +1,117 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.spark.mllib.util;

import java.io.IOException;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

/**
* Reads an entire file out in bytes format in <filename, content> format.
Contributor
Are you reading the entire file in bytes?

Contributor Author
Over the life cycle of an instance of this class, yes. The split(index) here refers to a single file, because I set isSplitable() to false in BatchFileInputFormat.

nextKeyValue() can be called several times, until no bytes are left in split(index); it is driven by the CombineFileRecordReader's nextKeyValue() and, eventually, by compute() in HadoopRDD. It reads an entire file in bytes, and I encapsulate those bytes in a Text.

Contributor
CombineFileInputFormat is meant to combine splits, not records. If the file is long and you cut it at MAX_BYTES_ALLOCATION, how do you guarantee that this is a valid cut for UTF-8 text?

Contributor Author
This is indeed a mistake that I overlooked.

I use MAX_BYTES_ALLOCATION here to avoid the scenario where a file length is larger than what an Int.MaxValue can represent. The Mahout implementation simply ignores this case and casts the long directly to an int. Even though the semantics of the interface are "small files", we should not limit the input file length for end users.

I just treat the content as a byte array encapsulated in a Text, and I will join the slices of a single file back together in the smallTextFiles() interface. Due to the locality of the split, I can merge them without a shuffle. But I forgot that in this version; I'll fix it now.
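As an illustration of that merge step, a minimal sketch in Scala of how per-file fragments could be reassembled within a partition is shown below. mergeFragments is a hypothetical helper, not part of this patch, and it assumes that all chunks of a given file arrive consecutively in the same partition:

import scala.collection.mutable

import org.apache.spark.rdd.RDD

// Hypothetical sketch: glue per-file chunks back together without a shuffle,
// assuming every chunk of a given file sits in the same partition, in order.
def mergeFragments(chunks: RDD[(String, String)]): RDD[(String, String)] = {
  chunks.mapPartitions { iter =>
    val merged = mutable.LinkedHashMap.empty[String, StringBuilder]
    iter.foreach { case (fileName, chunk) =>
      merged.getOrElseUpdate(fileName, new StringBuilder).append(chunk)
    }
    merged.iterator.map { case (fileName, content) => (fileName, content.toString) }
  }
}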

*/

public class BatchFileRecordReader extends RecordReader<String, Text> {
private long startOffset;
private long end;
private long pos;
private Path path;

private static final int MAX_BYTES_ALLOCATION = 64 * 1024 * 1024;

private String key = null;
private Text value = null;

private FSDataInputStream fileIn;

public BatchFileRecordReader(
CombineFileSplit split,
TaskAttemptContext context,
Integer index)
throws IOException {
path = split.getPath(index);
startOffset = split.getOffset(index);
pos = startOffset;
end = startOffset + split.getLength(index);

FileSystem fs = path.getFileSystem(context.getConfiguration());
fileIn = fs.open(path);
fileIn.seek(startOffset);
}

@Override
public void initialize(InputSplit arg0, TaskAttemptContext arg1)
throws IOException, InterruptedException {}

@Override
public void close() throws IOException {
if (fileIn != null) {
fileIn.close();
}
}

@Override
public float getProgress() throws IOException {
if (startOffset == end) return 0;
Contributor
Should this return 1.0f?

Contributor Author
Yeah, but the check should be pos == end rather than startOffset == end; I made a mistake here.

if (pos == end) return 1.0f; would be correct.
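For illustration only, the intended progress calculation transliterated to Scala (the actual patch is Java; this is just a sketch of the fix discussed above):

// Hypothetical sketch of the corrected logic: report 1.0 once the split is
// fully consumed, otherwise the fraction of the split read so far.
def progress(startOffset: Long, pos: Long, end: Long): Float = {
  if (pos >= end) 1.0f
  else math.min(1.0f, (pos - startOffset).toFloat / (end - startOffset))
}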

return Math.min(1.0f, (pos - startOffset) / (float) (end - startOffset));
}

@Override
public String getCurrentKey() throws IOException, InterruptedException {
return key;
}

@Override
public Text getCurrentValue() throws IOException, InterruptedException{
return value;
}

@Override
public boolean nextKeyValue() throws IOException {
if (key == null) {
key = path.getName();
}
if (value == null) {
value = new Text();
}

if (pos >= end) {
return false;
}

int maxBufferLength = end - pos < Integer.MAX_VALUE ? (int) (end - pos) : Integer.MAX_VALUE;
if (maxBufferLength > MAX_BYTES_ALLOCATION) {
maxBufferLength = MAX_BYTES_ALLOCATION;
}

byte[] innerBuffer = new byte[maxBufferLength];

int len = fileIn.read(pos, innerBuffer, 0, maxBufferLength);
pos += len;

value.set(innerBuffer, 0, len);

return true;
}
}
23 changes: 21 additions & 2 deletions mllib/src/main/scala/org/apache/spark/mllib/util/MLUtils.scala
@@ -17,11 +17,12 @@

package org.apache.spark.mllib.util

import org.apache.hadoop.io.Text
Contributor
organize imports

Contributor Author
I think org.apache.hadoop.io.Text should come first here, because it belongs to a third-party library. The following import, org.jblas.DoubleMatrix, is also a third-party class, so I think keeping them in lexicographical order is better.

Contributor
okay

import org.jblas.DoubleMatrix

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._

import org.jblas.DoubleMatrix
import org.apache.spark.mllib.regression.LabeledPoint

/**
@@ -120,4 +121,22 @@ object MLUtils {
}
sum
}

/**
* Reads a bunch of small files from HDFS, or a local file system (available on all nodes), or any
* Hadoop-supported file system URI, and returns an RDD[(String, String)].
*
* @param path The directory you should specify, such as
* hdfs://[address]:[port]/[dir]
*
* @return RDD[(fileName: String, content: String)]
* i.e. the first element is the file name and the second is its content.
Contributor
No need to repeat RDD[(fileName: String, content: String)].

*/
def smallTextFiles(sc: SparkContext, path: String): RDD[(String, String)] = {
Contributor
The current API uses textFile to load text files. I suggest using wholeTextFile to load the entire content of each file. small is not a good name here because it is not defined.

sc.newAPIHadoopFile(
path,
classOf[BatchFileInputFormat],
classOf[String],
classOf[Text]).mapValues(_.toString)
}
}
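To round out the discussion, a hedged usage sketch of the proposed API follows. The directory path and application name are hypothetical, and the method name follows this revision of the patch rather than the wholeTextFile rename suggested above:

import org.apache.spark.SparkContext
import org.apache.spark.SparkContext._
import org.apache.spark.mllib.util.MLUtils

// Hypothetical driver: load every file under a directory as (fileName, content)
// pairs and print the content length of each file.
object SmallTextFilesExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local", "SmallTextFilesExample")
    val files = MLUtils.smallTextFiles(sc, "hdfs://localhost:9000/user/data/")
    files.mapValues(_.length).collect().foreach { case (name, len) =>
      println(name + ": " + len + " characters")
    }
    sc.stop()
  }
}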