[SPARK-2759][CORE] Generic Binary File Support in Spark #1658
Changes from 2 commits
@@ -0,0 +1,102 @@
package org.apache.spark.input

import scala.collection.JavaConversions._
import com.google.common.io.{ByteStreams, Closeables}
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.InputSplit
import org.apache.hadoop.mapreduce.JobContext
import org.apache.hadoop.mapreduce.RecordReader
import org.apache.hadoop.mapreduce.TaskAttemptContext
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit
/**
 * The new (Hadoop 2.0) InputFormat for whole binary files
 * (not to be confused with the RecordReader itself).
 */
@serializable abstract class BinaryFileInputFormat[T]
  extends CombineFileInputFormat[String, T] {

  override protected def isSplitable(context: JobContext, file: Path): Boolean = false

  /**
   * Allow minPartitions to be set by the end user in order to keep compatibility with the
   * old Hadoop API.
   */
  def setMaxSplitSize(context: JobContext, minPartitions: Int) {
    val files = listStatus(context)
    val totalLen = files.map { file =>
      if (file.isDir) 0L else file.getLen
    }.sum

    /** val maxSplitSize = Math.ceil(totalLen * 1.0 /
      (if (minPartitions == 0) 1 else minPartitions)).toLong **/
    val maxSplitSize = Math.ceil(totalLen * 1.0 / files.length).toLong
    super.setMaxSplitSize(maxSplitSize)
  }

  def createRecordReader(split: InputSplit, taContext: TaskAttemptContext): RecordReader[String, T]
}
/**
 * A [[org.apache.hadoop.mapreduce.RecordReader RecordReader]] for reading a single whole binary
 * file out as a key-value pair, where the key is the file path and the value is the entire
 * content of the file parsed into a value of type T.
 */
@serializable abstract class BinaryRecordReader[T](
    split: CombineFileSplit,
    context: TaskAttemptContext,
    index: Integer)
  extends RecordReader[String, T] {

  private val path = split.getPath(index)
  private val fs = path.getFileSystem(context.getConfiguration)

  // True means the current file has been processed, so it will be skipped.
  private var processed = false

  private val key = path.toString
  private var value: T = null.asInstanceOf[T]

  override def initialize(split: InputSplit, context: TaskAttemptContext) = {}
  override def close() = {}

  override def getProgress = if (processed) 1.0f else 0.0f

  override def getCurrentKey = key

  override def getCurrentValue = value

  override def nextKeyValue = {
    if (!processed) {
      val fileIn = fs.open(path)
      val innerBuffer = ByteStreams.toByteArray(fileIn)
      value = parseByteArray(innerBuffer)
      Closeables.close(fileIn, false)

      processed = true
      true
    } else {
      false
    }
  }

  /** Turns the raw file contents into the value type produced by this reader. */
  def parseByteArray(inArray: Array[Byte]): T
}
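For illustration only (not part of this diff): parseByteArray is the single extension point, so a reader that decodes each file into something other than raw bytes just overrides it. The class name IntArrayRecordReader below is hypothetical; compare it with the ByteRecordReader demo that follows.

@serializable class IntArrayRecordReader(
    split: CombineFileSplit,
    context: TaskAttemptContext,
    index: Integer)
  extends BinaryRecordReader[Array[Int]](split, context, index) {

  // Hypothetical: interpret the whole file as big-endian 32-bit integers.
  def parseByteArray(inArray: Array[Byte]): Array[Int] = {
    val buf = java.nio.ByteBuffer.wrap(inArray)
    Array.fill(inArray.length / 4)(buf.getInt)
  }
}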
/**
 * A demo class for extracting just the byte array itself
 */
@serializable class ByteInputFormat extends BinaryFileInputFormat[Array[Byte]] {
  override def createRecordReader(split: InputSplit, taContext: TaskAttemptContext) = {
    new CombineFileRecordReader[String, Array[Byte]](
      split.asInstanceOf[CombineFileSplit], taContext, classOf[ByteRecordReader])
  }
}

@serializable class ByteRecordReader(
    split: CombineFileSplit,
    context: TaskAttemptContext,
    index: Integer)
  extends BinaryRecordReader[Array[Byte]](split, context, index) {

  def parseByteArray(inArray: Array[Byte]) = inArray
}
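Since this commit does not yet add a dedicated SparkContext entry point (see the review comment on BinaryFileRDD below), one way to exercise ByteInputFormat from user code would be the generic newAPIHadoopFile call. This is only a usage sketch: the application name and input path are placeholders, and the pair-RDD implicits import follows the Spark 1.x style of the time.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.SparkContext._ // pair-RDD implicits (Spark 1.x style)
import org.apache.spark.input.ByteInputFormat

object BinaryFilesDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("BinaryFilesDemo"))
    // Each record is (file path, whole file contents as a byte array).
    val files = sc.newAPIHadoopFile(
      "hdfs:///tmp/binary-input", // placeholder input directory
      classOf[ByteInputFormat],
      classOf[String],
      classOf[Array[Byte]])
    files.mapValues(_.length)
      .collect()
      .foreach { case (path, size) => println(s"$path -> $size bytes") }
    sc.stop()
  }
}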
@@ -0,0 +1,48 @@
package org.apache.spark.rdd

/** Allows better control of the partitioning
 *
Contributor
This comment seems unrelated, why is it up here?
 */
import java.text.SimpleDateFormat
import java.util.Date

import org.apache.hadoop.conf.{Configurable, Configuration}
import org.apache.hadoop.io.Writable
import org.apache.hadoop.mapreduce._

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.input.WholeTextFileInputFormat
import org.apache.spark.InterruptibleIterator
import org.apache.spark.Logging
import org.apache.spark.Partition
import org.apache.spark.SerializableWritable
import org.apache.spark.{SparkContext, TaskContext}

import org.apache.spark.input.BinaryFileInputFormat
private[spark] class BinaryFileRDD[T](
    sc : SparkContext,
    inputFormatClass: Class[_ <: BinaryFileInputFormat[T]],
    keyClass: Class[String],
    valueClass: Class[T],
    @transient conf: Configuration,
    minPartitions: Int)
Contributor
Format is slightly wrong here, the constructor args should only be indented with 4 spaces.
  extends NewHadoopRDD[String, T](sc, inputFormatClass, keyClass, valueClass, conf) {
  override def getPartitions: Array[Partition] = {
    val inputFormat = inputFormatClass.newInstance
    inputFormat match {
      case configurable: Configurable =>
        configurable.setConf(conf)
      case _ =>
    }
    val jobContext = newJobContext(conf, jobId)
    inputFormat.setMaxSplitSize(jobContext, minPartitions)
Contributor
Is this actually a max split size? It seems you're passing an int that means something else, but I might've misunderstood.

Contributor
BTW this comment was important too, what is the meaning of this parameter?

Contributor (Author)
Sorry, this function was named incorrectly; it ultimately calls setMaxSplitSize.
    val rawSplits = inputFormat.getSplits(jobContext).toArray
    val result = new Array[Partition](rawSplits.size)
    for (i <- 0 until rawSplits.size) {
      result(i) = new NewHadoopPartition(id, i, rawSplits(i).asInstanceOf[InputSplit with Writable])
    }
    result
  }
}
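A side note to make the setMaxSplitSize thread above concrete: CombineFileInputFormat.setMaxSplitSize takes a size in bytes, not a partition count, so a minPartitions-style argument has to be converted by dividing the total input length, which is what the commented-out line in BinaryFileInputFormat.setMaxSplitSize does (the current code divides by the number of files instead). A worked sketch with assumed numbers, REPL-style:

// Assumed values for illustration only.
val totalLen = 10L * 1024 * 1024 * 1024 // 10 GiB of input across all files
val minPartitions = 64
val maxSplitSize =
  Math.ceil(totalLen * 1.0 / (if (minPartitions == 0) 1 else minPartitions)).toLong
// maxSplitSize == 167772160 bytes (160 MiB), so the combined splits yield roughly 64 or more partitions.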
I'd call this binaryFiles. Also, please add it to JavaSparkContext, and ideally we'd have a way to add it to Python as well. That one will be trickier -- we probably need to read the file in chunks and pass them to Python. But I think it's important to design the API as part of this change.
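For context on that suggestion, here is a minimal sketch of what a binaryFiles method could look like if modeled on the existing wholeTextFiles implementation. The name follows the reviewer's suggestion; the default argument, setName call, and use of ByteInputFormat are assumptions, not part of this diff. It would have to live inside SparkContext itself because BinaryFileRDD is private[spark]; a JavaSparkContext wrapper could then simply delegate to it.

// Hypothetical method body, assumed to sit in SparkContext.scala next to wholeTextFiles.
// Imports assumed available there:
//   import org.apache.hadoop.fs.Path
//   import org.apache.hadoop.mapreduce.{Job => NewHadoopJob}
//   import org.apache.hadoop.mapreduce.lib.input.{FileInputFormat => NewFileInputFormat}
//   import org.apache.spark.input.ByteInputFormat
//   import org.apache.spark.rdd.BinaryFileRDD
def binaryFiles(path: String, minPartitions: Int = defaultMinPartitions): RDD[(String, Array[Byte])] = {
  val job = new NewHadoopJob(hadoopConfiguration)
  // Register the input path so listStatus() in BinaryFileInputFormat can find the files.
  NewFileInputFormat.addInputPath(job, new Path(path))
  val updatedConf = job.getConfiguration
  new BinaryFileRDD(
    this,
    classOf[ByteInputFormat],
    classOf[String],
    classOf[Array[Byte]],
    updatedConf,
    minPartitions).setName(path)
}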