SPARK-1195: set map_input_file environment variable in PipedRDD #94
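For context: Hadoop streaming exposes the current input file to user scripts through the map_input_file environment variable (mapreduce_map_input_file in newer Hadoop), and this change makes PipedRDD export the same variables to the piped process when the partition comes from a Hadoop input. An illustrative driver-side use (not code from this PR; the input path and two-partition split are assumptions for the example):

```scala
// Illustrative usage: a command piped through an RDD backed by Hadoop input
// can read the current partition's input file from the exported variable.
val lines = sc.textFile("/tmp/input.txt", 2)
val paths = lines.pipe(Seq("printenv", "map_input_file"))
paths.collect().foreach(println)  // prints the input file backing each partition
```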
Changes from 1 commit
@@ -19,6 +19,14 @@ package org.apache.spark

 import org.scalatest.FunSuite

+import org.apache.spark.rdd.{HadoopRDD, PipedRDD, HadoopPartition}
+import org.apache.hadoop.mapred.{JobConf, TextInputFormat, FileSplit}
+import org.apache.hadoop.fs.Path
+
+import scala.collection.Map
+import org.apache.hadoop.io.{Text, LongWritable}
+
 class PipedRDDSuite extends FunSuite with SharedSparkContext {

   test("basic pipe") {
@@ -89,4 +97,37 @@ class PipedRDDSuite extends FunSuite with SharedSparkContext {
     }
   }

+  test("test pipe exports map_input_file") {
+    testExportInputFile("map_input_file")
+  }
+
+  test("test pipe exports mapreduce_map_input_file") {
+    testExportInputFile("mapreduce_map_input_file")
+  }
+
+  def testExportInputFile(varName: String) {
+    val nums = new HadoopRDD(sc, new JobConf(), classOf[TextInputFormat], classOf[LongWritable],
Contributor: Just wondering - any reason to create this whole fake HadoopRDD?

Contributor (author): No reason, I was just trying to avoid a temporary file.
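For comparison, the temporary-file route the author is avoiding would look roughly like this (a sketch, not code from the PR; it assumes writing a real file in a unit test is acceptable):

```scala
// Sketch only: build the input through a real temporary file instead of a
// hand-rolled fake HadoopRDD. sc.textFile produces a HadoopRDD whose
// partitions are real HadoopPartitions, so map_input_file would still be set.
import java.io.File
import java.nio.file.Files

val tmpFile = File.createTempFile("piped-rdd-input", ".txt")
tmpFile.deleteOnExit()
Files.write(tmpFile.toPath, "b".getBytes("UTF-8"))

val nums = sc.textFile(tmpFile.getAbsolutePath, 2)
val pipedRdd = nums.pipe("printenv map_input_file")
```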
+      classOf[Text], 2) {
+      override def getPartitions: Array[Partition] = Array(generateFakeHadoopPartition())
+      override val getDependencies = List[Dependency[_]]()
+      override def compute(theSplit: Partition, context: TaskContext) = {
+        new InterruptibleIterator[(LongWritable, Text)](context, Iterator((new LongWritable(1),
+          new Text("b"))))
+      }
+    }
+    val hadoopPart1 = generateFakeHadoopPartition()
+    val pipedRdd = new PipedRDD(nums, "printenv " + varName)
Contributor: It might be good if this test first checks whether printenv is actually available on the machine running the tests, and skips the assertion if it isn't.

Contributor (author): Is there a convention or perhaps utilities for doing this already? I didn't see one on a quick look, but I might have missed it. Note I copied the printenv command from a test above, so I'll change both.

Contributor: Ah I see - well then, since you're just doing what's there already, I guess it's fine. But if you want to be a true hero, I think something like this would work:

Contributor (author): I'm definitely fine with fixing it; I was just wondering if we had some generic utilities that perhaps handled it for various platforms, or a class that handled calling the correct function depending on the OS.
+    val tContext = new TaskContext(0, 0, 0, interrupted = false, runningLocally = false,
+      taskMetrics = null)
+    val rddIter = pipedRdd.compute(hadoopPart1, tContext)
+    val arr = rddIter.toArray
+    assert(arr(0) == "/some/path")
+  }
+
+  def generateFakeHadoopPartition(): HadoopPartition = {
+    val split = new FileSplit(new Path("/some/path"), 0, 1,
+      Array[String]("loc1", "loc2", "loc3", "loc4", "loc5"))
+    new HadoopPartition(sc.newRddId(), 1, split)
+  }
+
 }
Contributor: Tiny nit, but mind saying map_input_file etc. here (using underscores and not periods)? Otherwise it's sorta confusing because it looks like these are Hadoop conf options.