[SPARK-20244][Core] Handle incorrect bytesRead metrics when using PySpark #17617
@@ -21,6 +21,7 @@ import java.io.IOException
 import java.security.PrivilegedExceptionAction
 import java.text.DateFormat
 import java.util.{Arrays, Comparator, Date, Locale}
+import java.util.concurrent.ConcurrentHashMap

 import scala.collection.JavaConverters._
 import scala.util.control.NonFatal

@@ -143,14 +144,18 @@ class SparkHadoopUtil extends Logging {
    * Returns a function that can be called to find Hadoop FileSystem bytes read. If
    * getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will
    * return the bytes read on r since t.
-   *
-   * @return None if the required method can't be found.
Contributor: Why removing this line instead of the doc?

Contributor: this doesn't return a `None`.
    */
   private[spark] def getFSBytesReadOnThreadCallback(): () => Long = {
Contributor: let's update the document to say that the returned function may be called in multiple threads.
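One possible wording for that doc update (a sketch only, not necessarily the exact text that went into the PR):

```scala
/**
 * Returns a function that can be called to find Hadoop FileSystem bytes read. If
 * getFSBytesReadOnThreadCallback is called from thread r at time t, the returned callback will
 * return the bytes read on r since t.
 *
 * The returned function may be called in multiple threads; each reporting thread's bytes read
 * since t are tracked separately and summed together.
 */
```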
-    val threadStats = FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics)
-    val f = () => threadStats.map(_.getBytesRead).sum
-    val baselineBytesRead = f()
-    () => f() - baselineBytesRead
+    val f = () => FileSystem.getAllStatistics.asScala.map(_.getThreadStatistics.getBytesRead).sum
+
+    val baseline = (Thread.currentThread().getId, f())
+    val bytesReadMap = new ConcurrentHashMap[Long, Long]()
+
+    () => {
+      bytesReadMap.put(Thread.currentThread().getId, f())
+      bytesReadMap.asScala.map { case (k, v) =>
+        v - (if (k == baseline._1) baseline._2 else 0)
+      }.sum
+    }
   }

   /**
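To make the intent of the new closure concrete, here is a minimal, self-contained Scala sketch of the same pattern, with Hadoop's FileSystem statistics replaced by a simulated per-thread counter (every name below is illustrative, not Spark's). The old code captured one baseline and one set of thread statistics at creation time, so reads performed by another thread, such as the writer thread PySpark uses to consume the iterator, were never counted; the new pattern lets each reporting thread record its own count, keyed by thread id, and subtracts the baseline only for the creating thread.

```scala
import java.util.concurrent.ConcurrentHashMap
import scala.collection.JavaConverters._

object BytesReadCallbackSketch {
  // Stand-in for Hadoop's per-thread FileSystem statistics: bytes read, keyed by thread id.
  private val simulatedStats = new ConcurrentHashMap[Long, Long]()

  // Simulate reading `n` bytes on the calling thread.
  def simulateRead(n: Long): Unit = {
    val id = Thread.currentThread().getId
    simulatedStats.put(id, simulatedStats.getOrDefault(id, 0L) + n)
  }

  // Stand-in for FileSystem.getAllStatistics / getThreadStatistics, which report bytes
  // read by the *calling* thread only.
  private def bytesReadForCurrentThread(): Long =
    simulatedStats.getOrDefault(Thread.currentThread().getId, 0L)

  // Same shape as the patched getFSBytesReadOnThreadCallback: remember a baseline for the
  // creating thread, then let every reporting thread contribute its own count.
  def bytesReadCallback(): () => Long = {
    val f = () => bytesReadForCurrentThread()
    val baseline = (Thread.currentThread().getId, f())
    val bytesReadMap = new ConcurrentHashMap[Long, Long]()
    () => {
      bytesReadMap.put(Thread.currentThread().getId, f())
      bytesReadMap.asScala.map { case (k, v) =>
        v - (if (k == baseline._1) baseline._2 else 0)
      }.sum
    }
  }

  def main(args: Array[String]): Unit = {
    val callback = bytesReadCallback() // created on the "task" thread
    simulateRead(100)                  // reads on the task thread
    val worker = new Thread() {
      override def run(): Unit = {
        simulateRead(50) // reads on a different thread (e.g. PySpark's writer thread)
        callback()       // that thread reports its own bytes into the shared map
      }
    }
    worker.start()
    worker.join()
    println(callback()) // 150: reads from both threads are counted
  }
}
```

Running `main` prints 150: the 100 bytes read on the task thread and the 50 bytes read on the worker thread both end up in the metric, which is what the new tests further down assert.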
@@ -251,7 +251,13 @@ class HadoopRDD[K, V](
       null
     }
     // Register an on-task-completion callback to close the input stream.
-    context.addTaskCompletionListener{ context => closeIfNeeded() }
+    context.addTaskCompletionListener { context =>
+      // Update the bytes read before closing, to make sure lingering bytesRead statistics in
+      // this thread get correctly added.
+      updateBytesRead()
+      closeIfNeeded()
+    }

     private val key: K = if (reader == null) null.asInstanceOf[K] else reader.createKey()
     private val value: V = if (reader == null) null.asInstanceOf[V] else reader.createValue()
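A Spark-free sketch of the ordering this listener relies on (all names below are illustrative stand-ins for HadoopRDD's private helpers and TaskContext, not Spark's API): fold any lingering bytesRead statistics into the metrics first, then close the reader.

```scala
import scala.collection.mutable.ArrayBuffer

object CompletionListenerOrderingSketch {
  def main(args: Array[String]): Unit = {
    // Stand-in for TaskContext's completion listeners.
    val completionListeners = ArrayBuffer.empty[() => Unit]

    var bytesReadByReader = 0L   // what the (simulated) input stream has actually read
    var bytesReadInMetrics = 0L  // what the task metrics have seen so far
    var readerClosed = false

    def updateBytesRead(): Unit = bytesReadInMetrics = bytesReadByReader
    def closeIfNeeded(): Unit = readerClosed = true

    // Equivalent of: context.addTaskCompletionListener { context => ... }
    completionListeners += { () =>
      updateBytesRead() // flush statistics accumulated since the last periodic update ...
      closeIfNeeded()   // ... before the input stream is released
    }

    bytesReadByReader = 4096          // reads that happened after the last periodic update
    completionListeners.foreach(_())  // task completes
    println((bytesReadInMetrics, readerClosed)) // (4096,true): nothing is dropped
  }
}
```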
@@ -319,6 +319,45 @@ class InputOutputMetricsSuite extends SparkFunSuite with SharedSparkContext
     }
     assert(bytesRead >= tmpFile.length())
   }

+  test("input metrics with old Hadoop API in different thread") {
+    val bytesRead = runAndReturnBytesRead {
+      sc.textFile(tmpFilePath, 4).mapPartitions { iter =>
+        val buf = new ArrayBuffer[String]()
+        val thread = new Thread() {
+          override def run(): Unit = {
+            iter.flatMap(_.split(" ")).foreach(buf.append(_))
+          }
+        }
+        thread.start()
+        thread.join()
+        buf.iterator
+      }.count()
+    }
+    assert(bytesRead != 0)
+    assert(bytesRead >= tmpFile.length())
+  }
+
+  test("input metrics with new Hadoop API in different thread") {
+    val bytesRead = runAndReturnBytesRead {
+      sc.newAPIHadoopFile(tmpFilePath, classOf[NewTextInputFormat], classOf[LongWritable],
+        classOf[Text]).mapPartitions { iter =>
+        val buf = new ArrayBuffer[String]()
+        val thread = new Thread() {
+          override def run(): Unit = {
+            iter.map(_._2.toString).flatMap(_.split(" ")).foreach(buf.append(_))
+          }
+        }
+        thread.start()
+        thread.join()
+        buf.iterator
+      }.count()
+    }
+    assert(bytesRead != 0)
+    assert(bytesRead >= tmpFile.length())
+  }
 }

 /**
nit: unneeded import.