-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-28247][SS] Fix flaky test "query without test harness" on ContinuousSuite #25048
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
299f629
97b5de6
2cadef3
c006be8
8e309ac
d3fbd82
faa41b5
aa932bc
6706a65
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -17,6 +17,8 @@ | |
|
|
||
| package org.apache.spark.sql.execution.streaming.continuous | ||
|
|
||
| import java.util.concurrent.atomic.AtomicLong | ||
|
|
||
| import org.json4s.DefaultFormats | ||
| import org.json4s.jackson.Serialization | ||
|
|
||
|
|
@@ -36,6 +38,9 @@ class RateStreamContinuousStream(rowsPerSecond: Long, numPartitions: Int) extend | |
|
|
||
| val perPartitionRate = rowsPerSecond.toDouble / numPartitions.toDouble | ||
|
|
||
| private[sql] val highestCommittedValue = new AtomicLong(Long.MinValue) | ||
| private[sql] val firstCommittedTime = new AtomicLong(Long.MinValue) | ||
|
|
||
| override def mergeOffsets(offsets: Array[PartitionOffset]): Offset = { | ||
| assert(offsets.length == numPartitions) | ||
| val tuples = offsets.map { | ||
|
|
@@ -82,7 +87,16 @@ class RateStreamContinuousStream(rowsPerSecond: Long, numPartitions: Int) extend | |
| RateStreamContinuousReaderFactory | ||
| } | ||
|
|
||
| override def commit(end: Offset): Unit = {} | ||
| override def commit(end: Offset): Unit = { | ||
|
||
| end.asInstanceOf[RateStreamOffset].partitionToValueAndRunTimeMs.foreach { | ||
| case (_, ValueRunTimeMsPair(value, _)) => | ||
| if (highestCommittedValue.get() < value) { | ||
|
||
| highestCommittedValue.set(value) | ||
| } | ||
| } | ||
| firstCommittedTime.compareAndSet(Long.MinValue, System.currentTimeMillis()) | ||
|
||
| } | ||
|
|
||
| override def stop(): Unit = {} | ||
|
|
||
| private def createInitialOffset(numPartitions: Int, creationTimeMs: Long) = { | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -38,18 +38,52 @@ class ContinuousSuiteBase extends StreamTest { | |
| sparkConf.set("spark.sql.testkey", "true"))) | ||
|
|
||
| protected def waitForRateSourceTriggers(query: StreamExecution, numTriggers: Int): Unit = { | ||
| query match { | ||
| findRateStreamContinuousStream(query).foreach { reader => | ||
| // Make sure epoch 0 is completed. | ||
| query.asInstanceOf[ContinuousExecution].awaitEpoch(0) | ||
|
|
||
| // This is called after waiting first epoch to be committed, but there might be | ||
| // a gap between committing epoch to commit log and committing epoch to source. | ||
| // If epoch 0 is not reported to rate source yet, use current time instead. | ||
| var firstCommittedTime = reader.firstCommittedTime.longValue() | ||
| if (firstCommittedTime < 0) { | ||
| firstCommittedTime = System.currentTimeMillis() | ||
| } | ||
|
|
||
| val deltaMs = numTriggers * 1000 + 300 | ||
| while (System.currentTimeMillis < firstCommittedTime + deltaMs) { | ||
| Thread.sleep(firstCommittedTime + deltaMs - System.currentTimeMillis) | ||
| } | ||
| } | ||
| } | ||
|
|
||
| protected def waitForRateSourceCommittedValue( | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is the safest approach to expect some rows to produce outputs. We still need to have a maximum time to wait, since it may block indefinitely in case of bugs. |
||
| query: StreamExecution, | ||
| desiredValue: Long, | ||
| maxWaitTimeMs: Long): Unit = { | ||
| findRateStreamContinuousStream(query).foreach { reader => | ||
| val startTime = System.currentTimeMillis() | ||
| val maxWait = startTime + maxWaitTimeMs | ||
| while (System.currentTimeMillis() < maxWait && | ||
| reader.highestCommittedValue.get() < desiredValue) { | ||
| Thread.sleep(100) | ||
| } | ||
| if (System.currentTimeMillis() > maxWait) { | ||
| logWarning(s"Couldn't reach desired value in $maxWaitTimeMs milliseconds!" + | ||
| s"Current highest committed value is ${reader.highestCommittedValue}") | ||
| } | ||
| } | ||
| } | ||
|
|
||
| private def findRateStreamContinuousStream( | ||
| query: StreamExecution): Option[RateStreamContinuousStream] = query match { | ||
|
|
||
| case s: ContinuousExecution => | ||
| assert(numTriggers >= 2, "must wait for at least 2 triggers to ensure query is initialized") | ||
| val reader = s.lastExecution.executedPlan.collectFirst { | ||
| s.lastExecution.executedPlan.collectFirst { | ||
| case ContinuousScanExec(_, _, r: RateStreamContinuousStream, _) => r | ||
| }.get | ||
|
|
||
| val deltaMs = numTriggers * 1000 + 300 | ||
| while (System.currentTimeMillis < reader.creationTime + deltaMs) { | ||
| Thread.sleep(reader.creationTime + deltaMs - System.currentTimeMillis) | ||
| } | ||
| } | ||
|
|
||
| case _ => None | ||
| } | ||
|
|
||
| // A continuous trigger that will only fire the initial time for the duration of a test. | ||
|
|
@@ -218,8 +252,7 @@ class ContinuousSuite extends ContinuousSuiteBase { | |
| .start() | ||
| val continuousExecution = | ||
| query.asInstanceOf[StreamingQueryWrapper].streamingQuery.asInstanceOf[ContinuousExecution] | ||
| continuousExecution.awaitEpoch(0) | ||
| waitForRateSourceTriggers(continuousExecution, 2) | ||
| waitForRateSourceCommittedValue(continuousExecution, 3, 20 * 1000) | ||
| query.stop() | ||
|
|
||
| val results = spark.read.table("noharness").collect() | ||
|
|
@@ -241,7 +274,7 @@ class ContinuousStressSuite extends ContinuousSuiteBase { | |
| testStream(df)( | ||
| StartStream(longContinuousTrigger), | ||
| AwaitEpoch(0), | ||
| Execute(waitForRateSourceTriggers(_, 10)), | ||
| Execute(waitForRateSourceTriggers(_, 5)), | ||
|
||
| IncrementEpoch(), | ||
| StopStream, | ||
| CheckAnswerRowsContains(scala.Range(0, 2500).map(Row(_))) | ||
|
|
@@ -259,7 +292,7 @@ class ContinuousStressSuite extends ContinuousSuiteBase { | |
| testStream(df)( | ||
| StartStream(Trigger.Continuous(2012)), | ||
| AwaitEpoch(0), | ||
| Execute(waitForRateSourceTriggers(_, 10)), | ||
| Execute(waitForRateSourceTriggers(_, 5)), | ||
|
||
| IncrementEpoch(), | ||
| StopStream, | ||
| CheckAnswerRowsContains(scala.Range(0, 2500).map(Row(_)))) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I still have a doubt on the value of these. We usually don't add a variable like this just for testing.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As I commented earlier,
creationTime in this class above is one of the fields used just for testing. Unlike microbatch mode, it's not easy to track and capture the progress. That might be the reason we were relying on wait time. I totally agree the change is not beautiful, but given that the rate source is for testing/benchmarking purposes, and end users don't deal with RateStreamContinuousStream directly, we might be just OK with it.
If it's still a concern for us, please let me know, and we can roll back the change and just add more seconds to the wait time so the test has a higher rate of passing.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The first approach is definitely more ugly than this. Just a question (without having had a deep look): is there maybe a metric which can be used? Or should we just add something like this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Stream query listener is not ready for continuous processing. #24537 is trying to address it, but seems like it's not ready. Actually the query listener mainly reports per batch to show changing numbers, and if we apply this to continuous processing, the report could be far behind if there's skew among partitions and epochs for partitions differ.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you subclass this for testing and only add the new fields in the subclass?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It would also require a new data source provider, as we just use it by the name of data source.
The change would bring the test version of RateStreamProvider, and test version of RateStreamTable (maybe subclassing to deduplicate), and test version of RateStreamContinuousStream. I'd like to confirm whether it's OK to apply the change since the changeset is going to be bigger.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, hmm, it's not instantiated directly.
I'm also uneasy about adding this just for a test; is there any way to avoid this while fixing the test, even if it means the test can't test as much?
I'm not super against this though. I'd mark it as visible for testing and comment of course.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK, maybe I can try to find an alternative approach first and stick with this only if I can't find one. I didn't do that since the alternatives would be considerably less simple, but I feel we would like to avoid a change like this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
And I should admit that I just realized
creationTime is not only used for testing. My bad. Sorry about missing that.