-
Notifications
You must be signed in to change notification settings - Fork 29k
[SPARK-48589][SQL][SS] Add option snapshotStartBatchId and snapshotPartitionId to state data source #46944
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
[SPARK-48589][SQL][SS] Add option snapshotStartBatchId and snapshotPartitionId to state data source #46944
Changes from 1 commit
6db0e3d
7dad0c1
2475173
07267b5
9d902d7
eddb3c7
1a3d20a
292ec5d
aa337c1
dfa712e
4ebd078
1656580
61dea35
5229152
4825215
20e1b9c
9eb6c76
4d4cd70
1870b35
fe9cea1
3f266c1
2eb6646
be30817
3ece6f2
ef9b095
876256e
1a23abb
97ee3ef
23639f4
40b6dc6
e15213e
42d952f
6f1425d
4deb63e
d140708
337785d
9dbe295
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -900,7 +900,8 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with | |
| */ | ||
| override def replayStateFromSnapshot(snapshotVersion: Long, endVersion: Long): StateStore = { | ||
| val newMap = replayLoadedMapForStoreFromSnapshot(snapshotVersion, endVersion) | ||
| logInfo(log"Retrieved version ${MDC(LogKeys.STATE_STORE_VERSION, snapshotVersion)} to " + | ||
| logInfo(log"Retrieved snapshot at version " + | ||
| log"${MDC(LogKeys.STATE_STORE_VERSION, snapshotVersion)} and apply delta files to version" + | ||
|
||
| log"${MDC(LogKeys.STATE_STORE_VERSION, endVersion)} of " + | ||
| log"${MDC(LogKeys.STATE_STORE_PROVIDER, HDFSBackedStateStoreProvider.this)} for update") | ||
| new HDFSBackedStateStore(endVersion, newMap) | ||
|
|
@@ -917,9 +918,10 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with | |
| override def replayReadStateFromSnapshot(snapshotVersion: Long, endVersion: Long): | ||
| ReadStateStore = { | ||
| val newMap = replayLoadedMapForStoreFromSnapshot(snapshotVersion, endVersion) | ||
| logInfo(log"Retrieved version ${MDC(LogKeys.STATE_STORE_VERSION, snapshotVersion)} to " + | ||
| logInfo(log"Retrieved snapshot at version " + | ||
| log"${MDC(LogKeys.STATE_STORE_VERSION, snapshotVersion)} and apply delta files to version" + | ||
|
||
| log"${MDC(LogKeys.STATE_STORE_VERSION, endVersion)} of " + | ||
| log"${MDC(LogKeys.STATE_STORE_PROVIDER, HDFSBackedStateStoreProvider.this)} for readonly") | ||
| log"${MDC(LogKeys.STATE_STORE_PROVIDER, HDFSBackedStateStoreProvider.this)} for read-only") | ||
| new HDFSBackedReadStateStore(endVersion, newMap) | ||
| } | ||
|
|
||
|
|
@@ -935,14 +937,13 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with | |
| if (snapshotVersion < 1) { | ||
| throw QueryExecutionErrors.unexpectedStateStoreVersion(snapshotVersion) | ||
| } | ||
| if (endVersion < snapshotVersion || endVersion < 0) { | ||
| if (endVersion < snapshotVersion) { | ||
| throw QueryExecutionErrors.unexpectedStateStoreVersion(endVersion) | ||
| } | ||
|
|
||
| val newMap = HDFSBackedStateStoreMap.create(keySchema, numColsPrefixKey) | ||
| if (endVersion != 0) { | ||
| newMap.putAll(constructMapFromSnapshot(snapshotVersion, endVersion)) | ||
| } | ||
| newMap.putAll(constructMapFromSnapshot(snapshotVersion, endVersion)) | ||
|
|
||
| newMap | ||
| } | ||
| catch { | ||
|
|
@@ -972,7 +973,8 @@ private[sql] class HDFSBackedStateStoreProvider extends StateStoreProvider with | |
| resultMap | ||
| } | ||
|
|
||
| logDebug(s"Loading state from $snapshotVersion to $endVersion takes $elapsedMs ms.") | ||
| logDebug(s"Loading snapshot at version $snapshotVersion and apply delta files to version " + | ||
| s"$endVersion takes $elapsedMs ms.") | ||
|
|
||
| result | ||
| } | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -242,14 +242,14 @@ class RocksDB( | |
| acquire(LoadStore) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is the lock release path still the same? I assume we release on an abort?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think I am copying the existing implementation. Are any changes needed here?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yea - Im guessing the unlock happens in the end as part of an abort within the state data source reader |
||
| recordedMetrics = None | ||
| logInfo( | ||
| log"Loading ${MDC(LogKeys.VERSION_NUM, endVersion)} from " + | ||
| log"${MDC(LogKeys.VERSION_NUM, snapshotVersion)}") | ||
| log"Loading snapshot at version ${MDC(LogKeys.VERSION_NUM, snapshotVersion)} and apply " + | ||
| log"changelog files to version ${MDC(LogKeys.VERSION_NUM, endVersion)}.") | ||
| try { | ||
| replayFromCheckpoint(snapshotVersion, endVersion) | ||
|
|
||
| logInfo( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. ditto |
||
| log"Loaded ${MDC(LogKeys.VERSION_NUM, endVersion)} from " + | ||
| log"${MDC(LogKeys.VERSION_NUM, snapshotVersion)}") | ||
| log"Loaded snapshot at version ${MDC(LogKeys.VERSION_NUM, snapshotVersion)} and apply " + | ||
| log"changelog files to version ${MDC(LogKeys.VERSION_NUM, endVersion)}.") | ||
| } catch { | ||
| case t: Throwable => | ||
| loadedVersion = -1 // invalidate loaded data | ||
|
|
@@ -267,29 +267,27 @@ class RocksDB( | |
| * @param endVersion end version | ||
| */ | ||
| private def replayFromCheckpoint(snapshotVersion: Long, endVersion: Long): Any = { | ||
| if (loadedVersion != snapshotVersion) { | ||
| closeDB() | ||
| val metadata = fileManager.loadCheckpointFromDfs(snapshotVersion, workingDir) | ||
| loadedVersion = snapshotVersion | ||
|
|
||
| // reset last snapshot version | ||
| if (lastSnapshotVersion > snapshotVersion) { | ||
| // discard any newer snapshots | ||
| lastSnapshotVersion = 0L | ||
| latestSnapshot = None | ||
| } | ||
| openDB() | ||
|
|
||
| numKeysOnWritingVersion = if (!conf.trackTotalNumberOfRows) { | ||
| // we don't track the total number of rows - discard the number being track | ||
| -1L | ||
| } else if (metadata.numKeys < 0) { | ||
| // we track the total number of rows, but the snapshot doesn't have tracking number | ||
| // need to count keys now | ||
| countKeys() | ||
| } else { | ||
| metadata.numKeys | ||
| } | ||
| closeDB() | ||
| val metadata = fileManager.loadCheckpointFromDfs(snapshotVersion, workingDir) | ||
| loadedVersion = snapshotVersion | ||
|
|
||
| // reset last snapshot version | ||
| if (lastSnapshotVersion > snapshotVersion) { | ||
| // discard any newer snapshots | ||
| lastSnapshotVersion = 0L | ||
| latestSnapshot = None | ||
| } | ||
| openDB() | ||
|
|
||
| numKeysOnWritingVersion = if (!conf.trackTotalNumberOfRows) { | ||
| // we don't track the total number of rows - discard the number being track | ||
| -1L | ||
| } else if (metadata.numKeys < 0) { | ||
| // we track the total number of rows, but the snapshot doesn't have tracking number | ||
| // need to count keys now | ||
| countKeys() | ||
| } else { | ||
| metadata.numKeys | ||
| } | ||
| if (loadedVersion != endVersion) replayChangelog(endVersion) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I'd like to see a user-friendly error message when the changelog file does not exist. For example, users may actually not be using changelog checkpointing and may somehow be misled into thinking it's supported. Providing a FileNotFoundException to them does not give a hint about what is possibly incorrect — a smart user may notice what is wrong, but it is better to be user-friendly and also be part of the error class framework.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The error users will now get when the changelog does not exist is: It does not have its own error class, so I think we should defer this to follow-up tasks: move this error to its own error class and catch it here to remind the user of the possible cause. |
||
| // After changelog replay the numKeysOnWritingVersion will be updated to | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -975,7 +975,7 @@ abstract class StateDataSourceReadSuite extends StateDataSourceTestBase with Ass | |
| } | ||
|
|
||
| protected def testSnapshotNotFound(): Unit = { | ||
| withTempDir(tempDir => { | ||
| withTempDir { tempDir => | ||
| val provider = getNewStateStoreProvider(tempDir.getAbsolutePath) | ||
| for (i <- 1 to 4) { | ||
| val store = provider.getStore(i - 1) | ||
|
|
@@ -989,11 +989,11 @@ abstract class StateDataSourceReadSuite extends StateDataSourceTestBase with Ass | |
| .replayReadStateFromSnapshot(1, 2) | ||
| } | ||
| checkError(exc, "CANNOT_LOAD_STATE_STORE.UNCATEGORIZED") | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It would be nice if we could provide users a better error message, e.g. that the snapshot file does not exist, but I'm OK with addressing this later.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's address it later along with the changelog-file-not-found exception. |
||
| }) | ||
| } | ||
| } | ||
|
|
||
| protected def testGetReadStoreWithStartVersion(): Unit = { | ||
| withTempDir(tempDir => { | ||
| withTempDir { tempDir => | ||
| val provider = getNewStateStoreProvider(tempDir.getAbsolutePath) | ||
| for (i <- 1 to 4) { | ||
| val store = provider.getStore(i - 1) | ||
|
|
@@ -1012,11 +1012,11 @@ abstract class StateDataSourceReadSuite extends StateDataSourceTestBase with Ass | |
| assert(get(result, "a", 4).isEmpty) | ||
|
|
||
| provider.close() | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| protected def testSnapshotPartitionId(): Unit = { | ||
| withTempDir(tempDir => { | ||
| withTempDir { tempDir => | ||
| val inputData = MemoryStream[Int] | ||
| val df = inputData.toDF().limit(10) | ||
|
|
||
|
|
@@ -1039,16 +1039,15 @@ abstract class StateDataSourceReadSuite extends StateDataSourceTestBase with Ass | |
| val stateDfError = spark.read.format("statestore") | ||
| .option(StateSourceOptions.SNAPSHOT_START_BATCH_ID, 0) | ||
| .option( | ||
| StateSourceOptions.SNAPSHOT_PARTITION_ID, | ||
| spark.sessionState.conf.numShufflePartitions) | ||
| StateSourceOptions.SNAPSHOT_PARTITION_ID, 1) | ||
| .option(StateSourceOptions.BATCH_ID, 0) | ||
| .load(tempDir.getAbsolutePath) | ||
|
|
||
| val exc = intercept[StateStoreSnapshotPartitionNotFound] { | ||
| stateDfError.show() | ||
| } | ||
| assert(exc.getErrorClass === "CANNOT_LOAD_STATE_STORE.SNAPSHOT_PARTITION_ID_NOT_FOUND") | ||
| }) | ||
| } | ||
| } | ||
|
|
||
| // Todo: Should also test against state generated by 3.5 | ||
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.