Skip to content

Commit 9f75183

Browse files
authored
Merge 16f8c7c into 2ee8ac6
2 parents 2ee8ac6 + 16f8c7c commit 9f75183

3 files changed

Lines changed: 128 additions & 32 deletions

File tree

packages/beacon-node/src/chain/stateCache/persistentCheckpointsCache.ts

Lines changed: 62 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -87,8 +87,10 @@ export const DEFAULT_MAX_CP_STATE_EPOCHS_IN_MEMORY = 2;
8787
* ╚════════════════════════════════════╝═══════════════╝
8888
*
8989
* The "in memory" checkpoint states are similar to the old implementation: we have both Previous Root Checkpoint State and Current Root Checkpoint State per epoch.
90-
* However in the "persisted to db or fs" part, we usually only persist 1 checkpoint state per epoch, the one that could potentially be justified/finalized later
91-
* based on the view of blocks.
90+
* However in the "persisted to db or fs" part
91+
* - if there is no reorg, we only store 1 checkpoint state per epoch, the one that could potentially be justified/finalized later based on the view of the state
92+
* - if there is a reorg, we may store >=2 checkpoint states per epoch, including any checkpoints whose roots are unknown to the processed state
93+
* - the goal is to make sure we can regen any states later if needed, and we have the checkpoint state that could be justified/finalized later
9294
*/
9395
export class PersistentCheckpointStateCache implements CheckpointStateCache {
9496
private readonly cache: MapTracker<CacheKey, CacheItem>;
@@ -101,6 +103,7 @@ export class PersistentCheckpointStateCache implements CheckpointStateCache {
101103
private preComputedCheckpoint: string | null = null;
102104
private preComputedCheckpointHits: number | null = null;
103105
private readonly maxEpochsInMemory: number;
106+
// only for testing, default false for production
104107
private readonly processLateBlock: boolean;
105108
private readonly datastore: CPStateDatastore;
106109
private readonly shufflingCache: ShufflingCache;
@@ -476,9 +479,7 @@ export class PersistentCheckpointStateCache implements CheckpointStateCache {
476479
* - 1 then we'll persist {root: b1, epoch n-1} checkpoint state to disk. Note that at epoch n there is both {root: b0, epoch: n} and {root: c0, epoch: n} checkpoint states in memory
477480
* - 2 then we'll persist {root: b2, epoch n-2} checkpoint state to disk, there are also 2 checkpoint states in memory at epoch n, same to the above (maxEpochsInMemory=1)
478481
*
479-
* As of Jan 2024, it takes 1.2s to persist a holesky state on fast server. TODO:
480-
* - improve state serialization time
481-
* - or research how to only store diff against the finalized state
482+
* As of Mar 2024, it takes <=350ms to persist a holesky state on a fast server
482483
*/
483484
async processState(blockRootHex: RootHex, state: CachedBeaconStateAllForks): Promise<number> {
484485
let persistCount = 0;
@@ -602,7 +603,41 @@ export class PersistentCheckpointStateCache implements CheckpointStateCache {
602603
}
603604

604605
/**
605-
* Prune or persist checkpoint states in an epoch, see the description in `processState()` function
606+
* Prune or persist checkpoint states in an epoch
607+
* 1) If there is 1 checkpoint state with a known root, persist it. This happens when the first slot of the epoch (block 0) is skipped
608+
* slot: n
609+
* |-----------------------|-----------------------|
610+
* PRCS root |
611+
*
612+
* 2) If there are 2 checkpoint states, PRCS and CRCS, and both roots are known to this state, persist CRCS. If the block is reorged,
613+
* PRCS is regenerated and populated into this cache again.
614+
* slot: n
615+
* |-----------------------|-----------------------|
616+
* PRCS root - prune |
617+
* CRCS root - persist |
618+
*
619+
* 3) If there are any roots that are unknown to this state, persist their checkpoint states. This handles the case where the current block is reorged later
620+
*
621+
* 4) (derived from above) If there are 2 checkpoint states, PRCS and an unknown root, persist both.
622+
* - In the example below, the block at slot (n + 1) reorged the block at slot n
623+
* - If we process state n + 1, CRCS is unknown to it
624+
* - we need to also store CRCS to handle the case (n+2) switches to n again
625+
*
626+
* PRCS - persist
627+
* | processState()
628+
* | |
629+
* -------------n+1
630+
* / |
631+
* n-1 ------n------------n+2
632+
* |
633+
* CRCS - persist
634+
*
635+
* - PRCS is the checkpoint state that could be justified/finalized later based on the view of the state
636+
* - unknown root checkpoint state is persisted to handle the reorg back to that branch later
637+
*
638+
* Performance note:
639+
* - In normal condition, we persist 1 checkpoint state per epoch.
640+
* - In reorged condition, we may persist multiple (most likely 2) checkpoint states per epoch.
606641
*/
607642
private async processPastEpoch(
608643
blockRootHex: RootHex,
@@ -614,14 +649,29 @@ export class PersistentCheckpointStateCache implements CheckpointStateCache {
614649
const epochBoundaryRoot =
615650
epochBoundarySlot === state.slot ? fromHexString(blockRootHex) : getBlockRootAtSlot(state, epochBoundarySlot);
616651
const epochBoundaryHex = toHexString(epochBoundaryRoot);
652+
const prevEpochRoot = toHexString(getBlockRootAtSlot(state, epochBoundarySlot - 1));
653+
654+
// for each epoch, usually there are 2 rootHexes respective to the 2 checkpoint states: Previous Root Checkpoint State and Current Root Checkpoint State
655+
const cpRootHexes = this.epochIndex.get(epoch) ?? [];
656+
const persistedRootHexes = new Set<RootHex>();
657+
658+
// 1) if there is no CRCS, persist PRCS (block 0 of epoch is skipped). In this case prevEpochRoot === epochBoundaryHex
659+
// 2) if there are PRCS and CRCS, persist CRCS
660+
// => this is simplified to always persist epochBoundaryHex
661+
persistedRootHexes.add(epochBoundaryHex);
662+
663+
// 3) persist any states with unknown roots to this state
664+
for (const rootHex of cpRootHexes) {
665+
if (rootHex !== epochBoundaryHex && rootHex !== prevEpochRoot) {
666+
persistedRootHexes.add(rootHex);
667+
}
668+
}
617669

618-
// for each epoch, usually there are 2 rootHex respective to the 2 checkpoint states: Previous Root Checkpoint State and Current Root Checkpoint State
619-
for (const rootHex of this.epochIndex.get(epoch) ?? []) {
670+
for (const rootHex of cpRootHexes) {
620671
const cpKey = toCacheKey({epoch: epoch, rootHex});
621672
const cacheItem = this.cache.get(cpKey);
622673

623674
if (cacheItem !== undefined && isInMemoryCacheItem(cacheItem)) {
624-
// this is state in memory, we don't care if the checkpoint state is already persisted
625675
let {persistedKey} = cacheItem;
626676
const {state} = cacheItem;
627677
const logMeta = {
@@ -631,14 +681,14 @@ export class PersistentCheckpointStateCache implements CheckpointStateCache {
631681
persistedKey: persistedKey ? toHexString(persistedKey) : "",
632682
};
633683

634-
if (rootHex === epochBoundaryHex) {
684+
if (persistedRootHexes.has(rootHex)) {
635685
if (persistedKey) {
636-
// no need to persist
686+
// we don't care if the checkpoint state is already persisted
637687
this.logger.verbose("Pruned checkpoint state from memory but no need to persist", logMeta);
638688
} else {
639689
// persist and do not update epochIndex
640690
this.metrics?.statePersistSecFromSlot.observe(this.clock?.secFromSlot(this.clock?.currentSlot ?? 0) ?? 0);
641-
const cpPersist = {epoch: epoch, root: epochBoundaryRoot};
691+
const cpPersist = {epoch: epoch, root: fromHexString(rootHex)};
642692
{
643693
const timer = this.metrics?.stateSerializeDuration.startTimer();
644694
// automatically free the buffer pool after this scope

packages/beacon-node/test/e2e/chain/stateCache/nHistoricalStates.test.ts

Lines changed: 21 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,10 @@ describe(
5757
maxBlockStates: number;
5858
maxCPStateEpochsInMemory: number;
5959
reloadCount: number;
60+
// total persist count, to compare to metrics
6061
persistCount: number;
6162
numStatesInMemory: number;
63+
// number of states persisted at the end of test
6264
numStatesPersisted: number;
6365
numEpochsInMemory: number;
6466
numEpochsPersisted: number;
@@ -162,7 +164,7 @@ describe(
162164
* 16 ^ ^ ^ ^ ^
163165
* 23 24 25 26 27
164166
* ^
165-
* PRCS at epoch 3 is persisted, CRCS is pruned
167+
* both PRCS and CRCS are persisted
166168
*/
167169
{
168170
name: "maxCPStateEpochsInMemory=1, reorg last slot of previous epoch",
@@ -172,12 +174,12 @@ describe(
172174
maxCPStateEpochsInMemory: 1,
173175
// PRCS at epoch 3 is available in memory so no need to reload
174176
reloadCount: 0,
175-
// 1 cp state for epoch 0 1 2 3
176-
persistCount: 4,
177+
// {root0, epoch: 0} {root8, epoch: 1} {root16, epoch: 2} {root23, epoch: 3} {root24, epoch: 3}
178+
persistCount: 5,
177179
// epoch 4, one for Current Root Checkpoint State and one for Previous Root Checkpoint State
178180
numStatesInMemory: 2,
179-
// chain is not finalized, epoch 4 is in-memory so CP state at epoch 0 1 2 3 are persisted
180-
numStatesPersisted: 4,
181+
// chain is not finalized, same as persistCount
182+
numStatesPersisted: 5,
181183
// epoch 4
182184
numEpochsInMemory: 1,
183185
// chain is not finalized, epoch 4 is in-memory so CP state at epoch 0 1 2 3 are persisted
@@ -186,15 +188,15 @@ describe(
186188
skip: true,
187189
},
188190
/**
189-
* Block slot 28 has parent slot 23, block slot 24 25 26 and 27 are reorged
191+
* Block slot 28 has parent slot 19, block slot 24 25 26 and 27 are reorged
190192
* --------------------------------|---
191193
* / | ^ ^ ^ ^
192194
* / | 28 29 32 33
193195
* |----------------|----------
194196
* 16 ^ ^ ^ ^ ^ ^
195197
* 19 23 24 25 26 27
196198
* ^
197-
* PRCS at epoch 3 is persisted, CRCS is pruned
199+
* both PRCS and CRCS are persisted since their roots are unknown to block state 33
198200
*/
199201
{
200202
name: "maxCPStateEpochsInMemory=1, reorg middle slot of previous epoch",
@@ -204,12 +206,12 @@ describe(
204206
maxCPStateEpochsInMemory: 1,
205207
// reload CP state epoch 2 (slot = 16)
206208
reloadCount: 1,
207-
// 1 cp state for epoch 0 1 2 3
208-
persistCount: 4,
209+
// {root0, epoch: 0} {root8, epoch: 1} {root16, epoch: 2} {root23, epoch: 3} {root24, epoch: 3} {root19, epoch: 3}
210+
persistCount: 6,
209211
// epoch 4, one for Current Root Checkpoint State and one for Previous Root Checkpoint State
210212
numStatesInMemory: 2,
211-
// chain is not finalized, epoch 4 is in-memory so CP state at epoch 0 1 2 3 are persisted
212-
numStatesPersisted: 4,
213+
// chain is not finalized, same as persistCount
214+
numStatesPersisted: 6,
213215
// epoch 4
214216
numEpochsInMemory: 1,
215217
// chain is not finalized, epoch 4 is in-memory so CP state at epoch 0 1 2 3 are persisted
@@ -218,15 +220,15 @@ describe(
218220
skip: true,
219221
},
220222
/**
221-
* Block slot 28 has parent slot 23, block slot 24 25 26 and 27 are reorged
223+
* Block slot 28 has parent slot 15, block slot 24 25 26 and 27 are reorged
222224
* --------------------------------------------|---
223225
* / | ^ ^ ^ ^
224226
* / | 28 29 32 33
225-
* |----------------|----------------|----------
226-
* ^ ^ 16 ^ ^ ^ ^ ^ ^
227+
* |----------------|----------------|---------- ^
228+
* ^ ^ 16 ^ ^ ^ ^ ^ ^ test end
227229
* 8 15 19 23 24 25 26 27
228230
*reload ^
229-
* PRCS at epoch 3 is persisted, CRCS is pruned
231+
* both PRCS and CRCS are persisted because roots are unknown to block 28
230232
*/
231233
{
232234
name: "maxCPStateEpochsInMemory=1, reorg 2 epochs",
@@ -236,12 +238,12 @@ describe(
236238
maxCPStateEpochsInMemory: 1,
237239
// reload CP state epoch 2 (slot = 16)
238240
reloadCount: 1,
239-
// 1 cp state for epoch 0 1, 2 CP states for epoch 2, 1 cp state for epoch 3
240-
persistCount: 5,
241+
// {root0, epoch: 0} {root8, epoch: 1} {root16, epoch: 2} {root15, epoch: 2} {root23, epoch: 3} {root24, epoch: 3} {root15, epoch: 3}
242+
persistCount: 7,
241243
// epoch 4, one for Current Root Checkpoint State and one for Previous Root Checkpoint State
242244
numStatesInMemory: 2,
243-
// chain is not finalized, epoch 4 is in-memory so CP state at epoch 0 1 2 3 are persisted, epoch 2 has 2 CP states
244-
numStatesPersisted: 5,
245+
// chain is not finalized, so same number as persistCount
246+
numStatesPersisted: 7,
245247
// epoch 4
246248
numEpochsInMemory: 1,
247249
// chain is not finalized, epoch 4 is in-memory so CP state at epoch 0 1 2 3 are persisted

packages/beacon-node/test/unit/chain/stateCache/persistentCheckpointsCache.test.ts

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -867,14 +867,58 @@ describe("PersistentCheckpointStateCache", function () {
867867
expect(await cache.getStateOrBytes(cp0aHex)).toBeNull();
868868
});
869869

870+
// Real mainnet scenario: root1b reorgs root1a, and later on the chain is reorged back to root1a
871+
// processState is skipped for root1a because it's late, it's only called for root1b
872+
// we should persist both checkpoint states {0a, 20} and {0b, 20} in order to have finalized checkpoint states later
873+
// - {0a, 20} is persisted because it's the view of root1b state
874+
// - {0b, 20} is persisted because it's unknown in root1b state
870875
// epoch: 19 20 21 22 23
871876
// |-----------|-----------|-----------|-----------|
872877
// ^^ ^ ^
873878
// || | |
874879
// |0b --root1a|
875880
// | |
876881
// 0a---------root1b
877-
it("reorg 1 epoch", async () => {
882+
it("reorg 1 epoch, processState once", async () => {
883+
fileApisBuffer = new Map();
884+
const datastore = getTestDatastore(fileApisBuffer);
885+
cache = new PersistentCheckpointStateCache(
886+
{datastore, logger: testLogger(), shufflingCache: new ShufflingCache()},
887+
{maxCPStateEpochsInMemory: 0, processLateBlock: true}
888+
);
889+
890+
const root1a = Buffer.alloc(32, 100);
891+
const state1a = states["cp0b"].clone();
892+
state1a.slot = 20 * SLOTS_PER_EPOCH + SLOTS_PER_EPOCH + 3;
893+
state1a.blockRoots.set(state1a.slot % SLOTS_PER_HISTORICAL_ROOT, root1a);
894+
// state transition add to cache
895+
cache.add(cp0b, states["cp0b"]);
896+
// do not processState root1a because it's late
897+
898+
// no need to reload cp0b because it's available in block state
899+
const root1b = Buffer.alloc(32, 101);
900+
const state1b = states["cp0a"].clone();
901+
state1b.slot = state1a.slot + 1;
902+
state1b.blockRoots.set(state1b.slot % SLOTS_PER_HISTORICAL_ROOT, root1b);
903+
// state transition add to cache
904+
cache.add(cp0a, states["cp0a"]);
905+
906+
// need to persist 2 checkpoint states
907+
expect(await cache.processState(toHexString(root1b), state1b)).toEqual(2);
908+
// both are persisted
909+
expect(await cache.getStateOrBytes(cp0bHex)).toEqual(stateBytes["cp0b"]);
910+
expect(await cache.getStateOrBytes(cp0aHex)).toEqual(stateBytes["cp0a"]);
911+
});
912+
913+
// Same as above, but we processState for both root1a and root1b
914+
// epoch: 19 20 21 22 23
915+
// |-----------|-----------|-----------|-----------|
916+
// ^^ ^ ^
917+
// || | |
918+
// |0b --root1a|
919+
// | |
920+
// 0a---------root1b
921+
it("reorg 1 epoch, processState twice", async () => {
878922
expect(await cache.processState(toHexString(root0b), states["cp0b"])).toEqual(1);
879923
await assertPersistedCheckpointState([cp0b], [stateBytes["cp0b"]]);
880924
expect(await cache.getStateOrBytes(cp0aHex)).toBeNull();

0 commit comments

Comments
 (0)