Skip to content

Commit 882a303

Browse files
vsmk98jpmsam
authored and committed
Allow raft to recover state after non-graceful shutdown in non-archive mode (#860)
raft: re-enable gcmode full for raft and ensure the node is able to sync up after a non-graceful shutdown
1 parent ef99f6d commit 882a303

File tree

3 files changed

+35
-16
lines changed

3 files changed

+35
-16
lines changed

cmd/utils/flags.go

Lines changed: 1 addition & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -1225,11 +1225,6 @@ func SetEthConfig(ctx *cli.Context, stack *node.Node, cfg *eth.Config) {
12251225
}
12261226
cfg.NoPruning = ctx.GlobalString(GCModeFlag.Name) == "archive"
12271227

1228-
//Quorum - set gcmode=archive for Raft
1229-
if ctx.GlobalBool(RaftModeFlag.Name) {
1230-
log.Info("set gcmode=archive for Raft")
1231-
cfg.NoPruning = true
1232-
}
12331228

12341229
if ctx.GlobalIsSet(CacheFlag.Name) || ctx.GlobalIsSet(CacheGCFlag.Name) {
12351230
cfg.TrieCache = ctx.GlobalInt(CacheFlag.Name) * ctx.GlobalInt(CacheGCFlag.Name) / 100
@@ -1499,15 +1494,8 @@ func MakeChain(ctx *cli.Context, stack *node.Node) (chain *core.BlockChain, chai
14991494
Fatalf("--%s must be either 'full' or 'archive'", GCModeFlag.Name)
15001495
}
15011496

1502-
trieWriteCacheDisabled := ctx.GlobalString(GCModeFlag.Name) == "archive"
1503-
//Quorum - set gcmode=archive for Raft
1504-
if !trieWriteCacheDisabled && ctx.GlobalBool(RaftModeFlag.Name) {
1505-
log.Info("set gcmode=archive for Raft")
1506-
trieWriteCacheDisabled = true
1507-
}
1508-
15091497
cache := &core.CacheConfig{
1510-
Disabled: trieWriteCacheDisabled,
1498+
Disabled: ctx.GlobalString(GCModeFlag.Name) == "archive",
15111499
TrieNodeLimit: eth.DefaultConfig.TrieCache,
15121500
TrieTimeLimit: eth.DefaultConfig.TrieTimeout,
15131501
}

raft/handler.go

Lines changed: 32 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,9 +431,40 @@ func (pm *ProtocolManager) startRaft() {
431431
maybeRaftSnapshot = pm.loadSnapshot() // re-establishes peer connections
432432
}
433433

434-
pm.wal = pm.replayWAL(maybeRaftSnapshot)
434+
loadedWal, entries := pm.replayWAL(maybeRaftSnapshot)
435+
pm.wal = loadedWal
435436

436437
if walExisted {
438+
439+
// If we shut down but didn't manage to flush the state to disk, then it will be the case that we will only sync
440+
// up to the snapshot. In this case, we can use the raft entries that we have saved to replay the blocks
441+
// back into our chain. We output errors but cannot do much if one occurs, since we can't fork to a different
442+
// chain and all other nodes in the network have confirmed these blocks
443+
if maybeRaftSnapshot != nil {
444+
currentChainHead := pm.blockchain.CurrentBlock().Number()
445+
for _, entry := range entries {
446+
if entry.Type == raftpb.EntryNormal {
447+
var block types.Block
448+
if err := rlp.DecodeBytes(entry.Data, &block); err != nil {
449+
log.Error("error decoding block: ", "err", err)
450+
continue
451+
}
452+
453+
if thisBlockHead := pm.blockchain.GetBlockByHash(block.Hash()); thisBlockHead != nil {
454+
// check if the block is already existing in the local chain
455+
// and the block number is greater than current chain head
456+
if thisBlockHeadNum := thisBlockHead.Number(); thisBlockHeadNum.Cmp(currentChainHead) > 0 {
457+
// insert the block only if it's already seen
458+
blocks := []*types.Block{&block}
459+
if _, err := pm.blockchain.InsertChain(blocks); err != nil {
460+
log.Error("error inserting the block into the chain", "number", block.NumberU64(), "hash", block.Hash(), "err", err)
461+
}
462+
}
463+
}
464+
}
465+
}
466+
}
467+
437468
if hardState, _, err := pm.raftStorage.InitialState(); err != nil {
438469
panic(fmt.Sprintf("failed to read initial state from raft while restarting: %v", err))
439470
} else {

raft/wal.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ func (pm *ProtocolManager) openWAL(maybeRaftSnapshot *raftpb.Snapshot) *wal.WAL
3838
return wal
3939
}
4040

41-
func (pm *ProtocolManager) replayWAL(maybeRaftSnapshot *raftpb.Snapshot) *wal.WAL {
41+
func (pm *ProtocolManager) replayWAL(maybeRaftSnapshot *raftpb.Snapshot) (*wal.WAL, []raftpb.Entry) {
4242
log.Info("replaying WAL")
4343
wal := pm.openWAL(maybeRaftSnapshot)
4444

@@ -50,5 +50,5 @@ func (pm *ProtocolManager) replayWAL(maybeRaftSnapshot *raftpb.Snapshot) *wal.WA
5050
pm.raftStorage.SetHardState(hardState)
5151
pm.raftStorage.Append(entries)
5252

53-
return wal
53+
return wal, entries
5454
}

0 commit comments

Comments
 (0)