Skip to content

Commit ad344f5

Browse files
lexnvnkpar
authored andcommitted
sync/fix: Clear gap sync on known imported blocks (paritytech#8445)
This PR ensures that warp sync gaps are properly cleared when known blocks are imported. Previously, gaps were only removed in response to `ImportedUnknown` events. This limitation caused issues for asset-hub and bridge-hub collators, which remained stuck in the "Block history" state without progressing. The root cause lies in the client.info() reporting a gap during node startup or restart (ie block verification fails). In some cases, a peer may respond with the missing blocks after we’ve already imported them locally, leaving the gap open. Grafana link: https://grafana.teleport.parity.io/goto/jCcsBLxNg?orgId=1 Traces from production: ``` 2025-05-06 12:55:34.251 DEBUG main sync: [Parachain] Starting gap sync #4935955 - #4935955 2025-05-06 12:55:34.558 TRACE tokio-runtime-worker sync: [Parachain] New gap block request for 12D3KooWAVQMhkXmc5ueSYasdsRWQbKus2YGZ6HDZUB4ViJMCxXy, (best:5103253, common:5103253) BlockRequest { id: 0, fields: HEADER | BODY | JUSTIFICATION, from: Number(4935955), direction: Descending, max: Some(1) } 2025-05-06 12:55:34.558 TRACE tokio-runtime-worker sync: [Parachain] Processed `SyncingAction::StartRequest` to 12D3KooWAVQMhkXmc5ueSYasdsRWQbKus2YGZ6HDZUB4ViJMCxXy with strategy key StrategyKey("ChainSync"). 2025-05-06 12:55:34.608 TRACE tokio-runtime-worker sync: [Parachain] BlockResponse 0 from 12D3KooWAVQMhkXmc5ueSYasdsRWQbKus2YGZ6HDZUB4ViJMCxXy with 1 blocks (4935955) 2025-05-06 12:55:34.608 DEBUG tokio-runtime-worker sync: [Parachain] Drained 1 gap blocks from 4935954 2025-05-06 12:55:35.511 TRACE tokio-runtime-worker sync::import-queue: [Parachain] Starting import of 1 blocks (4935955) 2025-05-06 12:55:35.517 TRACE tokio-runtime-worker sync::import-queue: [Parachain] Block already in chain 4935955: 0x63db2b40cccac020fbc922e5e98bb3955f4cdaa823a2be85ecf22776745ccacc 2025-05-06 12:55:35.517 TRACE tokio-runtime-worker sync::import-queue: [Parachain] Block imported successfully Some(4935955) (0x63db…cacc) 2025-05-06 12:55:35.517 TRACE tokio-runtime-worker sync: [Parachain] Cleared blocks from 4935955 to 4935956 ``` ### Testing Done Added two tests to verify that warp sync gaps are correctly cleared under both block import scenarios. The first test closely follows the operations performed by the node, while the second one emulates the imports. ### Next Steps Added extra debug logs to monitor if the issue persists (pointing towards a corupt database -- ie client.info() always has the gap present). Closes: paritytech#8416 cc @paritytech/networking --------- Signed-off-by: Alexandru Vasile <alexandru.vasile@parity.io> Co-authored-by: Nikolai Kozlov <1431590+nkpar@users.noreply.github.com>
1 parent 0f69fab commit ad344f5

4 files changed

Lines changed: 276 additions & 12 deletions

File tree

prdoc/pr_8445.prdoc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
title: Fix the clearing of gap sync on known imported blocks
2+
3+
doc:
4+
- audience: [Node Dev, Node Operator]
5+
description: |
6+
This PR ensures that warp sync gaps are properly cleared when known blocks are imported. Previously, gaps were only removed in response to ImportedUnknown events.
7+
This limitation caused issues for asset-hub and bridge-hub collators, which remained stuck in the "Block history" state without progressing.
8+
The root cause lies in the client.info() reporting a gap during node startup or restart (ie block verification fails). In some cases, a peer may respond with the missing blocks after we’ve already imported them locally, leaving the gap open.
9+
10+
crates:
11+
- name: sc-network
12+
bump: patch

substrate/client/network/sync/src/strategy.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,34 @@ pub enum SyncingAction<B: BlockT> {
176176
Finished,
177177
}
178178

179+
// Note: Ideally we can deduce this information with #[derive(derive_more::Debug)].
180+
// However, we'd need a bump to the latest version 2 of the crate.
181+
impl<B> std::fmt::Debug for SyncingAction<B>
182+
where
183+
B: BlockT,
184+
{
185+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
186+
match &self {
187+
Self::StartRequest { peer_id, key, remove_obsolete, .. } => {
188+
write!(
189+
f,
190+
"StartRequest {{ peer_id: {:?}, key: {:?}, remove_obsolete: {:?} }}",
191+
peer_id, key, remove_obsolete
192+
)
193+
},
194+
Self::CancelRequest { peer_id, key } => {
195+
write!(f, "CancelRequest {{ peer_id: {:?}, key: {:?} }}", peer_id, key)
196+
},
197+
Self::DropPeer(peer) => write!(f, "DropPeer({:?})", peer),
198+
Self::ImportBlocks { blocks, .. } => write!(f, "ImportBlocks({:?})", blocks),
199+
Self::ImportJustifications { hash, number, .. } => {
200+
write!(f, "ImportJustifications({:?}, {:?})", hash, number)
201+
},
202+
Self::Finished => write!(f, "Finished"),
203+
}
204+
}
205+
}
206+
179207
impl<B: BlockT> SyncingAction<B> {
180208
/// Returns `true` if the syncing action has completed.
181209
pub fn is_finished(&self) -> bool {

substrate/client/network/sync/src/strategy/chain_sync.rs

Lines changed: 22 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -674,10 +674,12 @@ where
674674
has_error |= result.is_err();
675675

676676
match result {
677-
Ok(BlockImportStatus::ImportedKnown(number, peer_id)) =>
677+
Ok(BlockImportStatus::ImportedKnown(number, peer_id)) => {
678678
if let Some(peer) = peer_id {
679679
self.update_peer_common_number(&peer, number);
680-
},
680+
}
681+
self.complete_gap_if_target(number);
682+
},
681683
Ok(BlockImportStatus::ImportedUnknown(number, aux, peer_id)) => {
682684
if aux.clear_justification_requests {
683685
trace!(
@@ -720,15 +722,8 @@ where
720722
self.mode = ChainSyncMode::Full;
721723
self.restart();
722724
}
723-
let gap_sync_complete =
724-
self.gap_sync.as_ref().map_or(false, |s| s.target == number);
725-
if gap_sync_complete {
726-
info!(
727-
target: LOG_TARGET,
728-
"Block history download is complete."
729-
);
730-
self.gap_sync = None;
731-
}
725+
726+
self.complete_gap_if_target(number);
732727
},
733728
Err(BlockImportError::IncompleteHeader(peer_id)) =>
734729
if let Some(peer) = peer_id {
@@ -992,6 +987,18 @@ where
992987
Ok(sync)
993988
}
994989

990+
/// Complete the gap sync if the target number is reached and there is a gap.
991+
fn complete_gap_if_target(&mut self, number: NumberFor<B>) {
992+
let gap_sync_complete = self.gap_sync.as_ref().map_or(false, |s| s.target == number);
993+
if gap_sync_complete {
994+
info!(
995+
target: LOG_TARGET,
996+
"Block history download is complete."
997+
);
998+
self.gap_sync = None;
999+
}
1000+
}
1001+
9951002
#[must_use]
9961003
fn add_peer_inner(
9971004
&mut self,
@@ -1676,6 +1683,8 @@ where
16761683
/// state for.
16771684
fn reset_sync_start_point(&mut self) -> Result<(), ClientError> {
16781685
let info = self.client.info();
1686+
debug!(target: LOG_TARGET, "Restarting sync with client info {info:?}");
1687+
16791688
if matches!(self.mode, ChainSyncMode::LightState { .. }) && info.finalized_state.is_some() {
16801689
warn!(
16811690
target: LOG_TARGET,
@@ -1705,7 +1714,8 @@ where
17051714
}
17061715

17071716
if let Some(BlockGap { start, end, .. }) = info.block_gap {
1708-
debug!(target: LOG_TARGET, "Starting gap sync #{start} - #{end}");
1717+
let old_gap = self.gap_sync.take().map(|g| (g.best_queued_number, g.target));
1718+
debug!(target: LOG_TARGET, "Starting gap sync #{start} - #{end} (old gap best and target: {old_gap:?})");
17091719
self.gap_sync = Some(GapSync {
17101720
best_queued_number: start - One::one(),
17111721
target: end,

substrate/client/network/sync/src/strategy/chain_sync/test.rs

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1127,3 +1127,217 @@ fn request_across_forks() {
11271127
assert!(sync.is_known(&block.header.parent_hash()));
11281128
}
11291129
}
1130+
1131+
/// This test simulates a scenario where we get a `VerificationFailed` error
1132+
/// while a gap reported by our client.info(). Then the gap is filled after
1133+
/// the restart of the sync process. The test ensures that the gap is properly closed
1134+
/// on importing unknown blocks (ie blocks we don't have in our chain yet).
1135+
#[test]
1136+
fn sync_verification_failed_with_gap_filled() {
1137+
sp_tracing::try_init_simple();
1138+
1139+
// We only care about 2 iterations of the loop (since max blocks per request is 64).
1140+
const TEST_TARGET: u32 = 64 * 3;
1141+
1142+
let blocks = {
1143+
let client = TestClientBuilder::new().build();
1144+
(0..TEST_TARGET).map(|_| build_block(&client, None, false)).collect::<Vec<_>>()
1145+
};
1146+
1147+
let client = Arc::new(TestClientBuilder::new().build());
1148+
let info = client.info();
1149+
1150+
let mut sync = ChainSync::new(
1151+
ChainSyncMode::Full,
1152+
client.clone(),
1153+
5,
1154+
64,
1155+
ProtocolName::Static(""),
1156+
Arc::new(MockBlockDownloader::new()),
1157+
None,
1158+
std::iter::empty(),
1159+
)
1160+
.unwrap();
1161+
1162+
let peer_id1 = PeerId::random();
1163+
let peer_id2 = PeerId::random();
1164+
1165+
let best_block = blocks.last().unwrap().clone();
1166+
let max_blocks_to_request = sync.max_blocks_per_request;
1167+
1168+
let status = sync.status();
1169+
assert!(status.warp_sync.is_none());
1170+
log::info!(target: LOG_TARGET, "Before adding peers: {status:?}");
1171+
1172+
// Connect the node we will sync from
1173+
sync.add_peer(peer_id1, best_block.hash(), *best_block.header().number());
1174+
sync.add_peer(peer_id2, info.best_hash, 0);
1175+
1176+
let mut best_block_num = 0;
1177+
assert_eq!(sync.best_queued_number, 0);
1178+
1179+
// Two iterations to simulate the gap filling.
1180+
for loop_index in 0..2 {
1181+
log::info!(target: LOG_TARGET, "Loop index: {loop_index}");
1182+
1183+
// Build the request.
1184+
let request = get_block_request(
1185+
&mut sync,
1186+
FromBlock::Number(max_blocks_to_request as u64 + best_block_num as u64),
1187+
max_blocks_to_request as u32,
1188+
&peer_id1,
1189+
);
1190+
let from = unwrap_from_block_number(request.from.clone());
1191+
let mut resp_blocks = blocks[best_block_num as usize..from as usize].to_vec();
1192+
resp_blocks.reverse();
1193+
let response = create_block_response(resp_blocks.clone());
1194+
1195+
// Clear old actions to not deal with them
1196+
let _ = sync.take_actions();
1197+
1198+
let status = sync.status();
1199+
log::info!(target: LOG_TARGET, "Status before on_block_data: {status:?}");
1200+
1201+
sync.on_block_data(&peer_id1, Some(request.clone()), response.clone()).unwrap();
1202+
1203+
let actions = sync.take_actions().collect::<Vec<_>>();
1204+
assert_eq!(actions.len(), 1);
1205+
assert!(matches!(
1206+
&actions[0],
1207+
SyncingAction::ImportBlocks{ origin: _, blocks } if blocks.len() ==
1208+
max_blocks_to_request as usize, ));
1209+
1210+
let status = sync.status();
1211+
log::info!(target: LOG_TARGET, "Status before processing blocks: {status:?}");
1212+
1213+
best_block_num += max_blocks_to_request as u32;
1214+
1215+
let responses: Vec<_> = resp_blocks
1216+
.iter()
1217+
.rev()
1218+
.map(|b| {
1219+
(
1220+
Ok(BlockImportStatus::ImportedUnknown(
1221+
*b.header().number(),
1222+
Default::default(),
1223+
Some(peer_id1),
1224+
)),
1225+
b.hash(),
1226+
)
1227+
})
1228+
.collect();
1229+
1230+
sync.on_blocks_processed(
1231+
max_blocks_to_request as usize,
1232+
max_blocks_to_request as usize,
1233+
responses,
1234+
);
1235+
1236+
let status = sync.status();
1237+
log::info!(target: LOG_TARGET, "Status after processing blocks: {status:?}");
1238+
1239+
// Import the blocks as final to the client.
1240+
resp_blocks
1241+
.into_iter()
1242+
.rev()
1243+
.for_each(|b| block_on(client.import_as_final(BlockOrigin::Own, b)).unwrap());
1244+
1245+
if loop_index == 0 {
1246+
log::info!(target: LOG_TARGET, "Peer state {:#?}", sync.peers);
1247+
1248+
// Both peers are in the available state.
1249+
match sync.peers.get(&peer_id1) {
1250+
Some(peer) => assert_eq!(peer.state, PeerSyncState::Available),
1251+
None => panic!("Peer not found"),
1252+
}
1253+
match sync.peers.get(&peer_id2) {
1254+
Some(peer) => assert_eq!(peer.state, PeerSyncState::Available),
1255+
None => panic!("Peer not found"),
1256+
}
1257+
1258+
// Simulate that we encounter a `VerificationFailed` error while processing the blocks.
1259+
// During this error, the sync will enter the `AncestorSearch` state for the peer 1
1260+
// because of the sync restart operation. Then, the peer will be in the `Available`
1261+
// state after the ancestor search is done. However, we still have the gap present.
1262+
sync.gap_sync = Some(GapSync {
1263+
best_queued_number: 64 as u64,
1264+
target: 84 as u64,
1265+
blocks: BlockCollection::new(),
1266+
});
1267+
} else if loop_index == 1 {
1268+
if sync.gap_sync.is_none() {
1269+
log::info!(target: LOG_TARGET, "Gap successfully closed");
1270+
} else {
1271+
panic!("Gap not closed after the second loop");
1272+
}
1273+
}
1274+
}
1275+
}
1276+
1277+
#[test]
1278+
fn sync_gap_filled_regardless_of_blocks_origin() {
1279+
sp_tracing::try_init_simple();
1280+
1281+
let blocks = {
1282+
let client = TestClientBuilder::new().build();
1283+
(0..2).map(|_| build_block(&client, None, false)).collect::<Vec<_>>()
1284+
};
1285+
1286+
let client = Arc::new(TestClientBuilder::new().build());
1287+
let mut sync = ChainSync::new(
1288+
ChainSyncMode::Full,
1289+
client.clone(),
1290+
5,
1291+
64,
1292+
ProtocolName::Static(""),
1293+
Arc::new(MockBlockDownloader::new()),
1294+
None,
1295+
std::iter::empty(),
1296+
)
1297+
.unwrap();
1298+
1299+
let peer_id1 = PeerId::random();
1300+
1301+
// BlockImportStatus::ImportedUnknown clears the gap.
1302+
{
1303+
// Simulate that we encounter a `VerificationFailed` error while processing the blocks
1304+
// and the client.info() reports a gap.
1305+
sync.gap_sync = Some(GapSync {
1306+
best_queued_number: *blocks[0].header().number(),
1307+
target: *blocks[0].header().number(),
1308+
blocks: BlockCollection::new(),
1309+
});
1310+
1311+
// Announce the block as unknown.
1312+
let results = [(
1313+
Ok(BlockImportStatus::ImportedUnknown(
1314+
*blocks[0].header().number(),
1315+
Default::default(),
1316+
Some(peer_id1),
1317+
)),
1318+
blocks[0].hash(),
1319+
)];
1320+
sync.on_blocks_processed(1, 1, results.into_iter().collect());
1321+
// Ensure the gap is cleared out.
1322+
assert!(sync.gap_sync.is_none());
1323+
}
1324+
1325+
// BlockImportStatus::ImportedKnown also clears the gap.
1326+
{
1327+
sync.gap_sync = Some(GapSync {
1328+
best_queued_number: *blocks[0].header().number(),
1329+
target: *blocks[0].header().number(),
1330+
blocks: BlockCollection::new(),
1331+
});
1332+
1333+
// Announce the block as known.
1334+
let results = [(
1335+
Ok(BlockImportStatus::ImportedKnown(*blocks[0].header().number(), Some(peer_id1))),
1336+
blocks[0].hash(),
1337+
)];
1338+
1339+
sync.on_blocks_processed(1, 1, results.into_iter().collect());
1340+
// Ensure the gap is cleared out.
1341+
assert!(sync.gap_sync.is_none());
1342+
}
1343+
}

0 commit comments

Comments
 (0)