Skip to content

Commit 6878ba1

Browse files
authored
Retry approval on availability failure if the check is still needed (#6807)
Recovering the POV can fail in situation where the node just restart and the DHT topology wasn't fully discovered yet, so the current node can't connect to most of its Peers. This is bad because for gossiping the assignment you need to be connected to just a few peers, so because we can't approve the candidate and other nodes will see this as a no show. This becomes bad in the scenario where you've got a lot of nodes restarting at the same time, so you end up having a lot of no-shows in the network that are never covered, in that case it makes sense for nodes to actually retry approving the candidate at a later data in time and retry several times if the block containing the candidate wasn't approved. ## TODO - [x] Add a subsystem test. --------- Signed-off-by: Alexandru Gheorghe <[email protected]>
1 parent 023763d commit 6878ba1

4 files changed

Lines changed: 297 additions & 7 deletions

File tree

polkadot/node/core/approval-voting/src/lib.rs

Lines changed: 130 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
//! of others. It uses this information to determine when candidates and blocks have
2222
//! been sufficiently approved to finalize.
2323
24+
use futures_timer::Delay;
2425
use polkadot_node_primitives::{
2526
approval::{
2627
v1::{BlockApprovalMeta, DelayTranche},
@@ -122,6 +123,9 @@ const APPROVAL_CHECKING_TIMEOUT: Duration = Duration::from_secs(120);
122123
const WAIT_FOR_SIGS_TIMEOUT: Duration = Duration::from_millis(500);
123124
const APPROVAL_CACHE_SIZE: u32 = 1024;
124125

126+
/// The maximum number of times we retry to approve a block if is still needed.
127+
const MAX_APPROVAL_RETRIES: u32 = 16;
128+
125129
const APPROVAL_DELAY: Tick = 2;
126130
pub(crate) const LOG_TARGET: &str = "parachain::approval-voting";
127131

@@ -165,6 +169,10 @@ pub struct ApprovalVotingSubsystem {
165169
metrics: Metrics,
166170
clock: Arc<dyn Clock + Send + Sync>,
167171
spawner: Arc<dyn overseer::gen::Spawner + 'static>,
172+
/// The maximum time we retry to approve a block if it is still needed and PoV fetch failed.
173+
max_approval_retries: u32,
174+
/// The backoff before we retry the approval.
175+
retry_backoff: Duration,
168176
}
169177

170178
#[derive(Clone)]
@@ -493,6 +501,8 @@ impl ApprovalVotingSubsystem {
493501
metrics,
494502
Arc::new(SystemClock {}),
495503
spawner,
504+
MAX_APPROVAL_RETRIES,
505+
APPROVAL_CHECKING_TIMEOUT / 2,
496506
)
497507
}
498508

@@ -505,6 +515,8 @@ impl ApprovalVotingSubsystem {
505515
metrics: Metrics,
506516
clock: Arc<dyn Clock + Send + Sync>,
507517
spawner: Arc<dyn overseer::gen::Spawner + 'static>,
518+
max_approval_retries: u32,
519+
retry_backoff: Duration,
508520
) -> Self {
509521
ApprovalVotingSubsystem {
510522
keystore,
@@ -515,6 +527,8 @@ impl ApprovalVotingSubsystem {
515527
metrics,
516528
clock,
517529
spawner,
530+
max_approval_retries,
531+
retry_backoff,
518532
}
519533
}
520534

@@ -706,18 +720,53 @@ enum ApprovalOutcome {
706720
TimedOut,
707721
}
708722

723+
#[derive(Clone)]
724+
struct RetryApprovalInfo {
725+
candidate: CandidateReceipt,
726+
backing_group: GroupIndex,
727+
executor_params: ExecutorParams,
728+
core_index: Option<CoreIndex>,
729+
session_index: SessionIndex,
730+
attempts_remaining: u32,
731+
backoff: Duration,
732+
}
733+
709734
struct ApprovalState {
710735
validator_index: ValidatorIndex,
711736
candidate_hash: CandidateHash,
712737
approval_outcome: ApprovalOutcome,
738+
retry_info: Option<RetryApprovalInfo>,
713739
}
714740

715741
impl ApprovalState {
716742
fn approved(validator_index: ValidatorIndex, candidate_hash: CandidateHash) -> Self {
717-
Self { validator_index, candidate_hash, approval_outcome: ApprovalOutcome::Approved }
743+
Self {
744+
validator_index,
745+
candidate_hash,
746+
approval_outcome: ApprovalOutcome::Approved,
747+
retry_info: None,
748+
}
718749
}
719750
fn failed(validator_index: ValidatorIndex, candidate_hash: CandidateHash) -> Self {
720-
Self { validator_index, candidate_hash, approval_outcome: ApprovalOutcome::Failed }
751+
Self {
752+
validator_index,
753+
candidate_hash,
754+
approval_outcome: ApprovalOutcome::Failed,
755+
retry_info: None,
756+
}
757+
}
758+
759+
fn failed_with_retry(
760+
validator_index: ValidatorIndex,
761+
candidate_hash: CandidateHash,
762+
retry_info: Option<RetryApprovalInfo>,
763+
) -> Self {
764+
Self {
765+
validator_index,
766+
candidate_hash,
767+
approval_outcome: ApprovalOutcome::Failed,
768+
retry_info,
769+
}
721770
}
722771
}
723772

@@ -757,6 +806,7 @@ impl CurrentlyCheckingSet {
757806
candidate_hash,
758807
validator_index,
759808
approval_outcome: ApprovalOutcome::TimedOut,
809+
retry_info: None,
760810
},
761811
Some(approval_state) => approval_state,
762812
}
@@ -1271,25 +1321,63 @@ where
12711321
validator_index,
12721322
candidate_hash,
12731323
approval_outcome,
1324+
retry_info,
12741325
}
12751326
) = approval_state;
12761327

12771328
if matches!(approval_outcome, ApprovalOutcome::Approved) {
12781329
let mut approvals: Vec<Action> = relay_block_hashes
1279-
.into_iter()
1330+
.iter()
12801331
.map(|block_hash|
12811332
Action::IssueApproval(
12821333
candidate_hash,
12831334
ApprovalVoteRequest {
12841335
validator_index,
1285-
block_hash,
1336+
block_hash: *block_hash,
12861337
},
12871338
)
12881339
)
12891340
.collect();
12901341
actions.append(&mut approvals);
12911342
}
12921343

1344+
if let Some(retry_info) = retry_info {
1345+
for block_hash in relay_block_hashes {
1346+
if overlayed_db.load_block_entry(&block_hash).map(|block_info| block_info.is_some()).unwrap_or(false) {
1347+
let sender = to_other_subsystems.clone();
1348+
let spawn_handle = subsystem.spawner.clone();
1349+
let metrics = subsystem.metrics.clone();
1350+
let retry_info = retry_info.clone();
1351+
let executor_params = retry_info.executor_params.clone();
1352+
let candidate = retry_info.candidate.clone();
1353+
1354+
currently_checking_set
1355+
.insert_relay_block_hash(
1356+
candidate_hash,
1357+
validator_index,
1358+
block_hash,
1359+
async move {
1360+
launch_approval(
1361+
sender,
1362+
spawn_handle,
1363+
metrics,
1364+
retry_info.session_index,
1365+
candidate,
1366+
validator_index,
1367+
block_hash,
1368+
retry_info.backing_group,
1369+
executor_params,
1370+
retry_info.core_index,
1371+
retry_info,
1372+
)
1373+
.await
1374+
},
1375+
)
1376+
.await?;
1377+
}
1378+
}
1379+
}
1380+
12931381
actions
12941382
},
12951383
(block_hash, validator_index) = delayed_approvals_timers.select_next_some() => {
@@ -1340,6 +1428,8 @@ where
13401428
&mut approvals_cache,
13411429
&mut subsystem.mode,
13421430
actions,
1431+
subsystem.max_approval_retries,
1432+
subsystem.retry_backoff,
13431433
)
13441434
.await?
13451435
{
@@ -1389,6 +1479,8 @@ pub async fn start_approval_worker<
13891479
metrics,
13901480
clock,
13911481
spawner,
1482+
MAX_APPROVAL_RETRIES,
1483+
APPROVAL_CHECKING_TIMEOUT / 2,
13921484
);
13931485
let backend = DbBackend::new(db.clone(), approval_voting.db_config);
13941486
let spawner = approval_voting.spawner.clone();
@@ -1456,6 +1548,8 @@ async fn handle_actions<
14561548
approvals_cache: &mut LruMap<CandidateHash, ApprovalOutcome>,
14571549
mode: &mut Mode,
14581550
actions: Vec<Action>,
1551+
max_approval_retries: u32,
1552+
retry_backoff: Duration,
14591553
) -> SubsystemResult<bool> {
14601554
let mut conclude = false;
14611555
let mut actions_iter = actions.into_iter();
@@ -1542,6 +1636,16 @@ async fn handle_actions<
15421636
let sender = sender.clone();
15431637
let spawn_handle = spawn_handle.clone();
15441638

1639+
let retry = RetryApprovalInfo {
1640+
candidate: candidate.clone(),
1641+
backing_group,
1642+
executor_params: executor_params.clone(),
1643+
core_index,
1644+
session_index: session,
1645+
attempts_remaining: max_approval_retries,
1646+
backoff: retry_backoff,
1647+
};
1648+
15451649
currently_checking_set
15461650
.insert_relay_block_hash(
15471651
candidate_hash,
@@ -1559,6 +1663,7 @@ async fn handle_actions<
15591663
backing_group,
15601664
executor_params,
15611665
core_index,
1666+
retry,
15621667
)
15631668
.await
15641669
},
@@ -3329,6 +3434,7 @@ async fn launch_approval<
33293434
backing_group: GroupIndex,
33303435
executor_params: ExecutorParams,
33313436
core_index: Option<CoreIndex>,
3437+
retry: RetryApprovalInfo,
33323438
) -> SubsystemResult<RemoteHandle<ApprovalState>> {
33333439
let (a_tx, a_rx) = oneshot::channel();
33343440
let (code_tx, code_rx) = oneshot::channel();
@@ -3360,6 +3466,7 @@ async fn launch_approval<
33603466

33613467
let candidate_hash = candidate.hash();
33623468
let para_id = candidate.descriptor.para_id();
3469+
let mut next_retry = None;
33633470
gum::trace!(target: LOG_TARGET, ?candidate_hash, ?para_id, "Recovering data.");
33643471

33653472
let timer = metrics.time_recover_and_approve();
@@ -3388,7 +3495,6 @@ async fn launch_approval<
33883495
let background = async move {
33893496
// Force the move of the timer into the background task.
33903497
let _timer = timer;
3391-
33923498
let available_data = match a_rx.await {
33933499
Err(_) => return ApprovalState::failed(validator_index, candidate_hash),
33943500
Ok(Ok(a)) => a,
@@ -3399,10 +3505,27 @@ async fn launch_approval<
33993505
target: LOG_TARGET,
34003506
?para_id,
34013507
?candidate_hash,
3508+
attempts_remaining = retry.attempts_remaining,
34023509
"Data unavailable for candidate {:?}",
34033510
(candidate_hash, candidate.descriptor.para_id()),
34043511
);
3405-
// do nothing. we'll just be a no-show and that'll cause others to rise up.
3512+
// Availability could fail if we did not discover much of the network, so
3513+
// let's back off and order the subsystem to retry at a later point if the
3514+
// approval is still needed, because no-show wasn't covered yet.
3515+
if retry.attempts_remaining > 0 {
3516+
Delay::new(retry.backoff).await;
3517+
next_retry = Some(RetryApprovalInfo {
3518+
candidate,
3519+
backing_group,
3520+
executor_params,
3521+
core_index,
3522+
session_index,
3523+
attempts_remaining: retry.attempts_remaining - 1,
3524+
backoff: retry.backoff,
3525+
});
3526+
} else {
3527+
next_retry = None;
3528+
}
34063529
metrics_guard.take().on_approval_unavailable();
34073530
},
34083531
&RecoveryError::ChannelClosed => {
@@ -3433,7 +3556,7 @@ async fn launch_approval<
34333556
metrics_guard.take().on_approval_invalid();
34343557
},
34353558
}
3436-
return ApprovalState::failed(validator_index, candidate_hash)
3559+
return ApprovalState::failed_with_retry(validator_index, candidate_hash, next_retry)
34373560
},
34383561
};
34393562

0 commit comments

Comments
 (0)