From bbd049d4472dc51e68924cabe23d7f2a497f8806 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Wed, 4 Dec 2024 10:38:59 +0100 Subject: [PATCH 1/7] Approval-voting restart Signed-off-by: Alexandru Gheorghe --- polkadot/node/core/approval-voting/src/lib.rs | 130 ++++++++++++++++-- 1 file changed, 117 insertions(+), 13 deletions(-) diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs index 2176cc7675beb..c1238b96200c2 100644 --- a/polkadot/node/core/approval-voting/src/lib.rs +++ b/polkadot/node/core/approval-voting/src/lib.rs @@ -21,6 +21,7 @@ //! of others. It uses this information to determine when candidates and blocks have //! been sufficiently approved to finalize. +use futures_timer::Delay; use polkadot_node_primitives::{ approval::{ v1::{BlockApprovalMeta, DelayTranche}, @@ -122,6 +123,9 @@ const APPROVAL_CHECKING_TIMEOUT: Duration = Duration::from_secs(120); const WAIT_FOR_SIGS_TIMEOUT: Duration = Duration::from_millis(500); const APPROVAL_CACHE_SIZE: u32 = 1024; +/// The maximum number of times we retry to approve a block if is still needed. 
+const MAX_APPROVAL_RETRIES: u32 = 16; + const APPROVAL_DELAY: Tick = 2; pub(crate) const LOG_TARGET: &str = "parachain::approval-voting"; @@ -706,18 +710,54 @@ enum ApprovalOutcome { TimedOut, } +#[derive(Clone)] +struct RetryApprovalInfo { + candidate: CandidateReceipt, + validator_index: ValidatorIndex, + backing_group: GroupIndex, + executor_params: ExecutorParams, + core_index: Option, + relay_block: Hash, + session_index: SessionIndex, + num_attempts: u32, +} + struct ApprovalState { validator_index: ValidatorIndex, candidate_hash: CandidateHash, approval_outcome: ApprovalOutcome, + retry_info: Option, } impl ApprovalState { fn approved(validator_index: ValidatorIndex, candidate_hash: CandidateHash) -> Self { - Self { validator_index, candidate_hash, approval_outcome: ApprovalOutcome::Approved } + Self { + validator_index, + candidate_hash, + approval_outcome: ApprovalOutcome::Approved, + retry_info: None, + } } fn failed(validator_index: ValidatorIndex, candidate_hash: CandidateHash) -> Self { - Self { validator_index, candidate_hash, approval_outcome: ApprovalOutcome::Failed } + Self { + validator_index, + candidate_hash, + approval_outcome: ApprovalOutcome::Failed, + retry_info: None, + } + } + + fn failed_with_retry( + validator_index: ValidatorIndex, + candidate_hash: CandidateHash, + retry_info: Option, + ) -> Self { + Self { + validator_index, + candidate_hash, + approval_outcome: ApprovalOutcome::Failed, + retry_info, + } } } @@ -757,6 +797,7 @@ impl CurrentlyCheckingSet { candidate_hash, validator_index, approval_outcome: ApprovalOutcome::TimedOut, + retry_info: None, }, Some(approval_state) => approval_state, } @@ -1271,18 +1312,19 @@ where validator_index, candidate_hash, approval_outcome, + retry_info, } ) = approval_state; if matches!(approval_outcome, ApprovalOutcome::Approved) { let mut approvals: Vec = relay_block_hashes - .into_iter() + .iter() .map(|block_hash| Action::IssueApproval( candidate_hash, ApprovalVoteRequest { validator_index, - 
block_hash, + block_hash: *block_hash, }, ) ) @@ -1290,6 +1332,52 @@ where actions.append(&mut approvals); } + if let Some(RetryApprovalInfo { + candidate, + validator_index, + backing_group, + executor_params, + core_index, + relay_block, + session_index, + num_attempts: _, + }) = retry_info.clone() { + for block_hash in relay_block_hashes { + if overlayed_db.load_block_entry(&block_hash).map(|block_info| block_info.is_some()).unwrap_or(false) { + let sender = to_other_subsystems.clone(); + let spawn_handle = subsystem.spawner.clone(); + let metrics = subsystem.metrics.clone(); + let retry_info = retry_info.clone(); + let executor_params = executor_params.clone(); + let candidate = candidate.clone(); + + currently_checking_set + .insert_relay_block_hash( + candidate_hash, + validator_index, + relay_block, + async move { + launch_approval( + sender, + spawn_handle, + metrics, + session_index, + candidate, + validator_index, + block_hash, + backing_group, + executor_params, + core_index, + retry_info, + ) + .await + }, + ) + .await?; + } + } + } + actions }, (block_hash, validator_index) = delayed_approvals_timers.select_next_some() => { @@ -1559,6 +1647,7 @@ async fn handle_actions< backing_group, executor_params, core_index, + None, ) .await }, @@ -2466,12 +2555,7 @@ fn schedule_wakeup_action( last_assignment_tick.map(|l| l + APPROVAL_DELAY).filter(|t| t > &tick_now), next_no_show, ) - .map(|tick| Action::ScheduleWakeup { - block_hash, - block_number, - candidate_hash, - tick, - }) + .map(|tick| Action::ScheduleWakeup { block_hash, block_number, candidate_hash, tick }) }, RequiredTranches::Pending { considered, next_no_show, clock_drift, .. 
} => { // select the minimum of `next_no_show`, or the tick of the next non-empty tranche @@ -3323,6 +3407,7 @@ async fn launch_approval< backing_group: GroupIndex, executor_params: ExecutorParams, core_index: Option, + mut retry: Option, ) -> SubsystemResult> { let (a_tx, a_rx) = oneshot::channel(); let (code_tx, code_rx) = oneshot::channel(); @@ -3382,7 +3467,6 @@ async fn launch_approval< let background = async move { // Force the move of the timer into the background task. let _timer = timer; - let available_data = match a_rx.await { Err(_) => return ApprovalState::failed(validator_index, candidate_hash), Ok(Ok(a)) => a, @@ -3396,7 +3480,27 @@ async fn launch_approval< "Data unavailable for candidate {:?}", (candidate_hash, candidate.descriptor.para_id()), ); - // do nothing. we'll just be a no-show and that'll cause others to rise up. + let num_attempts = + retry.as_ref().map(|retry| retry.num_attempts + 1).unwrap_or(1); + let retry_back_off = APPROVAL_CHECKING_TIMEOUT / 2; + // Availability could fail if we did not discover much of the network, so + // let's back off and order the subsystem to retry at a later point if the + // approval is still needed, because no-show wasn't covered yet. 
+ if num_attempts < MAX_APPROVAL_RETRIES { + Delay::new(retry_back_off).await; + retry = Some(RetryApprovalInfo { + candidate, + validator_index, + backing_group, + executor_params, + core_index, + relay_block: block_hash, + session_index, + num_attempts, + }); + } else { + retry = None; + } metrics_guard.take().on_approval_unavailable(); }, &RecoveryError::ChannelClosed => { @@ -3427,7 +3531,7 @@ async fn launch_approval< metrics_guard.take().on_approval_invalid(); }, } - return ApprovalState::failed(validator_index, candidate_hash) + return ApprovalState::failed_with_retry(validator_index, candidate_hash, retry) }, }; From 7d4781c3677edcee3fc09c885f7f92adbec51b5f Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Fri, 20 Dec 2024 16:41:07 +0200 Subject: [PATCH 2/7] Fix in order Signed-off-by: Alexandru Gheorghe --- polkadot/node/core/approval-voting/src/lib.rs | 81 ++++--- .../node/core/approval-voting/src/tests.rs | 229 ++++++++++++++++++ .../subsystem-bench/src/lib/approval/mod.rs | 2 + prdoc/pr_6729.prdoc | 15 ++ 4 files changed, 296 insertions(+), 31 deletions(-) create mode 100644 prdoc/pr_6729.prdoc diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs index 2f9edfa223ca9..f4a2b9b5e5c81 100644 --- a/polkadot/node/core/approval-voting/src/lib.rs +++ b/polkadot/node/core/approval-voting/src/lib.rs @@ -169,6 +169,10 @@ pub struct ApprovalVotingSubsystem { metrics: Metrics, clock: Arc, spawner: Arc, + /// The maximum time we retry to approve a block if it is still needed and PoV fetch failed. + max_approval_retries: u32, + /// The backoff before we retry the approval. 
+ retry_backoff: Duration, } #[derive(Clone)] @@ -497,6 +501,8 @@ impl ApprovalVotingSubsystem { metrics, Arc::new(SystemClock {}), spawner, + MAX_APPROVAL_RETRIES, + APPROVAL_CHECKING_TIMEOUT / 2, ) } @@ -509,6 +515,8 @@ impl ApprovalVotingSubsystem { metrics: Metrics, clock: Arc, spawner: Arc, + max_approval_retries: u32, + retry_backoff: Duration, ) -> Self { ApprovalVotingSubsystem { keystore, @@ -519,6 +527,8 @@ impl ApprovalVotingSubsystem { metrics, clock, spawner, + max_approval_retries, + retry_backoff, } } @@ -713,13 +723,12 @@ enum ApprovalOutcome { #[derive(Clone)] struct RetryApprovalInfo { candidate: CandidateReceipt, - validator_index: ValidatorIndex, backing_group: GroupIndex, executor_params: ExecutorParams, core_index: Option, - relay_block: Hash, session_index: SessionIndex, - num_attempts: u32, + attempts_remaining: u32, + backoff: Duration, } struct ApprovalState { @@ -1332,42 +1341,33 @@ where actions.append(&mut approvals); } - if let Some(RetryApprovalInfo { - candidate, - validator_index, - backing_group, - executor_params, - core_index, - relay_block, - session_index, - num_attempts: _, - }) = retry_info.clone() { + if let Some(retry_info) = retry_info { for block_hash in relay_block_hashes { if overlayed_db.load_block_entry(&block_hash).map(|block_info| block_info.is_some()).unwrap_or(false) { let sender = to_other_subsystems.clone(); let spawn_handle = subsystem.spawner.clone(); let metrics = subsystem.metrics.clone(); let retry_info = retry_info.clone(); - let executor_params = executor_params.clone(); - let candidate = candidate.clone(); + let executor_params = retry_info.executor_params.clone(); + let candidate = retry_info.candidate.clone(); currently_checking_set .insert_relay_block_hash( candidate_hash, validator_index, - relay_block, + block_hash, async move { launch_approval( sender, spawn_handle, metrics, - session_index, + retry_info.session_index, candidate, validator_index, block_hash, - backing_group, + 
retry_info.backing_group, executor_params, - core_index, + retry_info.core_index, retry_info, ) .await @@ -1428,6 +1428,8 @@ where &mut approvals_cache, &mut subsystem.mode, actions, + subsystem.max_approval_retries, + subsystem.retry_backoff, ) .await? { @@ -1477,6 +1479,8 @@ pub async fn start_approval_worker< metrics, clock, spawner, + MAX_APPROVAL_RETRIES, + APPROVAL_CHECKING_TIMEOUT / 2, ); let backend = DbBackend::new(db.clone(), approval_voting.db_config); let spawner = approval_voting.spawner.clone(); @@ -1544,6 +1548,8 @@ async fn handle_actions< approvals_cache: &mut LruMap, mode: &mut Mode, actions: Vec, + max_approval_retries: u32, + retry_backoff: Duration, ) -> SubsystemResult { let mut conclude = false; let mut actions_iter = actions.into_iter(); @@ -1630,6 +1636,16 @@ async fn handle_actions< let sender = sender.clone(); let spawn_handle = spawn_handle.clone(); + let retry = RetryApprovalInfo { + candidate: candidate.clone(), + backing_group, + executor_params: executor_params.clone(), + core_index, + session_index: session, + attempts_remaining: max_approval_retries, + backoff: retry_backoff, + }; + currently_checking_set .insert_relay_block_hash( candidate_hash, @@ -1647,7 +1663,7 @@ async fn handle_actions< backing_group, executor_params, core_index, - None, + retry, ) .await }, @@ -2561,7 +2577,12 @@ fn schedule_wakeup_action( last_assignment_tick.map(|l| l + APPROVAL_DELAY).filter(|t| t > &tick_now), next_no_show, ) - .map(|tick| Action::ScheduleWakeup { block_hash, block_number, candidate_hash, tick }) + .map(|tick| Action::ScheduleWakeup { + block_hash, + block_number, + candidate_hash, + tick, + }) }, RequiredTranches::Pending { considered, next_no_show, clock_drift, .. 
} => { // select the minimum of `next_no_show`, or the tick of the next non-empty tranche @@ -3413,7 +3434,7 @@ async fn launch_approval< backing_group: GroupIndex, executor_params: ExecutorParams, core_index: Option, - mut retry: Option, + retry: RetryApprovalInfo, ) -> SubsystemResult> { let (a_tx, a_rx) = oneshot::channel(); let (code_tx, code_rx) = oneshot::channel(); @@ -3445,6 +3466,7 @@ async fn launch_approval< let candidate_hash = candidate.hash(); let para_id = candidate.descriptor.para_id(); + let mut next_retry = None; gum::trace!(target: LOG_TARGET, ?candidate_hash, ?para_id, "Recovering data."); let timer = metrics.time_recover_and_approve(); @@ -3486,26 +3508,23 @@ async fn launch_approval< "Data unavailable for candidate {:?}", (candidate_hash, candidate.descriptor.para_id()), ); - let num_attempts = - retry.as_ref().map(|retry| retry.num_attempts + 1).unwrap_or(1); let retry_back_off = APPROVAL_CHECKING_TIMEOUT / 2; // Availability could fail if we did not discover much of the network, so // let's back off and order the subsystem to retry at a later point if the // approval is still needed, because no-show wasn't covered yet. 
- if num_attempts < MAX_APPROVAL_RETRIES { + if retry.attempts_remaining > 0 { Delay::new(retry_back_off).await; - retry = Some(RetryApprovalInfo { + next_retry = Some(RetryApprovalInfo { candidate, - validator_index, backing_group, executor_params, core_index, - relay_block: block_hash, session_index, - num_attempts, + attempts_remaining: retry.attempts_remaining - 1, + backoff: retry.backoff, }); } else { - retry = None; + next_retry = None; } metrics_guard.take().on_approval_unavailable(); }, @@ -3537,7 +3556,7 @@ async fn launch_approval< metrics_guard.take().on_approval_invalid(); }, } - return ApprovalState::failed_with_retry(validator_index, candidate_hash, retry) + return ApprovalState::failed_with_retry(validator_index, candidate_hash, next_retry) }, }; diff --git a/polkadot/node/core/approval-voting/src/tests.rs b/polkadot/node/core/approval-voting/src/tests.rs index be569a1de3ecb..4c5bd4850c901 100644 --- a/polkadot/node/core/approval-voting/src/tests.rs +++ b/polkadot/node/core/approval-voting/src/tests.rs @@ -78,6 +78,9 @@ const SLOT_DURATION_MILLIS: u64 = 5000; const TIMEOUT: Duration = Duration::from_millis(2000); +const NUM_APPROVAL_RETRIES: u32 = 3; +const RETRY_BACKOFF: Duration = Duration::from_millis(300); + #[derive(Clone)] struct TestSyncOracle { flag: Arc, @@ -573,6 +576,8 @@ fn test_harness>( Metrics::default(), clock.clone(), Arc::new(SpawnGlue(pool)), + NUM_APPROVAL_RETRIES, + RETRY_BACKOFF, ), assignment_criteria, backend, @@ -4791,6 +4796,230 @@ fn subsystem_relaunches_approval_work_on_restart() { }); } +// Tests that for candidates that we did not approve yet, for which we triggered the assignment and +// the approval work we restart the work to approve it. 
+#[test] +fn subsystem_relaunches_approval_work_on_availability_failure() { + let assignment_criteria = Box::new(MockAssignmentCriteria( + || { + let mut assignments = HashMap::new(); + let _ = assignments.insert( + CoreIndex(0), + approval_db::v2::OurAssignment { + cert: garbage_assignment_cert(AssignmentCertKind::RelayVRFModulo { sample: 0 }) + .into(), + tranche: 0, + validator_index: ValidatorIndex(0), + triggered: false, + } + .into(), + ); + + let _ = assignments.insert( + CoreIndex(0), + approval_db::v2::OurAssignment { + cert: garbage_assignment_cert_v2(AssignmentCertKindV2::RelayVRFModuloCompact { + core_bitfield: vec![CoreIndex(0), CoreIndex(1), CoreIndex(2)] + .try_into() + .unwrap(), + }), + tranche: 0, + validator_index: ValidatorIndex(0), + triggered: false, + } + .into(), + ); + assignments + }, + |_| Ok(0), + )); + let config = HarnessConfigBuilder::default().assignment_criteria(assignment_criteria).build(); + let store = config.backend(); + let store_clone = config.backend(); + + test_harness(config, |test_harness| async move { + let TestHarness { mut virtual_overseer, clock, sync_oracle_handle } = test_harness; + + setup_overseer_with_two_blocks_each_with_one_assignment_triggered( + &mut virtual_overseer, + store, + &clock, + sync_oracle_handle, + ) + .await; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( + _, + _, + )) => { + } + ); + + recover_available_data(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( + _, + _ + )) => { + } + ); + + // Bail early after the assignment has been distributed but before we answer with the mocked + // approval from CandidateValidation. 
+ virtual_overseer + }); + + // Restart a new approval voting subsystem with the same database and major syncing true until + // the last leaf. + let config = HarnessConfigBuilder::default().backend(store_clone).major_syncing(true).build(); + + test_harness(config, |test_harness| async move { + let TestHarness { mut virtual_overseer, clock, sync_oracle_handle } = test_harness; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ChainApi(ChainApiMessage::FinalizedBlockNumber(rx)) => { + rx.send(Ok(0)).unwrap(); + } + ); + + let block_hash = Hash::repeat_byte(0x01); + let fork_block_hash = Hash::repeat_byte(0x02); + let candidate_commitments = CandidateCommitments::default(); + let mut candidate_receipt = dummy_candidate_receipt_v2(block_hash); + candidate_receipt.commitments_hash = candidate_commitments.hash(); + let slot = Slot::from(1); + clock.inner.lock().set_tick(slot_to_tick(slot + 2)); + let (chain_builder, session_info) = build_chain_with_two_blocks_with_one_candidate_each( + block_hash, + fork_block_hash, + slot, + sync_oracle_handle, + candidate_receipt, + ) + .await; + + chain_builder.build(&mut virtual_overseer).await; + + futures_timer::Delay::new(Duration::from_millis(2000)).await; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::RuntimeApi( + RuntimeApiMessage::Request( + _, + RuntimeApiRequest::SessionInfo(_, si_tx), + ) + ) => { + si_tx.send(Ok(Some(session_info.clone()))).unwrap(); + } + ); + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::RuntimeApi( + RuntimeApiMessage::Request( + _, + RuntimeApiRequest::SessionExecutorParams(_, si_tx), + ) + ) => { + // Make sure all SessionExecutorParams calls are not made for the leaf (but for its relay parent) + si_tx.send(Ok(Some(ExecutorParams::default()))).unwrap(); + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::RuntimeApi( + RuntimeApiMessage::Request(_, 
RuntimeApiRequest::NodeFeatures(_, si_tx), ) + ) => { + si_tx.send(Ok(NodeFeatures::EMPTY)).unwrap(); + } + ); + + // On major syncing ending Approval voting should send all the necessary messages for a + // candidate to be approved. + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::NewBlocks( + _, + )) => { + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( + _, + _, + )) => { + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( + _, + _, + )) => { + } + ); + + // Guarantees the approval work has been relaunched. + recover_available_data(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::CandidateValidation(CandidateValidationMessage::ValidateFromExhaustive { + exec_kind, + response_sender, + .. 
+ }) if exec_kind == PvfExecKind::Approval => { + response_sender.send(Ok(ValidationResult::Valid(Default::default(), Default::default()))) + .unwrap(); + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::RuntimeApi(RuntimeApiMessage::Request(_, RuntimeApiRequest::ApprovalVotingParams(_, sender))) => { + let _ = sender.send(Ok(ApprovalVotingParams { + max_approval_coalesce_count: 1, + })); + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeApproval(_)) + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::RuntimeApi(RuntimeApiMessage::Request(_, RuntimeApiRequest::ApprovalVotingParams(_, sender))) => { + let _ = sender.send(Ok(ApprovalVotingParams { + max_approval_coalesce_count: 1, + })); + } + ); + + assert_matches!( + overseer_recv(&mut virtual_overseer).await, + AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeApproval(_)) + ); + + // Assert that there are no more messages being sent by the subsystem + assert!(overseer_recv(&mut virtual_overseer).timeout(TIMEOUT / 2).await.is_none()); + + virtual_overseer + }); +} + // Test that cached approvals, which are candidates that we approved but we didn't issue // the signature yet because we want to coalesce it with more candidate are sent after restart. 
#[test] diff --git a/polkadot/node/subsystem-bench/src/lib/approval/mod.rs b/polkadot/node/subsystem-bench/src/lib/approval/mod.rs index 1b20960a3f8a6..5f1689cb226b3 100644 --- a/polkadot/node/subsystem-bench/src/lib/approval/mod.rs +++ b/polkadot/node/subsystem-bench/src/lib/approval/mod.rs @@ -891,6 +891,8 @@ fn build_overseer( state.approval_voting_parallel_metrics.approval_voting_metrics(), Arc::new(system_clock.clone()), Arc::new(SpawnGlue(spawn_task_handle.clone())), + 1, + Duration::from_secs(1), ); let approval_distribution = ApprovalDistribution::new_with_clock( diff --git a/prdoc/pr_6729.prdoc b/prdoc/pr_6729.prdoc new file mode 100644 index 0000000000000..9eaa67363c9a7 --- /dev/null +++ b/prdoc/pr_6729.prdoc @@ -0,0 +1,15 @@ +# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0 +# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json + +title: Fix order of resending messages after restart + +doc: + - audience: Node Dev + description: | + At restart when dealing with a coalesced approval we might end up in a situation where we sent to + approval-distribution the approval before all assignments covering it, in that case, the approval + is ignored and never distribute, which will lead to no-shows. 
+ +crates: + - name: polkadot-node-core-approval-voting + bump: minor From 1346e7665cfcf591f0b24f7457ed60af5cb2bd22 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Fri, 20 Dec 2024 17:21:08 +0200 Subject: [PATCH 3/7] Add unittest Signed-off-by: Alexandru Gheorghe --- polkadot/node/core/approval-voting/src/lib.rs | 3 +- .../node/core/approval-voting/src/tests.rs | 160 +++++------------- 2 files changed, 40 insertions(+), 123 deletions(-) diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs index f4a2b9b5e5c81..d468477a6332a 100644 --- a/polkadot/node/core/approval-voting/src/lib.rs +++ b/polkadot/node/core/approval-voting/src/lib.rs @@ -3508,12 +3508,11 @@ async fn launch_approval< "Data unavailable for candidate {:?}", (candidate_hash, candidate.descriptor.para_id()), ); - let retry_back_off = APPROVAL_CHECKING_TIMEOUT / 2; // Availability could fail if we did not discover much of the network, so // let's back off and order the subsystem to retry at a later point if the // approval is still needed, because no-show wasn't covered yet. 
if retry.attempts_remaining > 0 { - Delay::new(retry_back_off).await; + Delay::new(retry.backoff).await; next_retry = Some(RetryApprovalInfo { candidate, backing_group, diff --git a/polkadot/node/core/approval-voting/src/tests.rs b/polkadot/node/core/approval-voting/src/tests.rs index 4c5bd4850c901..57e9322ca313b 100644 --- a/polkadot/node/core/approval-voting/src/tests.rs +++ b/polkadot/node/core/approval-voting/src/tests.rs @@ -3207,6 +3207,20 @@ async fn recover_available_data(virtual_overseer: &mut VirtualOverseer) { ); } +async fn recover_available_data_failure(virtual_overseer: &mut VirtualOverseer) { + let available_data = RecoveryError::Unavailable; + + assert_matches!( + virtual_overseer.recv().await, + AllMessages::AvailabilityRecovery( + AvailabilityRecoveryMessage::RecoverAvailableData(_, _, _, _, tx) + ) => { + tx.send(Err(available_data)).unwrap(); + }, + "overseer did not receive recover available data message", + ); +} + struct TriggersAssignmentConfig { our_assigned_tranche: DelayTranche, assign_validator_tranche: F1, @@ -4796,18 +4810,19 @@ fn subsystem_relaunches_approval_work_on_restart() { }); } -// Tests that for candidates that we did not approve yet, for which we triggered the assignment and -// the approval work we restart the work to approve it. +/// Test that we retry the approval of candidate on availability failure, up to max retries. 
#[test] fn subsystem_relaunches_approval_work_on_availability_failure() { let assignment_criteria = Box::new(MockAssignmentCriteria( || { let mut assignments = HashMap::new(); + let _ = assignments.insert( CoreIndex(0), approval_db::v2::OurAssignment { - cert: garbage_assignment_cert(AssignmentCertKind::RelayVRFModulo { sample: 0 }) - .into(), + cert: garbage_assignment_cert_v2(AssignmentCertKindV2::RelayVRFModuloCompact { + core_bitfield: vec![CoreIndex(0), CoreIndex(2)].try_into().unwrap(), + }), tranche: 0, validator_index: ValidatorIndex(0), triggered: false, @@ -4816,12 +4831,10 @@ fn subsystem_relaunches_approval_work_on_availability_failure() { ); let _ = assignments.insert( - CoreIndex(0), + CoreIndex(1), approval_db::v2::OurAssignment { - cert: garbage_assignment_cert_v2(AssignmentCertKindV2::RelayVRFModuloCompact { - core_bitfield: vec![CoreIndex(0), CoreIndex(1), CoreIndex(2)] - .try_into() - .unwrap(), + cert: garbage_assignment_cert_v2(AssignmentCertKindV2::RelayVRFDelay { + core_index: CoreIndex(1), }), tranche: 0, validator_index: ValidatorIndex(0), @@ -4840,7 +4853,7 @@ fn subsystem_relaunches_approval_work_on_availability_failure() { test_harness(config, |test_harness| async move { let TestHarness { mut virtual_overseer, clock, sync_oracle_handle } = test_harness; - setup_overseer_with_two_blocks_each_with_one_assignment_triggered( + setup_overseer_with_blocks_with_two_assignments_triggered( &mut virtual_overseer, store, &clock, @@ -4848,6 +4861,9 @@ fn subsystem_relaunches_approval_work_on_availability_failure() { ) .await; + // We have two candidates for one we are going to fail the availability for up to + // max_retries and for the other we are going to succeed on the last retry, so we should + // see the approval being distributed. 
assert_matches!( overseer_recv(&mut virtual_overseer).await, AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( @@ -4857,7 +4873,7 @@ fn subsystem_relaunches_approval_work_on_availability_failure() { } ); - recover_available_data(&mut virtual_overseer).await; + recover_available_data_failure(&mut virtual_overseer).await; fetch_validation_code(&mut virtual_overseer).await; assert_matches!( @@ -4869,107 +4885,24 @@ fn subsystem_relaunches_approval_work_on_availability_failure() { } ); - // Bail early after the assignment has been distributed but before we answer with the mocked - // approval from CandidateValidation. - virtual_overseer - }); - - // Restart a new approval voting subsystem with the same database and major syncing true until - // the last leaf. - let config = HarnessConfigBuilder::default().backend(store_clone).major_syncing(true).build(); - - test_harness(config, |test_harness| async move { - let TestHarness { mut virtual_overseer, clock, sync_oracle_handle } = test_harness; - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::ChainApi(ChainApiMessage::FinalizedBlockNumber(rx)) => { - rx.send(Ok(0)).unwrap(); - } - ); - - let block_hash = Hash::repeat_byte(0x01); - let fork_block_hash = Hash::repeat_byte(0x02); - let candidate_commitments = CandidateCommitments::default(); - let mut candidate_receipt = dummy_candidate_receipt_v2(block_hash); - candidate_receipt.commitments_hash = candidate_commitments.hash(); - let slot = Slot::from(1); - clock.inner.lock().set_tick(slot_to_tick(slot + 2)); - let (chain_builder, session_info) = build_chain_with_two_blocks_with_one_candidate_each( - block_hash, - fork_block_hash, - slot, - sync_oracle_handle, - candidate_receipt, - ) - .await; - - chain_builder.build(&mut virtual_overseer).await; - - futures_timer::Delay::new(Duration::from_millis(2000)).await; + recover_available_data_failure(&mut virtual_overseer).await; + fetch_validation_code(&mut 
virtual_overseer).await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi( - RuntimeApiMessage::Request( - _, - RuntimeApiRequest::SessionInfo(_, si_tx), - ) - ) => { - si_tx.send(Ok(Some(session_info.clone()))).unwrap(); - } - ); - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi( - RuntimeApiMessage::Request( - _, - RuntimeApiRequest::SessionExecutorParams(_, si_tx), - ) - ) => { - // Make sure all SessionExecutorParams calls are not made for the leaf (but for its relay parent) - si_tx.send(Ok(Some(ExecutorParams::default()))).unwrap(); - } - ); + recover_available_data_failure(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi( - RuntimeApiMessage::Request(_, RuntimeApiRequest::NodeFeatures(_, si_tx), ) - ) => { - si_tx.send(Ok(NodeFeatures::EMPTY)).unwrap(); - } - ); + recover_available_data_failure(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; - // On major syncing ending Approval voting should send all the necessary messages for a - // candidate to be approved. 
- assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::ApprovalDistribution(ApprovalDistributionMessage::NewBlocks( - _, - )) => { - } - ); + recover_available_data_failure(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( - _, - _, - )) => { - } - ); + recover_available_data_failure(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeAssignment( - _, - _, - )) => { - } - ); + recover_available_data_failure(&mut virtual_overseer).await; + fetch_validation_code(&mut virtual_overseer).await; - // Guarantees the approval work has been relaunched. recover_available_data(&mut virtual_overseer).await; fetch_validation_code(&mut virtual_overseer).await; @@ -4984,21 +4917,6 @@ fn subsystem_relaunches_approval_work_on_availability_failure() { .unwrap(); } ); - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::RuntimeApi(RuntimeApiMessage::Request(_, RuntimeApiRequest::ApprovalVotingParams(_, sender))) => { - let _ = sender.send(Ok(ApprovalVotingParams { - max_approval_coalesce_count: 1, - })); - } - ); - - assert_matches!( - overseer_recv(&mut virtual_overseer).await, - AllMessages::ApprovalDistribution(ApprovalDistributionMessage::DistributeApproval(_)) - ); - assert_matches!( overseer_recv(&mut virtual_overseer).await, AllMessages::RuntimeApi(RuntimeApiMessage::Request(_, RuntimeApiRequest::ApprovalVotingParams(_, sender))) => { From 17f8b1566e9e3cab539458c71a992675a299c184 Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe <49718502+alexggh@users.noreply.github.com> Date: Fri, 20 Dec 2024 17:28:36 +0200 Subject: [PATCH 4/7] Delete prdoc/pr_6729.prdoc --- 
prdoc/pr_6729.prdoc | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 prdoc/pr_6729.prdoc diff --git a/prdoc/pr_6729.prdoc b/prdoc/pr_6729.prdoc deleted file mode 100644 index 9eaa67363c9a7..0000000000000 --- a/prdoc/pr_6729.prdoc +++ /dev/null @@ -1,15 +0,0 @@ -# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0 -# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json - -title: Fix order of resending messages after restart - -doc: - - audience: Node Dev - description: | - At restart when dealing with a coalesced approval we might end up in a situation where we sent to - approval-distribution the approval before all assignments covering it, in that case, the approval - is ignored and never distribute, which will lead to no-shows. - -crates: - - name: polkadot-node-core-approval-voting - bump: minor From 3acbceaf4655ab29053c55b89d270f968a0759df Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Tue, 7 Jan 2025 18:08:48 +0200 Subject: [PATCH 5/7] Extend log Signed-off-by: Alexandru Gheorghe --- polkadot/node/core/approval-voting/src/lib.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/polkadot/node/core/approval-voting/src/lib.rs b/polkadot/node/core/approval-voting/src/lib.rs index d468477a6332a..27361df373104 100644 --- a/polkadot/node/core/approval-voting/src/lib.rs +++ b/polkadot/node/core/approval-voting/src/lib.rs @@ -3505,6 +3505,7 @@ async fn launch_approval< target: LOG_TARGET, ?para_id, ?candidate_hash, + attempts_remaining = retry.attempts_remaining, "Data unavailable for candidate {:?}", (candidate_hash, candidate.descriptor.para_id()), ); From 226035930fb81e7a6e593710fc1ddc9867afa98a Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Wed, 8 Jan 2025 10:47:40 +0200 Subject: [PATCH 6/7] Make clippy happy Signed-off-by: Alexandru Gheorghe --- polkadot/node/core/approval-voting/src/tests.rs | 1 - 1 file changed, 1 deletion(-) diff --git 
a/polkadot/node/core/approval-voting/src/tests.rs b/polkadot/node/core/approval-voting/src/tests.rs index 57e9322ca313b..b72993fe1a94a 100644 --- a/polkadot/node/core/approval-voting/src/tests.rs +++ b/polkadot/node/core/approval-voting/src/tests.rs @@ -4848,7 +4848,6 @@ fn subsystem_relaunches_approval_work_on_availability_failure() { )); let config = HarnessConfigBuilder::default().assignment_criteria(assignment_criteria).build(); let store = config.backend(); - let store_clone = config.backend(); test_harness(config, |test_harness| async move { let TestHarness { mut virtual_overseer, clock, sync_oracle_handle } = test_harness; From 2d3199def713ee1f4539aa31e68be46477af4acf Mon Sep 17 00:00:00 2001 From: Alexandru Gheorghe Date: Mon, 13 Jan 2025 17:28:58 +0200 Subject: [PATCH 7/7] Add prdoc Signed-off-by: Alexandru Gheorghe --- prdoc/pr_6807.prdoc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 prdoc/pr_6807.prdoc diff --git a/prdoc/pr_6807.prdoc b/prdoc/pr_6807.prdoc new file mode 100644 index 0000000000000..b9564dfb2fe26 --- /dev/null +++ b/prdoc/pr_6807.prdoc @@ -0,0 +1,19 @@ +# Schema: Polkadot SDK PRDoc Schema (prdoc) v1.0.0 +# See doc at https://raw.githubusercontent.com/paritytech/polkadot-sdk/master/prdoc/schema_user.json + +title: Retry approval on availability failure if the check is still needed + +doc: + - audience: Node Dev + description: | + Recovering the POV can fail in situations where the node has just restarted and the DHT topology + wasn't fully discovered yet, so the current node can't connect to most of its peers. + This is bad because for gossiping the assignment you need to be connected to just a few + peers, so because we can't approve the candidate other nodes will see this as a no-show. + Fix it by retrying to approve a candidate for a fixed number of attempts if the block is + still needed. + + +crates: + - name: polkadot-node-core-approval-voting + bump: minor