2121//! of others. It uses this information to determine when candidates and blocks have
2222//! been sufficiently approved to finalize.
2323
24+ use futures_timer:: Delay ;
2425use polkadot_node_primitives:: {
2526 approval:: {
2627 v1:: { BlockApprovalMeta , DelayTranche } ,
@@ -122,6 +123,9 @@ const APPROVAL_CHECKING_TIMEOUT: Duration = Duration::from_secs(120);
122123const WAIT_FOR_SIGS_TIMEOUT : Duration = Duration :: from_millis ( 500 ) ;
123124const APPROVAL_CACHE_SIZE : u32 = 1024 ;
124125
126+ /// The maximum number of times we retry to approve a block if is still needed.
127+ const MAX_APPROVAL_RETRIES : u32 = 16 ;
128+
125129const APPROVAL_DELAY : Tick = 2 ;
126130pub ( crate ) const LOG_TARGET : & str = "parachain::approval-voting" ;
127131
@@ -165,6 +169,10 @@ pub struct ApprovalVotingSubsystem {
165169 metrics : Metrics ,
166170 clock : Arc < dyn Clock + Send + Sync > ,
167171 spawner : Arc < dyn overseer:: gen:: Spawner + ' static > ,
172+ /// The maximum time we retry to approve a block if it is still needed and PoV fetch failed.
173+ max_approval_retries : u32 ,
174+ /// The backoff before we retry the approval.
175+ retry_backoff : Duration ,
168176}
169177
170178#[ derive( Clone ) ]
@@ -493,6 +501,8 @@ impl ApprovalVotingSubsystem {
493501 metrics,
494502 Arc :: new ( SystemClock { } ) ,
495503 spawner,
504+ MAX_APPROVAL_RETRIES ,
505+ APPROVAL_CHECKING_TIMEOUT / 2 ,
496506 )
497507 }
498508
@@ -505,6 +515,8 @@ impl ApprovalVotingSubsystem {
505515 metrics : Metrics ,
506516 clock : Arc < dyn Clock + Send + Sync > ,
507517 spawner : Arc < dyn overseer:: gen:: Spawner + ' static > ,
518+ max_approval_retries : u32 ,
519+ retry_backoff : Duration ,
508520 ) -> Self {
509521 ApprovalVotingSubsystem {
510522 keystore,
@@ -515,6 +527,8 @@ impl ApprovalVotingSubsystem {
515527 metrics,
516528 clock,
517529 spawner,
530+ max_approval_retries,
531+ retry_backoff,
518532 }
519533 }
520534
@@ -706,18 +720,53 @@ enum ApprovalOutcome {
706720 TimedOut ,
707721}
708722
723+ #[ derive( Clone ) ]
724+ struct RetryApprovalInfo {
725+ candidate : CandidateReceipt ,
726+ backing_group : GroupIndex ,
727+ executor_params : ExecutorParams ,
728+ core_index : Option < CoreIndex > ,
729+ session_index : SessionIndex ,
730+ attempts_remaining : u32 ,
731+ backoff : Duration ,
732+ }
733+
709734struct ApprovalState {
710735 validator_index : ValidatorIndex ,
711736 candidate_hash : CandidateHash ,
712737 approval_outcome : ApprovalOutcome ,
738+ retry_info : Option < RetryApprovalInfo > ,
713739}
714740
715741impl ApprovalState {
716742 fn approved ( validator_index : ValidatorIndex , candidate_hash : CandidateHash ) -> Self {
717- Self { validator_index, candidate_hash, approval_outcome : ApprovalOutcome :: Approved }
743+ Self {
744+ validator_index,
745+ candidate_hash,
746+ approval_outcome : ApprovalOutcome :: Approved ,
747+ retry_info : None ,
748+ }
718749 }
719750 fn failed ( validator_index : ValidatorIndex , candidate_hash : CandidateHash ) -> Self {
720- Self { validator_index, candidate_hash, approval_outcome : ApprovalOutcome :: Failed }
751+ Self {
752+ validator_index,
753+ candidate_hash,
754+ approval_outcome : ApprovalOutcome :: Failed ,
755+ retry_info : None ,
756+ }
757+ }
758+
759+ fn failed_with_retry (
760+ validator_index : ValidatorIndex ,
761+ candidate_hash : CandidateHash ,
762+ retry_info : Option < RetryApprovalInfo > ,
763+ ) -> Self {
764+ Self {
765+ validator_index,
766+ candidate_hash,
767+ approval_outcome : ApprovalOutcome :: Failed ,
768+ retry_info,
769+ }
721770 }
722771}
723772
@@ -757,6 +806,7 @@ impl CurrentlyCheckingSet {
757806 candidate_hash,
758807 validator_index,
759808 approval_outcome : ApprovalOutcome :: TimedOut ,
809+ retry_info : None ,
760810 } ,
761811 Some ( approval_state) => approval_state,
762812 }
@@ -1271,25 +1321,63 @@ where
12711321 validator_index,
12721322 candidate_hash,
12731323 approval_outcome,
1324+ retry_info,
12741325 }
12751326 ) = approval_state;
12761327
12771328 if matches!( approval_outcome, ApprovalOutcome :: Approved ) {
12781329 let mut approvals: Vec <Action > = relay_block_hashes
1279- . into_iter ( )
1330+ . iter ( )
12801331 . map( |block_hash|
12811332 Action :: IssueApproval (
12821333 candidate_hash,
12831334 ApprovalVoteRequest {
12841335 validator_index,
1285- block_hash,
1336+ block_hash: * block_hash ,
12861337 } ,
12871338 )
12881339 )
12891340 . collect( ) ;
12901341 actions. append( & mut approvals) ;
12911342 }
12921343
1344+ if let Some ( retry_info) = retry_info {
1345+ for block_hash in relay_block_hashes {
1346+ if overlayed_db. load_block_entry( & block_hash) . map( |block_info| block_info. is_some( ) ) . unwrap_or( false ) {
1347+ let sender = to_other_subsystems. clone( ) ;
1348+ let spawn_handle = subsystem. spawner. clone( ) ;
1349+ let metrics = subsystem. metrics. clone( ) ;
1350+ let retry_info = retry_info. clone( ) ;
1351+ let executor_params = retry_info. executor_params. clone( ) ;
1352+ let candidate = retry_info. candidate. clone( ) ;
1353+
1354+ currently_checking_set
1355+ . insert_relay_block_hash(
1356+ candidate_hash,
1357+ validator_index,
1358+ block_hash,
1359+ async move {
1360+ launch_approval(
1361+ sender,
1362+ spawn_handle,
1363+ metrics,
1364+ retry_info. session_index,
1365+ candidate,
1366+ validator_index,
1367+ block_hash,
1368+ retry_info. backing_group,
1369+ executor_params,
1370+ retry_info. core_index,
1371+ retry_info,
1372+ )
1373+ . await
1374+ } ,
1375+ )
1376+ . await ?;
1377+ }
1378+ }
1379+ }
1380+
12931381 actions
12941382 } ,
12951383 ( block_hash, validator_index) = delayed_approvals_timers. select_next_some( ) => {
@@ -1340,6 +1428,8 @@ where
13401428 & mut approvals_cache,
13411429 & mut subsystem. mode ,
13421430 actions,
1431+ subsystem. max_approval_retries ,
1432+ subsystem. retry_backoff ,
13431433 )
13441434 . await ?
13451435 {
@@ -1389,6 +1479,8 @@ pub async fn start_approval_worker<
13891479 metrics,
13901480 clock,
13911481 spawner,
1482+ MAX_APPROVAL_RETRIES ,
1483+ APPROVAL_CHECKING_TIMEOUT / 2 ,
13921484 ) ;
13931485 let backend = DbBackend :: new ( db. clone ( ) , approval_voting. db_config ) ;
13941486 let spawner = approval_voting. spawner . clone ( ) ;
@@ -1456,6 +1548,8 @@ async fn handle_actions<
14561548 approvals_cache : & mut LruMap < CandidateHash , ApprovalOutcome > ,
14571549 mode : & mut Mode ,
14581550 actions : Vec < Action > ,
1551+ max_approval_retries : u32 ,
1552+ retry_backoff : Duration ,
14591553) -> SubsystemResult < bool > {
14601554 let mut conclude = false ;
14611555 let mut actions_iter = actions. into_iter ( ) ;
@@ -1542,6 +1636,16 @@ async fn handle_actions<
15421636 let sender = sender. clone ( ) ;
15431637 let spawn_handle = spawn_handle. clone ( ) ;
15441638
1639+ let retry = RetryApprovalInfo {
1640+ candidate : candidate. clone ( ) ,
1641+ backing_group,
1642+ executor_params : executor_params. clone ( ) ,
1643+ core_index,
1644+ session_index : session,
1645+ attempts_remaining : max_approval_retries,
1646+ backoff : retry_backoff,
1647+ } ;
1648+
15451649 currently_checking_set
15461650 . insert_relay_block_hash (
15471651 candidate_hash,
@@ -1559,6 +1663,7 @@ async fn handle_actions<
15591663 backing_group,
15601664 executor_params,
15611665 core_index,
1666+ retry,
15621667 )
15631668 . await
15641669 } ,
@@ -3329,6 +3434,7 @@ async fn launch_approval<
33293434 backing_group : GroupIndex ,
33303435 executor_params : ExecutorParams ,
33313436 core_index : Option < CoreIndex > ,
3437+ retry : RetryApprovalInfo ,
33323438) -> SubsystemResult < RemoteHandle < ApprovalState > > {
33333439 let ( a_tx, a_rx) = oneshot:: channel ( ) ;
33343440 let ( code_tx, code_rx) = oneshot:: channel ( ) ;
@@ -3360,6 +3466,7 @@ async fn launch_approval<
33603466
33613467 let candidate_hash = candidate. hash ( ) ;
33623468 let para_id = candidate. descriptor . para_id ( ) ;
3469+ let mut next_retry = None ;
33633470 gum:: trace!( target: LOG_TARGET , ?candidate_hash, ?para_id, "Recovering data." ) ;
33643471
33653472 let timer = metrics. time_recover_and_approve ( ) ;
@@ -3388,7 +3495,6 @@ async fn launch_approval<
33883495 let background = async move {
33893496 // Force the move of the timer into the background task.
33903497 let _timer = timer;
3391-
33923498 let available_data = match a_rx. await {
33933499 Err ( _) => return ApprovalState :: failed ( validator_index, candidate_hash) ,
33943500 Ok ( Ok ( a) ) => a,
@@ -3399,10 +3505,27 @@ async fn launch_approval<
33993505 target: LOG_TARGET ,
34003506 ?para_id,
34013507 ?candidate_hash,
3508+ attempts_remaining = retry. attempts_remaining,
34023509 "Data unavailable for candidate {:?}" ,
34033510 ( candidate_hash, candidate. descriptor. para_id( ) ) ,
34043511 ) ;
3405- // do nothing. we'll just be a no-show and that'll cause others to rise up.
3512+ // Availability could fail if we did not discover much of the network, so
3513+ // let's back off and order the subsystem to retry at a later point if the
3514+ // approval is still needed, because no-show wasn't covered yet.
3515+ if retry. attempts_remaining > 0 {
3516+ Delay :: new ( retry. backoff ) . await ;
3517+ next_retry = Some ( RetryApprovalInfo {
3518+ candidate,
3519+ backing_group,
3520+ executor_params,
3521+ core_index,
3522+ session_index,
3523+ attempts_remaining : retry. attempts_remaining - 1 ,
3524+ backoff : retry. backoff ,
3525+ } ) ;
3526+ } else {
3527+ next_retry = None ;
3528+ }
34063529 metrics_guard. take ( ) . on_approval_unavailable ( ) ;
34073530 } ,
34083531 & RecoveryError :: ChannelClosed => {
@@ -3433,7 +3556,7 @@ async fn launch_approval<
34333556 metrics_guard. take ( ) . on_approval_invalid ( ) ;
34343557 } ,
34353558 }
3436- return ApprovalState :: failed ( validator_index, candidate_hash)
3559+ return ApprovalState :: failed_with_retry ( validator_index, candidate_hash, next_retry )
34373560 } ,
34383561 } ;
34393562
0 commit comments