diff --git a/bridges/primitives/header-chain/src/justification/verification/mod.rs b/bridges/primitives/header-chain/src/justification/verification/mod.rs index 9941537eb0953..9d9f6a36b5e8b 100644 --- a/bridges/primitives/header-chain/src/justification/verification/mod.rs +++ b/bridges/primitives/header-chain/src/justification/verification/mod.rs @@ -306,7 +306,9 @@ trait JustificationVerifier { justification.round, context.authority_set_id, &mut signature_buffer, - ) { + ) + .is_valid() + { self.process_invalid_signature_vote(precommit_idx).map_err(Error::Precommit)?; continue } diff --git a/prdoc/pr_9015.prdoc b/prdoc/pr_9015.prdoc new file mode 100644 index 0000000000000..7aff98cf2b740 --- /dev/null +++ b/prdoc/pr_9015.prdoc @@ -0,0 +1,45 @@ +title: 'consensus/grandpa: Fix high number of peer disconnects with invalid justification' +doc: +- audience: [Node Dev, Node Operator] + description: |- + A grandpa race-casse has been identified in the versi-net stack around authority set changes, which leads to the following: + + - T0 / Node A: Completes round (15) + - T1 / Node A: Applies new authority set change and increments the SetID (from 0 to 1) + - T2 / Node B: Sends Precommit for round (15) with SetID (0) -- previous set ID + - T3 / Node B: Applies new authority set change and increments the SetID (1) + + In this scenario, Node B is not aware at the moment of sending justifications that the Set ID has changed. + The downstream effect is that Node A will not be able to verify the signature of justifications, since a different SetID is taken into account. This will cascade through the sync engine, where the Node B is wrongfully banned and disconnected. + + This PR aims to fix the edge-case by making the grandpa resilient to verifying prior setIDs for signatures. + When the signature of the grandpa justification fails to decode, the prior SetID is also verified. If the prior SetID produces a valid signature, then the outdated justification error is propagated through the code (ie `SignatureResult::OutdatedSet`). + + The sync engine will handle the outdated justifications as invalid, but without banning the peer. This leads to increased stability of the network during authority changes, which caused frequent disconnects to versi-net in the past. + + ### Review Notes + - Main changes that verify prior SetId on failures are placed in [check_message_signature_with_buffer](https://github.com/paritytech/polkadot-sdk/pull/9015/files#diff-359d7a46ea285177e5d86979f62f0f04baabf65d595c61bfe44b6fc01af70d89R458-R501) + - Sync engine no longer disconnects outdated justifications in [process_service_command](https://github.com/paritytech/polkadot-sdk/pull/9015/files#diff-9ab3391aa82ee2b2868ece610100f84502edcf40638dba9ed6953b6e572dfba5R678-R703) + + ### Testing Done + - Deployed the PR to versi-net with 40 validators + - Prior we have noticed 10/40 validators disconnecting every 15-20 minutes, leading to instability + - Over past 24h the issue has been mitigated: https://grafana.teleport.parity.io/goto/FPNWlmsHR?orgId=1 + - Note: bootnodes 0 and 1 are currently running outdated versions that do not incorporate this SetID verification improvement + + Part of: https://github.com/paritytech/polkadot-sdk/issues/8872 +crates: +- name: sp-consensus-grandpa + bump: minor +- name: bp-header-chain + bump: patch +- name: sc-consensus-grandpa + bump: patch +- name: sp-blockchain + bump: minor +- name: sp-consensus + bump: minor +- name: sc-consensus + bump: minor +- name: sc-network-sync + bump: patch diff --git a/substrate/client/consensus/common/src/import_queue.rs b/substrate/client/consensus/common/src/import_queue.rs index 602683907d482..be14d780b2e06 100644 --- a/substrate/client/consensus/common/src/import_queue.rs +++ b/substrate/client/consensus/common/src/import_queue.rs @@ -141,6 +141,19 @@ pub trait ImportQueue: Send { async fn run(self, link: &dyn Link); } +/// The result of importing a justification. +#[derive(Debug, PartialEq)] +pub enum JustificationImportResult { + /// Justification was imported successfully. + Success, + + /// Justification was not imported successfully. + Failure, + + /// Justification was not imported successfully, because it is outdated. + OutdatedJustification, +} + /// Hooks that the verification queue can use to influence the synchronization /// algorithm. pub trait Link: Send + Sync { @@ -159,7 +172,7 @@ pub trait Link: Send + Sync { _who: RuntimeOrigin, _hash: &B::Hash, _number: NumberFor, - _success: bool, + _import_result: JustificationImportResult, ) { } diff --git a/substrate/client/consensus/common/src/import_queue/basic_queue.rs b/substrate/client/consensus/common/src/import_queue/basic_queue.rs index 21270859dd75c..d8879731654c1 100644 --- a/substrate/client/consensus/common/src/import_queue/basic_queue.rs +++ b/substrate/client/consensus/common/src/import_queue/basic_queue.rs @@ -34,7 +34,8 @@ use crate::{ buffered_link::{self, BufferedLinkReceiver, BufferedLinkSender}, import_single_block_metered, verify_single_block_metered, BlockImportError, BlockImportStatus, BoxBlockImport, BoxJustificationImport, ImportQueue, ImportQueueService, - IncomingBlock, Link, RuntimeOrigin, SingleBlockVerificationOutcome, Verifier, LOG_TARGET, + IncomingBlock, JustificationImportResult, Link, RuntimeOrigin, + SingleBlockVerificationOutcome, Verifier, LOG_TARGET, }, metrics::Metrics, }; @@ -342,8 +343,9 @@ impl BlockImportWorker { ) { let started = std::time::Instant::now(); - let success = match self.justification_import.as_mut() { - Some(justification_import) => justification_import + let import_result = match self.justification_import.as_mut() { + Some(justification_import) => { + let result = justification_import .import_justification(hash, number, justification) .await .map_err(|e| { @@ -356,16 +358,22 @@ impl BlockImportWorker { e, ); e - }) - .is_ok(), - None => false, + }); + match result { + Ok(()) => JustificationImportResult::Success, + Err(sp_consensus::Error::OutdatedJustification) => + JustificationImportResult::OutdatedJustification, + Err(_) => JustificationImportResult::Failure, + } + }, + None => JustificationImportResult::Failure, }; if let Some(metrics) = self.metrics.as_ref() { metrics.justification_import_time.observe(started.elapsed().as_secs_f64()); } - self.result_sender.justification_imported(who, &hash, number, success); + self.result_sender.justification_imported(who, &hash, number, import_result); } } @@ -579,7 +587,7 @@ mod tests { _who: RuntimeOrigin, hash: &Hash, _number: BlockNumber, - _success: bool, + _import_result: JustificationImportResult, ) { self.events.lock().push(Event::JustificationImported(*hash)) } diff --git a/substrate/client/consensus/common/src/import_queue/buffered_link.rs b/substrate/client/consensus/common/src/import_queue/buffered_link.rs index 67131b06a32e5..19b0668da24dc 100644 --- a/substrate/client/consensus/common/src/import_queue/buffered_link.rs +++ b/substrate/client/consensus/common/src/import_queue/buffered_link.rs @@ -38,7 +38,7 @@ //! }); //! ``` -use crate::import_queue::{Link, RuntimeOrigin}; +use crate::import_queue::{JustificationImportResult, Link, RuntimeOrigin}; use futures::prelude::*; use sc_utils::mpsc::{tracing_unbounded, TracingUnboundedReceiver, TracingUnboundedSender}; use sp_runtime::traits::{Block as BlockT, NumberFor}; @@ -84,7 +84,7 @@ impl Clone for BufferedLinkSender { /// Internal buffered message. pub enum BlockImportWorkerMsg { BlocksProcessed(usize, usize, Vec<(BlockImportResult, B::Hash)>), - JustificationImported(RuntimeOrigin, B::Hash, NumberFor, bool), + JustificationImported(RuntimeOrigin, B::Hash, NumberFor, JustificationImportResult), RequestJustification(B::Hash, NumberFor), } @@ -105,9 +105,9 @@ impl Link for BufferedLinkSender { who: RuntimeOrigin, hash: &B::Hash, number: NumberFor, - success: bool, + import_result: JustificationImportResult, ) { - let msg = BlockImportWorkerMsg::JustificationImported(who, *hash, number, success); + let msg = BlockImportWorkerMsg::JustificationImported(who, *hash, number, import_result); let _ = self.tx.unbounded_send(msg); } @@ -129,8 +129,8 @@ impl BufferedLinkReceiver { match msg { BlockImportWorkerMsg::BlocksProcessed(imported, count, results) => link.blocks_processed(imported, count, results), - BlockImportWorkerMsg::JustificationImported(who, hash, number, success) => - link.justification_imported(who, &hash, number, success), + BlockImportWorkerMsg::JustificationImported(who, hash, number, import_result) => + link.justification_imported(who, &hash, number, import_result), BlockImportWorkerMsg::RequestJustification(hash, number) => link.request_justification(&hash, number), } diff --git a/substrate/client/consensus/common/src/lib.rs b/substrate/client/consensus/common/src/lib.rs index 6bf1ed0b48b4d..070e663451ffb 100644 --- a/substrate/client/consensus/common/src/lib.rs +++ b/substrate/client/consensus/common/src/lib.rs @@ -29,7 +29,8 @@ pub use block_import::{ }; pub use import_queue::{ import_single_block, BasicQueue, BlockImportError, BlockImportStatus, BoxBlockImport, - BoxJustificationImport, DefaultImportQueue, ImportQueue, IncomingBlock, Link, Verifier, + BoxJustificationImport, DefaultImportQueue, ImportQueue, IncomingBlock, + JustificationImportResult, Link, Verifier, }; mod longest_chain; diff --git a/substrate/client/consensus/grandpa/src/communication/gossip.rs b/substrate/client/consensus/grandpa/src/communication/gossip.rs index fc1a35df49f57..0ff52b085cdf9 100644 --- a/substrate/client/consensus/grandpa/src/communication/gossip.rs +++ b/substrate/client/consensus/grandpa/src/communication/gossip.rs @@ -927,7 +927,9 @@ impl Inner { &full.message.signature, full.round.0, full.set_id.0, - ) { + ) + .is_valid() + { debug!(target: LOG_TARGET, "Bad message signature {}", full.message.id); telemetry!( self.config.telemetry; diff --git a/substrate/client/consensus/grandpa/src/communication/mod.rs b/substrate/client/consensus/grandpa/src/communication/mod.rs index 336a2a939a688..d1a1c158ba473 100644 --- a/substrate/client/consensus/grandpa/src/communication/mod.rs +++ b/substrate/client/consensus/grandpa/src/communication/mod.rs @@ -865,7 +865,9 @@ fn check_compact_commit( round.0, set_id.0, &mut buf, - ) { + ) + .is_valid() + { debug!(target: LOG_TARGET, "Bad commit message signature {}", id); telemetry!( telemetry; @@ -952,7 +954,9 @@ fn check_catch_up( if !sp_consensus_grandpa::check_message_signature_with_buffer( &msg, id, sig, round, set_id, buf, - ) { + ) + .is_valid() + { debug!(target: LOG_TARGET, "Bad catch up message signature {}", id); telemetry!( telemetry; diff --git a/substrate/client/consensus/grandpa/src/import.rs b/substrate/client/consensus/grandpa/src/import.rs index 5cec5204c7396..e2343409f3ea2 100644 --- a/substrate/client/consensus/grandpa/src/import.rs +++ b/substrate/client/consensus/grandpa/src/import.rs @@ -795,7 +795,13 @@ where ); let justification = match justification { - Err(e) => return Err(ConsensusError::ClientImport(e.to_string())), + Err(e) => { + return match e { + sp_blockchain::Error::OutdatedJustification => + Err(ConsensusError::OutdatedJustification), + _ => Err(ConsensusError::ClientImport(e.to_string())), + }; + }, Ok(justification) => justification, }; diff --git a/substrate/client/consensus/grandpa/src/justification.rs b/substrate/client/consensus/grandpa/src/justification.rs index 934c0d695fdab..d46fe5935c59c 100644 --- a/substrate/client/consensus/grandpa/src/justification.rs +++ b/substrate/client/consensus/grandpa/src/justification.rs @@ -205,17 +205,22 @@ impl GrandpaJustification { let mut buf = Vec::new(); let mut visited_hashes = HashSet::new(); for signed in self.justification.commit.precommits.iter() { - if !sp_consensus_grandpa::check_message_signature_with_buffer( + let signature_result = sp_consensus_grandpa::check_message_signature_with_buffer( &finality_grandpa::Message::Precommit(signed.precommit.clone()), &signed.id, &signed.signature, self.justification.round, set_id, &mut buf, - ) { - return Err(ClientError::BadJustification( - "invalid signature for precommit in grandpa justification".to_string(), - )) + ); + match signature_result { + sp_consensus_grandpa::SignatureResult::Invalid => + return Err(ClientError::BadJustification( + "invalid signature for precommit in grandpa justification".to_string(), + )), + sp_consensus_grandpa::SignatureResult::OutdatedSet => + return Err(ClientError::OutdatedJustification), + sp_consensus_grandpa::SignatureResult::Valid => {}, } if base_hash == signed.precommit.target_hash { diff --git a/substrate/client/network/sync/src/engine.rs b/substrate/client/network/sync/src/engine.rs index 3ae8c465e1b49..77abb56a4affd 100644 --- a/substrate/client/network/sync/src/engine.rs +++ b/substrate/client/network/sync/src/engine.rs @@ -670,17 +670,36 @@ where ToServiceCommand::BlocksProcessed(imported, count, results) => { self.strategy.on_blocks_processed(imported, count, results); }, - ToServiceCommand::JustificationImported(peer_id, hash, number, success) => { + ToServiceCommand::JustificationImported(peer_id, hash, number, import_result) => { + let success = + matches!(import_result, sc_consensus::JustificationImportResult::Success); self.strategy.on_justification_import(hash, number, success); - if !success { - log::info!( - target: LOG_TARGET, - "💔 Invalid justification provided by {peer_id} for #{hash}", - ); - self.network_service - .disconnect_peer(peer_id, self.block_announce_protocol_name.clone()); - self.network_service - .report_peer(peer_id, ReputationChange::new_fatal("Invalid justification")); + + match import_result { + sc_consensus::JustificationImportResult::OutdatedJustification => { + log::info!( + target: LOG_TARGET, + "💔 Outdated justification provided by {peer_id} for #{hash}", + ); + }, + sc_consensus::JustificationImportResult::Failure => { + log::info!( + target: LOG_TARGET, + "💔 Invalid justification provided by {peer_id} for #{hash}", + ); + self.network_service + .disconnect_peer(peer_id, self.block_announce_protocol_name.clone()); + self.network_service.report_peer( + peer_id, + ReputationChange::new_fatal("Invalid justification"), + ); + }, + sc_consensus::JustificationImportResult::Success => { + log::debug!( + target: LOG_TARGET, + "Justification for block #{hash} ({number}) imported from {peer_id} successfully", + ); + }, } }, ToServiceCommand::AnnounceBlock(hash, data) => self.announce_block(hash, data), diff --git a/substrate/client/network/sync/src/service/mock.rs b/substrate/client/network/sync/src/service/mock.rs index 300aa076515f8..1974e70227e3d 100644 --- a/substrate/client/network/sync/src/service/mock.rs +++ b/substrate/client/network/sync/src/service/mock.rs @@ -55,7 +55,7 @@ mockall::mock! { who: PeerId, hash: &B::Hash, number: NumberFor, - success: bool, + import_result: sc_consensus::JustificationImportResult, ); fn request_justification(&self, hash: &B::Hash, number: NumberFor); } diff --git a/substrate/client/network/sync/src/service/syncing_service.rs b/substrate/client/network/sync/src/service/syncing_service.rs index b56af2b9976a1..7c0c0b428ddb0 100644 --- a/substrate/client/network/sync/src/service/syncing_service.rs +++ b/substrate/client/network/sync/src/service/syncing_service.rs @@ -21,7 +21,9 @@ use crate::types::{ExtendedPeerInfo, SyncEvent, SyncEventStream, SyncStatus, Syn use futures::{channel::oneshot, Stream}; use sc_network_types::PeerId; -use sc_consensus::{BlockImportError, BlockImportStatus, JustificationSyncLink, Link}; +use sc_consensus::{ + BlockImportError, BlockImportStatus, JustificationImportResult, JustificationSyncLink, Link, +}; use sc_network::{NetworkBlock, NetworkSyncForkRequest}; use sc_utils::mpsc::{tracing_unbounded, TracingUnboundedSender}; use sp_runtime::traits::{Block as BlockT, NumberFor}; @@ -44,7 +46,7 @@ pub enum ToServiceCommand { usize, Vec<(Result>, BlockImportError>, B::Hash)>, ), - JustificationImported(PeerId, B::Hash, NumberFor, bool), + JustificationImported(PeerId, B::Hash, NumberFor, JustificationImportResult), AnnounceBlock(B::Hash, Option>), NewBestBlockImported(B::Hash, NumberFor), EventStream(TracingUnboundedSender), @@ -192,11 +194,14 @@ impl Link for SyncingService { who: PeerId, hash: &B::Hash, number: NumberFor, - success: bool, + import_result: JustificationImportResult, ) { - let _ = self - .tx - .unbounded_send(ToServiceCommand::JustificationImported(who, *hash, number, success)); + let _ = self.tx.unbounded_send(ToServiceCommand::JustificationImported( + who, + *hash, + number, + import_result, + )); } fn request_justification(&self, hash: &B::Hash, number: NumberFor) { diff --git a/substrate/primitives/blockchain/src/error.rs b/substrate/primitives/blockchain/src/error.rs index e8ac148d7511e..d1cf0aca2f350 100644 --- a/substrate/primitives/blockchain/src/error.rs +++ b/substrate/primitives/blockchain/src/error.rs @@ -102,6 +102,9 @@ pub enum Error { #[error("bad justification for header: {0}")] BadJustification(String), + #[error("outdated justification")] + OutdatedJustification, + #[error("This method is not currently available when running in light client mode")] NotAvailableOnLightClient, diff --git a/substrate/primitives/consensus/common/src/error.rs b/substrate/primitives/consensus/common/src/error.rs index fb8d0447fe3d6..5fa8984544a89 100644 --- a/substrate/primitives/consensus/common/src/error.rs +++ b/substrate/primitives/consensus/common/src/error.rs @@ -41,6 +41,9 @@ pub enum Error { /// Justification requirements not met. #[error("Invalid justification")] InvalidJustification, + /// The justification provided is outdated and corresponds to a previous set. + #[error("Import failed with outdated justification")] + OutdatedJustification, /// Error from the client while importing. #[error("Import failed: {0}")] ClientImport(String), diff --git a/substrate/primitives/consensus/grandpa/src/lib.rs b/substrate/primitives/consensus/grandpa/src/lib.rs index 3aa4dc2b4cfb3..6c973964cfdee 100644 --- a/substrate/primitives/consensus/grandpa/src/lib.rs +++ b/substrate/primitives/consensus/grandpa/src/lib.rs @@ -368,7 +368,8 @@ where &$equivocation.first.1, $equivocation.round_number, report.set_id, - ); + ) + .is_valid(); let valid_second = check_message_signature( &$message($equivocation.second.0), @@ -376,7 +377,8 @@ where &$equivocation.second.1, $equivocation.round_number, report.set_id, - ); + ) + .is_valid(); return valid_first && valid_second }; @@ -412,6 +414,27 @@ pub fn localized_payload_with_buffer( (message, round, set_id).encode_to(buf) } +/// Result of checking a message signature. +#[derive(Clone, Encode, Decode, PartialEq, Eq)] +#[cfg_attr(feature = "std", derive(Debug))] +pub enum SignatureResult { + /// Valid signature. + Valid, + + /// Invalid signature. + Invalid, + + /// Valid signature, but the message was signed in the previous set. + OutdatedSet, +} + +impl SignatureResult { + /// Returns `true` if the signature is valid. + pub fn is_valid(&self) -> bool { + matches!(self, SignatureResult::Valid) + } +} + /// Check a message signature by encoding the message as a localized payload and /// verifying the provided signature using the expected authority id. pub fn check_message_signature( @@ -420,7 +443,7 @@ pub fn check_message_signature( signature: &AuthoritySignature, round: RoundNumber, set_id: SetId, -) -> bool +) -> SignatureResult where H: Encode, N: Encode, @@ -439,7 +462,7 @@ pub fn check_message_signature_with_buffer( round: RoundNumber, set_id: SetId, buf: &mut Vec, -) -> bool +) -> SignatureResult where H: Encode, N: Encode, @@ -448,15 +471,34 @@ where localized_payload_with_buffer(round, set_id, message, buf); - let valid = id.verify(&buf, signature); + if id.verify(&buf, signature) { + return SignatureResult::Valid; + } - if !valid { - let log_target = if cfg!(feature = "std") { CLIENT_LOG_TARGET } else { RUNTIME_LOG_TARGET }; + let log_target = if cfg!(feature = "std") { CLIENT_LOG_TARGET } else { RUNTIME_LOG_TARGET }; + log::debug!( + target: log_target, + "Bad signature on message from id={id:?} round={round:?} set_id={set_id:?}", + ); - log::debug!(target: log_target, "Bad signature on message from {:?}", id); + // Check if the signature is valid in the previous set. + if set_id == 0 { + return SignatureResult::Invalid; } - valid + let prev_set_id = set_id - 1; + localized_payload_with_buffer(round, prev_set_id, message, buf); + let valid = id.verify(&buf, signature); + log::debug!( + target: log_target, + "Previous set signature check for id={id:?} round={round:?} previous_set={prev_set_id:?} valid={valid:?}" + ); + + if valid { + SignatureResult::OutdatedSet + } else { + SignatureResult::Invalid + } } /// Localizes the message to the given set and round and signs the payload.