Skip to content

Commit b9c792a

Browse files
alexgghordian
and authored
Add more debug logs to understand if statement-distribution misbehaving (#3419)
Add more debug logs to understand if statement-distribution is in a bad state; this should be useful for debugging #3314 on production networks. Additionally, increase the number of parallel requests a node should make, since I noticed that requests take around 100ms on Kusama, and the limit of 5 parallel requests was picked mostly at random — there is no reason why we can't do more than that. --------- Signed-off-by: Alexandru Gheorghe <[email protected]> Co-authored-by: ordian <[email protected]>
1 parent ce5de99 commit b9c792a

3 files changed

Lines changed: 53 additions & 3 deletions

File tree

polkadot/node/network/statement-distribution/src/v2/cluster.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@
5555
//! and to keep track of what we have sent to other validators in the group and what we may
5656
//! continue to send them.
5757
58-
use polkadot_primitives::{CandidateHash, CompactStatement, ValidatorIndex};
58+
use polkadot_primitives::{CandidateHash, CompactStatement, Hash, ValidatorIndex};
5959

60+
use crate::LOG_TARGET;
6061
use std::collections::{HashMap, HashSet};
6162

6263
#[derive(Hash, PartialEq, Eq)]
@@ -424,6 +425,28 @@ impl ClusterTracker {
424425
fn is_in_group(&self, validator: ValidatorIndex) -> bool {
425426
self.validators.contains(&validator)
426427
}
428+
429+
/// Dumps pending statement for this cluster.
430+
///
431+
/// Normally we should not have pending statements to validators in our cluster,
432+
/// but if we do for all validators in our cluster, then we don't participate
433+
/// in backing. Occasional pending statements are expected if two authorities
434+
/// can't detect each other or after restart, where it takes a while to discover
435+
/// the whole network.
436+
437+
pub fn warn_if_too_many_pending_statements(&self, parent_hash: Hash) {
438+
if self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >=
439+
self.validators.len()
440+
{
441+
gum::warn!(
442+
target: LOG_TARGET,
443+
pending_statements = ?self.pending,
444+
?parent_hash,
445+
"Cluster has too many pending statements, something wrong with our connection to our group peers \n
446+
Restart might be needed if validator gets 0 backing rewards for more than 3-4 consecutive sessions"
447+
);
448+
}
449+
}
427450
}
428451

429452
/// Incoming statement was accepted.

polkadot/node/network/statement-distribution/src/v2/mod.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,13 @@ impl PerSessionState {
251251
if local_index.is_some() {
252252
self.local_validator.get_or_insert(LocalValidatorIndex::Inactive);
253253
}
254+
255+
gum::info!(
256+
target: LOG_TARGET,
257+
index_in_gossip_topology = ?local_index,
258+
index_in_parachain_authorities = ?self.local_validator,
259+
"Node uses the following topology indices"
260+
);
254261
}
255262

256263
/// Returns `true` if local is neither active or inactive validator node.
@@ -768,7 +775,15 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) {
768775
let pruned = state.implicit_view.deactivate_leaf(*leaf);
769776
for pruned_rp in pruned {
770777
// clean up per-relay-parent data based on everything removed.
771-
state.per_relay_parent.remove(&pruned_rp);
778+
state
779+
.per_relay_parent
780+
.remove(&pruned_rp)
781+
.as_ref()
782+
.and_then(|pruned| pruned.active_validator_state())
783+
.map(|active_state| {
784+
active_state.cluster_tracker.warn_if_too_many_pending_statements(pruned_rp)
785+
});
786+
772787
// clean up requests related to this relay parent.
773788
state.request_manager.remove_by_relay_parent(*leaf);
774789
}

polkadot/node/network/statement-distribution/src/v2/requests.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,16 @@ impl RequestManager {
315315
request_props: impl Fn(&CandidateIdentifier) -> Option<RequestProperties>,
316316
peer_advertised: impl Fn(&CandidateIdentifier, &PeerId) -> Option<StatementFilter>,
317317
) -> Option<OutgoingRequest<AttestedCandidateRequest>> {
318-
if response_manager.len() >= MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
318+
// The number of parallel requests a node can answer is limited by
319+
// `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS`, however there is no
320+
// need for the current node to limit itself to the same amount of
321+
// requests, because the requests are going to different nodes anyways.
322+
// While looking at https://github.com/paritytech/polkadot-sdk/issues/3314,
323+
// found out that these requests take around 100ms to fulfill, so it
324+
// would make sense to try to request things as early as we can, given
325+
// we would need to request it for each candidate, around 25 right now
326+
// on kusama.
327+
if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
319328
return None
320329
}
321330

@@ -1027,6 +1036,7 @@ mod tests {
10271036
let peer_advertised = |_identifier: &CandidateIdentifier, _peer: &_| {
10281037
Some(StatementFilter::full(group_size))
10291038
};
1039+
10301040
let outgoing = request_manager
10311041
.next_request(&mut response_manager, request_props, peer_advertised)
10321042
.unwrap();
@@ -1148,6 +1158,7 @@ mod tests {
11481158
{
11491159
let request_props =
11501160
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());
1161+
11511162
let outgoing = request_manager
11521163
.next_request(&mut response_manager, request_props, peer_advertised)
11531164
.unwrap();
@@ -1230,6 +1241,7 @@ mod tests {
12301241
{
12311242
let request_props =
12321243
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());
1244+
12331245
let outgoing = request_manager
12341246
.next_request(&mut response_manager, request_props, peer_advertised)
12351247
.unwrap();

0 commit comments

Comments
 (0)