Skip to content

Commit b9c792a

Browse files
alexgghordian
and authored
Add more debug logs to understand if statement-distribution misbehaving (#3419)
Add more debug logs to understand if statement-distribution is in a bad state; this should be useful for debugging #3314 on production networks. Additionally, increase the number of parallel requests a node should make, since I noticed that requests take around 100ms on Kusama, and the limit of 5 parallel requests was picked mostly at random — there is no reason why we can't do more than that. --------- Signed-off-by: Alexandru Gheorghe <[email protected]> Co-authored-by: ordian <[email protected]>
1 parent ce5de99 commit b9c792a

3 files changed

Lines changed: 53 additions & 3 deletions

File tree

polkadot/node/network/statement-distribution/src/v2/cluster.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,9 @@
5555
//! and to keep track of what we have sent to other validators in the group and what we may
5656
//! continue to send them.
5757
58-
use polkadot_primitives::{CandidateHash, CompactStatement, ValidatorIndex};
58+
use polkadot_primitives::{CandidateHash, CompactStatement, Hash, ValidatorIndex};
5959

60+
use crate::LOG_TARGET;
6061
use std::collections::{HashMap, HashSet};
6162

6263
#[derive(Hash, PartialEq, Eq)]
@@ -424,6 +425,28 @@ impl ClusterTracker {
424425
fn is_in_group(&self, validator: ValidatorIndex) -> bool {
425426
self.validators.contains(&validator)
426427
}
428+
429+
/// Dumps pending statement for this cluster.
430+
///
431+
/// Normally we should not have pending statements to validators in our cluster,
432+
/// but if we do for all validators in our cluster, then we don't participate
433+
/// in backing. Occasional pending statements are expected if two authorities
434+
/// can't detect each other or after restart, where it takes a while to discover
435+
/// the whole network.
436+
437+
pub fn warn_if_too_many_pending_statements(&self, parent_hash: Hash) {
438+
if self.pending.iter().filter(|pending| !pending.1.is_empty()).count() >=
439+
self.validators.len()
440+
{
441+
gum::warn!(
442+
target: LOG_TARGET,
443+
pending_statements = ?self.pending,
444+
?parent_hash,
445+
"Cluster has too many pending statements, something wrong with our connection to our group peers \n
446+
Restart might be needed if validator gets 0 backing rewards for more than 3-4 consecutive sessions"
447+
);
448+
}
449+
}
427450
}
428451

429452
/// Incoming statement was accepted.

polkadot/node/network/statement-distribution/src/v2/mod.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,13 @@ impl PerSessionState {
251251
if local_index.is_some() {
252252
self.local_validator.get_or_insert(LocalValidatorIndex::Inactive);
253253
}
254+
255+
gum::info!(
256+
target: LOG_TARGET,
257+
index_in_gossip_topology = ?local_index,
258+
index_in_parachain_authorities = ?self.local_validator,
259+
"Node uses the following topology indices"
260+
);
254261
}
255262

256263
/// Returns `true` if local is neither active or inactive validator node.
@@ -768,7 +775,15 @@ pub(crate) fn handle_deactivate_leaves(state: &mut State, leaves: &[Hash]) {
768775
let pruned = state.implicit_view.deactivate_leaf(*leaf);
769776
for pruned_rp in pruned {
770777
// clean up per-relay-parent data based on everything removed.
771-
state.per_relay_parent.remove(&pruned_rp);
778+
state
779+
.per_relay_parent
780+
.remove(&pruned_rp)
781+
.as_ref()
782+
.and_then(|pruned| pruned.active_validator_state())
783+
.map(|active_state| {
784+
active_state.cluster_tracker.warn_if_too_many_pending_statements(pruned_rp)
785+
});
786+
772787
// clean up requests related to this relay parent.
773788
state.request_manager.remove_by_relay_parent(*leaf);
774789
}

polkadot/node/network/statement-distribution/src/v2/requests.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -315,7 +315,16 @@ impl RequestManager {
315315
request_props: impl Fn(&CandidateIdentifier) -> Option<RequestProperties>,
316316
peer_advertised: impl Fn(&CandidateIdentifier, &PeerId) -> Option<StatementFilter>,
317317
) -> Option<OutgoingRequest<AttestedCandidateRequest>> {
318-
if response_manager.len() >= MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
318+
// The number of parallel requests a node can answer is limited by
319+
// `MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS`, however there is no
320+
// need for the current node to limit itself to the same amount of
321+
// requests, because the requests are going to different nodes anyways.
322+
// While looking at https://github.com/paritytech/polkadot-sdk/issues/3314,
323+
// found out that these requests take around 100ms to fulfill, so it
324+
// would make sense to try to request things as early as we can, given
325+
// we would need to request it for each candidate, around 25 right now
326+
// on kusama.
327+
if response_manager.len() >= 2 * MAX_PARALLEL_ATTESTED_CANDIDATE_REQUESTS as usize {
319328
return None
320329
}
321330

@@ -1027,6 +1036,7 @@ mod tests {
10271036
let peer_advertised = |_identifier: &CandidateIdentifier, _peer: &_| {
10281037
Some(StatementFilter::full(group_size))
10291038
};
1039+
10301040
let outgoing = request_manager
10311041
.next_request(&mut response_manager, request_props, peer_advertised)
10321042
.unwrap();
@@ -1148,6 +1158,7 @@ mod tests {
11481158
{
11491159
let request_props =
11501160
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());
1161+
11511162
let outgoing = request_manager
11521163
.next_request(&mut response_manager, request_props, peer_advertised)
11531164
.unwrap();
@@ -1230,6 +1241,7 @@ mod tests {
12301241
{
12311242
let request_props =
12321243
|_identifier: &CandidateIdentifier| Some((&request_properties).clone());
1244+
12331245
let outgoing = request_manager
12341246
.next_request(&mut response_manager, request_props, peer_advertised)
12351247
.unwrap();

0 commit comments

Comments
 (0)