Skip to content

Commit fc906d5

Browse files
authored
fix av-distribution Jaeger spans mem leak (#5321)
Fixes #5258
1 parent 1f49358 commit fc906d5

File tree

4 files changed

+24
-11
lines changed

4 files changed

+24
-11
lines changed

polkadot/node/network/availability-distribution/src/lib.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ use polkadot_node_subsystem::{
2525
jaeger, messages::AvailabilityDistributionMessage, overseer, FromOrchestra, OverseerSignal,
2626
SpawnedSubsystem, SubsystemError,
2727
};
28-
use polkadot_primitives::Hash;
28+
use polkadot_primitives::{BlockNumber, Hash};
2929
use std::collections::HashMap;
3030

3131
/// Error and [`Result`] type for this subsystem.
@@ -104,7 +104,7 @@ impl AvailabilityDistributionSubsystem {
104104
/// Start processing work as passed on from the Overseer.
105105
async fn run<Context>(self, mut ctx: Context) -> std::result::Result<(), FatalError> {
106106
let Self { mut runtime, recvs, metrics, req_protocol_names } = self;
107-
let mut spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
107+
let mut spans: HashMap<Hash, (BlockNumber, jaeger::PerLeafSpan)> = HashMap::new();
108108

109109
let IncomingRequestReceivers {
110110
pov_req_receiver,
@@ -162,7 +162,7 @@ impl AvailabilityDistributionSubsystem {
162162
};
163163
let span =
164164
jaeger::PerLeafSpan::new(cloned_leaf.span, "availability-distribution");
165-
spans.insert(cloned_leaf.hash, span);
165+
spans.insert(cloned_leaf.hash, (cloned_leaf.number, span));
166166
log_error(
167167
requester
168168
.get_mut()
@@ -172,8 +172,8 @@ impl AvailabilityDistributionSubsystem {
172172
&mut warn_freq,
173173
)?;
174174
},
175-
FromOrchestra::Signal(OverseerSignal::BlockFinalized(hash, _)) => {
176-
spans.remove(&hash);
175+
FromOrchestra::Signal(OverseerSignal::BlockFinalized(_hash, finalized_number)) => {
176+
spans.retain(|_hash, (block_number, _span)| *block_number > finalized_number);
177177
},
178178
FromOrchestra::Signal(OverseerSignal::Conclude) => return Ok(()),
179179
FromOrchestra::Communication {
@@ -189,7 +189,7 @@ impl AvailabilityDistributionSubsystem {
189189
} => {
190190
let span = spans
191191
.get(&relay_parent)
192-
.map(|span| span.child("fetch-pov"))
192+
.map(|(_, span)| span.child("fetch-pov"))
193193
.unwrap_or_else(|| jaeger::Span::new(&relay_parent, "fetch-pov"))
194194
.with_trace_id(candidate_hash)
195195
.with_candidate(candidate_hash)

polkadot/node/network/availability-distribution/src/requester/mod.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ use polkadot_node_subsystem_util::{
3939
availability_chunks::availability_chunk_index,
4040
runtime::{get_occupied_cores, RuntimeInfo},
4141
};
42-
use polkadot_primitives::{CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex};
42+
use polkadot_primitives::{
43+
BlockNumber, CandidateHash, CoreIndex, Hash, OccupiedCore, SessionIndex,
44+
};
4345

4446
use super::{FatalError, Metrics, Result, LOG_TARGET};
4547

@@ -112,14 +114,14 @@ impl Requester {
112114
ctx: &mut Context,
113115
runtime: &mut RuntimeInfo,
114116
update: ActiveLeavesUpdate,
115-
spans: &HashMap<Hash, jaeger::PerLeafSpan>,
117+
spans: &HashMap<Hash, (BlockNumber, jaeger::PerLeafSpan)>,
116118
) -> Result<()> {
117119
gum::trace!(target: LOG_TARGET, ?update, "Update fetching heads");
118120
let ActiveLeavesUpdate { activated, deactivated } = update;
119121
if let Some(leaf) = activated {
120122
let span = spans
121123
.get(&leaf.hash)
122-
.map(|span| span.child("update-fetching-heads"))
124+
.map(|(_, span)| span.child("update-fetching-heads"))
123125
.unwrap_or_else(|| jaeger::Span::new(&leaf.hash, "update-fetching-heads"))
124126
.with_string_tag("leaf", format!("{:?}", leaf.hash))
125127
.with_stage(jaeger::Stage::AvailabilityDistribution);

polkadot/node/network/availability-distribution/src/requester/tests.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -208,7 +208,7 @@ fn check_ancestry_lookup_in_same_session() {
208208

209209
test_harness(test_state.clone(), |mut ctx| async move {
210210
let chain = &test_state.relay_chain;
211-
let spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
211+
let spans: HashMap<Hash, (u32, jaeger::PerLeafSpan)> = HashMap::new();
212212
let block_number = 1;
213213
let update = ActiveLeavesUpdate {
214214
activated: Some(new_leaf(chain[block_number], block_number as u32)),
@@ -281,7 +281,7 @@ fn check_ancestry_lookup_in_different_sessions() {
281281

282282
test_harness(test_state.clone(), |mut ctx| async move {
283283
let chain = &test_state.relay_chain;
284-
let spans: HashMap<Hash, jaeger::PerLeafSpan> = HashMap::new();
284+
let spans: HashMap<Hash, (u32, jaeger::PerLeafSpan)> = HashMap::new();
285285
let block_number = 3;
286286
let update = ActiveLeavesUpdate {
287287
activated: Some(new_leaf(chain[block_number], block_number as u32)),

prdoc/pr_5321.prdoc

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
title: fix availability-distribution Jaeger spans memory leak
2+
3+
doc:
4+
- audience: Node Dev
5+
description: |
6+
Fixes a memory leak in which the Jaeger span storage in availability-distribution was never pruned and therefore grew indefinitely.
7+
This was caused by improper handling of finalized heads. More info in https://github.com/paritytech/polkadot-sdk/issues/5258
8+
9+
crates:
10+
- name: polkadot-availability-distribution
11+
bump: patch

0 commit comments

Comments
 (0)