Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
46eaa28
service/builder: Expose metrics to the RPC layers
lexnv Apr 25, 2025
89fe6ae
cargo: Add prometheus endpoint to the RPC layers
lexnv Apr 25, 2025
c56660a
tx: Initialize a subset of counter metrics
lexnv Apr 25, 2025
830b95d
tx: Increment metrics as counter vec
lexnv Apr 25, 2025
bf88c82
tx: Propagate transition times as histogram
lexnv Apr 25, 2025
bf4a235
tx: Replace unvalidated initial state with submitted
lexnv Apr 25, 2025
a009f37
tx: Introduce a metrics module
lexnv Apr 25, 2025
5b558a0
tx/metrics: Increment the status counter on internal advancement
lexnv Apr 25, 2025
6ab0c8c
tx/metrics: Simplify code by relying on the transaction event directly
lexnv Apr 25, 2025
2cc6fd1
tx/metrics: Provide clean API for metric control
lexnv Apr 25, 2025
a07ca93
tx: Propagate metrics on error
lexnv Apr 25, 2025
260e601
tx/metrics: Adjust internal labels
lexnv Apr 25, 2025
b0510d3
tx/metrics: Elapsed time since start to finalized
lexnv Apr 28, 2025
fe971b7
tx/event: Add wrapper for event state transitioning into final states
lexnv Apr 28, 2025
04d24c2
tx/metrics: Propagate start to final metrics as well
lexnv Apr 28, 2025
9b0febd
tx/tests: Adjust testing to the new interface
lexnv Apr 28, 2025
ca78079
tx: Register the rpc-v2 metrics only once
lexnv Apr 28, 2025
4a8b0c0
tx/metrics: Simplify reported metrics and code
lexnv Apr 28, 2025
96b0cbd
tx/metrics: Remove the counter since it can be deduced from the histogram
lexnv Apr 28, 2025
5540857
Merge branch 'master' into lexnv/tx-metrics
lexnv Apr 28, 2025
8c2c9a4
tx/metrics: Fix unused imports
lexnv Apr 28, 2025
7c44dc1
tx/metrics: Fix docs references
lexnv Apr 28, 2025
1b21650
tx/metrics: Simplify labels since they are not used externally
lexnv Apr 29, 2025
516d2b3
tx/metrics: Replace HistogramVec with individual Histograms for granular control
lexnv Apr 29, 2025
d45d520
tx/metrics: Add proper elapsed time
lexnv Apr 29, 2025
8ad08b8
tx/metrics: Add unit of seconds to metric description
lexnv May 23, 2025
c975366
tx/metrics: Apply feedback
lexnv May 23, 2025
6ac204d
Update from github-actions[bot] running command 'prdoc --audience nod…
github-actions[bot] May 25, 2025
793825f
cargo: Sort deps in alphabetical order
lexnv May 26, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions prdoc/pr_8345.prdoc
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
title: 'tx/metrics: Add metrics for the RPC v2 `transactionWatch_v1_submitAndWatch`'
doc:
- audience: Node Operator
description: |-
This PR adds metrics for the following RPC subscription: [transactionWatch_v1_submitAndWatch](https://paritytech.github.io/json-rpc-interface-spec/api/transactionWatch_v1_submitAndWatch.html)

Metrics are exposed in two ways:
- simple counters of how many events we've seen globally
- a histogram vector of execution times, which is labeled by `initial event` -> `final event`
- This helps us identify how long it takes the transaction pool to advance the state of the events, and further debug issues

Part of: https://github.com/paritytech/polkadot-sdk/issues/8336

### (outdated) PoC Dashboards

![Screenshot 2025-04-28 at 17 50 48](https://github.com/user-attachments/assets/9fd0bf30-a321-4362-a10b-dfc3de1eb474)


### Next steps
- [x] initial dashboards with a live node
- [x] adjust testing
crates:
- name: sc-service
bump: major
- name: sc-rpc-spec-v2
bump: major
1 change: 1 addition & 0 deletions substrate/client/rpc-spec-v2/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ hex = { workspace = true, default-features = true }
itertools = { workspace = true }
log = { workspace = true, default-features = true }
parking_lot = { workspace = true, default-features = true }
prometheus-endpoint = { workspace = true, default-features = true }
rand = { workspace = true, default-features = true }
sc-client-api = { workspace = true, default-features = true }
sc-rpc = { workspace = true, default-features = true }
Expand Down
13 changes: 13 additions & 0 deletions substrate/client/rpc-spec-v2/src/transaction/event.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,19 @@ pub enum TransactionEvent<Hash> {
Dropped(TransactionDropped),
}

impl<Hash> TransactionEvent<Hash> {
	/// Returns true if this is the last event emitted by the RPC subscription.
	///
	/// After any of these events no further events are produced for the
	/// transaction: it either reached a finalized block or left the pool
	/// (error, invalid or dropped).
	pub fn is_final(&self) -> bool {
		match self {
			TransactionEvent::Finalized(_) |
			TransactionEvent::Error(_) |
			TransactionEvent::Invalid(_) |
			TransactionEvent::Dropped(_) => true,
			_ => false,
		}
	}
}

/// Intermediate representation (IR) for the transaction events
/// that handles block events only.
///
Expand Down
155 changes: 155 additions & 0 deletions substrate/client/rpc-spec-v2/src/transaction/metrics.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
// This file is part of Substrate.

// Copyright (C) Parity Technologies (UK) Ltd.
// SPDX-License-Identifier: GPL-3.0-or-later WITH Classpath-exception-2.0

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

//! Metrics for recording transaction events.

use std::{collections::HashSet, time::Instant};

use prometheus_endpoint::{
exponential_buckets, linear_buckets, register, Histogram, HistogramOpts, PrometheusError,
Registry,
};

use super::TransactionEvent;

/// RPC layer metrics for transaction pool.
///
/// Each histogram records the elapsed time in seconds between transaction
/// submission and the corresponding RPC event being reported.
#[derive(Debug, Clone)]
pub struct Metrics {
	/// Time until the transaction was reported as validated.
	validated: Histogram,
	/// Time until the transaction was reported as included in a best-chain block.
	in_block: Histogram,
	/// Time until the transaction was reported as finalized.
	finalized: Histogram,
	/// Time until the transaction was reported as dropped.
	dropped: Histogram,
	/// Time until the transaction was reported as invalid.
	invalid: Histogram,
	/// Time until an error was reported for the transaction.
	error: Histogram,
}

impl Metrics {
/// Creates a new [`Metrics`] instance.
pub fn new(registry: &Registry) -> Result<Self, PrometheusError> {
let validated = register(
Histogram::with_opts(
HistogramOpts::new(
"rpc_transaction_validation_time",
"RPC Transaction validation time in seconds",
)
.buckets(exponential_buckets(0.01, 2.0, 16).expect("Valid buckets; qed")),
)?,
registry,
)?;

let in_block = register(
Histogram::with_opts(
HistogramOpts::new(
"rpc_transaction_in_block_time",
"RPC Transaction in block time in seconds",
)
.buckets(linear_buckets(0.0, 3.0, 20).expect("Valid buckets; qed")),
Copy link
Copy Markdown
Contributor

@michalkucharczyk michalkucharczyk May 16, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@olliecorbisiero here are new buckets for in-block.

Buckets for internal pool events (the ones you are using now in the dashboard) were exactly the same:

in_block: register(
Histogram::with_opts(histogram_opts!(
"substrate_sub_txpool_timing_event_in_block",
"Histogram of timings for reporting InBlock event",
linear_buckets(0.0, 3.0, 20).unwrap()
))?,
registry,

Let us know what would work for you?

)?,
registry,
)?;

let finalized = register(
Histogram::with_opts(
HistogramOpts::new(
"rpc_transaction_finalized_time",
"RPC Transaction finalized time in seconds",
)
.buckets(linear_buckets(0.01, 40.0, 20).expect("Valid buckets; qed")),
)?,
registry,
)?;

let dropped = register(
Histogram::with_opts(
HistogramOpts::new(
"rpc_transaction_dropped_time",
"RPC Transaction dropped time in seconds",
)
.buckets(linear_buckets(0.01, 3.0, 20).expect("Valid buckets; qed")),
)?,
registry,
)?;

let invalid = register(
Histogram::with_opts(
HistogramOpts::new(
"rpc_transaction_invalid_time",
"RPC Transaction invalid time in seconds",
)
.buckets(linear_buckets(0.01, 3.0, 20).expect("Valid buckets; qed")),
)?,
registry,
)?;

let error = register(
Histogram::with_opts(
HistogramOpts::new(
"rpc_transaction_error_time",
"RPC Transaction error time in seconds",
)
.buckets(linear_buckets(0.01, 3.0, 20).expect("Valid buckets; qed")),
)?,
registry,
)?;

Ok(Metrics { validated, in_block, finalized, dropped, invalid, error })
}
}

/// Transaction metrics for a single transaction instance.
///
/// Created once per `submitAndWatch` subscription; tracks elapsed time from
/// submission and makes sure each state is observed at most once.
pub struct InstanceMetrics {
	/// The metrics instance.
	///
	/// `None` when no Prometheus registry was configured; recording is then a no-op.
	metrics: Option<Metrics>,
	/// The time when the transaction was submitted.
	submitted_at: Instant,
	/// Ensure the states are reported once.
	reported_states: HashSet<&'static str>,
}

impl InstanceMetrics {
/// Creates a new [`InstanceMetrics`] instance.
pub fn new(metrics: Option<Metrics>) -> Self {
Self { metrics, submitted_at: Instant::now(), reported_states: HashSet::new() }
}

/// Record the execution time of a transaction state.
///
/// This represents how long it took for the transaction to move to the next state.
///
/// The method must be called before the transaction event is provided to the user.
pub fn register_event<Hash>(&mut self, event: &TransactionEvent<Hash>) {
let Some(ref metrics) = self.metrics else {
return;
};

let (histogram, target_state) = match event {
TransactionEvent::Validated => (&metrics.validated, "validated"),
TransactionEvent::BestChainBlockIncluded(Some(_)) => (&metrics.in_block, "in_block"),
TransactionEvent::BestChainBlockIncluded(None) => (&metrics.in_block, "retracted"),
TransactionEvent::Finalized(..) => (&metrics.finalized, "finalized"),
TransactionEvent::Error(..) => (&metrics.error, "error"),
TransactionEvent::Dropped(..) => (&metrics.dropped, "dropped"),
TransactionEvent::Invalid(..) => (&metrics.invalid, "invalid"),
};

// Only record the state if it hasn't been reported before.
if self.reported_states.insert(target_state) {
histogram.observe(self.submitted_at.elapsed().as_secs_f64());
}
}
}
3 changes: 3 additions & 0 deletions substrate/client/rpc-spec-v2/src/transaction/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,8 @@
#[cfg(test)]
mod tests;

mod metrics;

pub mod api;
pub mod error;
pub mod event;
Expand All @@ -36,5 +38,6 @@ pub mod transaction_broadcast;

pub use api::{TransactionApiServer, TransactionBroadcastApiServer};
pub use event::{TransactionBlock, TransactionDropped, TransactionError, TransactionEvent};
pub use metrics::Metrics as TransactionMetrics;
pub use transaction::Transaction;
pub use transaction_broadcast::TransactionBroadcast;
3 changes: 2 additions & 1 deletion substrate/client/rpc-spec-v2/src/transaction/tests/setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,8 @@ pub fn setup_api_tx() -> (
let (task_executor, executor_recv) = TaskExecutorBroadcast::new();

let tx_api =
RpcTransaction::new(client_mock.clone(), pool.clone(), Arc::new(task_executor)).into_rpc();
RpcTransaction::new(client_mock.clone(), pool.clone(), Arc::new(task_executor), None)
.into_rpc();

(api, pool, client_mock, tx_api, executor_recv, pool_state)
}
Expand Down
47 changes: 37 additions & 10 deletions substrate/client/rpc-spec-v2/src/transaction/transaction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,9 @@ use crate::{
use codec::Decode;
use futures::{StreamExt, TryFutureExt};
use jsonrpsee::{core::async_trait, PendingSubscriptionSink};

use super::metrics::{InstanceMetrics, Metrics};

use sc_rpc::utils::{RingBuffer, Subscription};
use sc_transaction_pool_api::{
error::IntoPoolError, BlockHash, TransactionFor, TransactionPool, TransactionSource,
Expand All @@ -50,12 +53,19 @@ pub struct Transaction<Pool, Client> {
pool: Arc<Pool>,
/// Executor to spawn subscriptions.
executor: SubscriptionTaskExecutor,
/// Metrics for transactions.
metrics: Option<Metrics>,
}

impl<Pool, Client> Transaction<Pool, Client> {
/// Creates a new [`Transaction`].
pub fn new(client: Arc<Client>, pool: Arc<Pool>, executor: SubscriptionTaskExecutor) -> Self {
Transaction { client, pool, executor }
pub fn new(
client: Arc<Client>,
pool: Arc<Pool>,
executor: SubscriptionTaskExecutor,
metrics: Option<Metrics>,
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That you need to take here the Metrics object, is kind of dirty. I get why you are doing this (because of the outer clone), but yeah would be nice without this. But yeah, not really doable right now.

) -> Self {
Transaction { client, pool, executor, metrics }
}
}

Expand All @@ -78,6 +88,9 @@ where
let client = self.client.clone();
let pool = self.pool.clone();

// Get a new transaction metrics instance and increment the counter.
let mut metrics = InstanceMetrics::new(self.metrics.clone());

let fut = async move {
let decoded_extrinsic = match TransactionFor::<Pool>::decode(&mut &xt[..]) {
Ok(decoded_extrinsic) => decoded_extrinsic,
Expand All @@ -86,12 +99,14 @@ where

let Ok(sink) = pending.accept().await.map(Subscription::from) else { return };

let event = TransactionEvent::Invalid::<BlockHash<Pool>>(TransactionError {
error: "Extrinsic bytes cannot be decoded".into(),
});

metrics.register_event(&event);

// The transaction is invalid.
let _ = sink
.send(&TransactionEvent::Invalid::<BlockHash<Pool>>(TransactionError {
error: "Extrinsic bytes cannot be decoded".into(),
}))
.await;
let _ = sink.send(&event).await;
return
},
};
Expand All @@ -112,8 +127,17 @@ where

match submit.await {
Ok(stream) => {
let stream =
stream.filter_map(move |event| async move { handle_event(event) }).boxed();
let stream = stream
.filter_map(|event| {
let event = handle_event(event);

event.as_ref().inspect(|event| {
metrics.register_event(event);
});

async move { event }
})
.boxed();

// If the subscription is too slow older events will be overwritten.
sink.pipe_from_stream(stream, RingBuffer::new(3)).await;
Expand All @@ -122,6 +146,9 @@ where
// We have not created an `Watcher` for the tx. Make sure the
// error is still propagated as an event.
let event: TransactionEvent<<Pool::Block as BlockT>::Hash> = err.into();

metrics.register_event(&event);

_ = sink.send(&event).await;
},
};
Expand All @@ -134,7 +161,7 @@ where
/// Handle events generated by the transaction-pool and convert them
/// to the new API expected state.
#[inline]
pub fn handle_event<Hash: Clone, BlockHash: Clone>(
fn handle_event<Hash: Clone, BlockHash: Clone>(
event: TransactionStatus<Hash, BlockHash>,
) -> Option<TransactionEvent<BlockHash>> {
match event {
Expand Down
11 changes: 11 additions & 0 deletions substrate/client/service/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -515,6 +515,14 @@ where
let rpc_id_provider = config.rpc.id_provider.take();

// jsonrpsee RPC
// RPC-V2 specific metrics need to be registered before the RPC server is started,
// since we might have two instances running (one for the in-memory RPC and one for the network
// RPC).
let rpc_v2_metrics = config
.prometheus_registry()
.map(|registry| sc_rpc_spec_v2::transaction::TransactionMetrics::new(registry))
.transpose()?;

let gen_rpc_module = || {
gen_rpc_module(
task_manager.spawn_handle(),
Expand All @@ -529,6 +537,7 @@ where
config.blocks_pruning,
backend.clone(),
&*rpc_builder,
rpc_v2_metrics.clone(),
)
};

Expand Down Expand Up @@ -676,6 +685,7 @@ pub fn gen_rpc_module<TBl, TBackend, TCl, TRpc, TExPool>(
blocks_pruning: BlocksPruning,
backend: Arc<TBackend>,
rpc_builder: &(dyn Fn(SubscriptionTaskExecutor) -> Result<RpcModule<TRpc>, Error>),
metrics: Option<sc_rpc_spec_v2::transaction::TransactionMetrics>,
) -> Result<RpcModule<()>, Error>
where
TBl: BlockT,
Expand Down Expand Up @@ -731,6 +741,7 @@ where
client.clone(),
transaction_pool.clone(),
task_executor.clone(),
metrics,
)
.into_rpc();

Expand Down
Loading