Skip to content

Commit 989802a

Browse files
alexgghbkchr
authored and committed
benchmark: storage: Make read/write benchmarks more accurate (#7867)
There are a few problems with these read/write benchmarks which makes them produce misleading results, especially when we enable the trie-cache. The problems are: - Both benchmarks run without PoV recorder enabled, that is not accurate for parachains because without the PoV recorder, you can directly access the key from the value cache, while with the PoV recorder you still need to do the walk through which uses the Node cache, e.g: https://github.com/paritytech/trie/blob/master/trie-db/src/lookup.rs#L446. To fix this I added I parameter enable-pov-recorder which is meant to be used when generating the weights for parachains. - Every write measures both the time to update the key and to compute the storage root and commit all the changes, which is not accurate because the storage root is computed only once at the end of the block. For this I added a new argument --batch-size, which is used to determine how many keys to update and performs the storage root computation only once, it then calculate the per key write cost as `durations / batch-size`. - For reads when you run with the PoV recorder, there is also a benefit from running with the same recorder rather than creating a different recorder every read, so we again use the `batch-size` for than to obtain the amortised cost of a read. 
- bench warmup seemed to not warmup child keys even when `include-child-trees`, so I fixed that as well ## Results on reference hardware, asset-hub-westend state | Setup | Batch size| Amortized cost of a key write(**ns**) | Amortized cost of a key read(**ns**)| |--------|--------|--------|--------| |Without TrieCache, Without PoV Recorder|1|88_521|46_981| |Without TrieCache, With PoV Recorder|1|95_161|48_711| |With TrieCache, Without PoV Recorder|1|66_008|528| |With TrieCache, With PoV Recorder|1|73_145|12_142| |Without TrieCache, Without PoV Recorder|1000|52_646|72_434| |Without TrieCache, With PoV Recorder|1000|54_896|50_267| |With TrieCache, Without PoV Recorder|1000|30_585|497| |With TrieCache, With PoV Recorder|1000|33_765|6_928| |Without TrieCache, Without PoV Recorder|10_000|48_945|52_730| |Without TrieCache, With PoV Recorder|10_000|50_285|49_860| |With TrieCache, Without PoV Recorder|10_000|25_903|484| |With TrieCache, With PoV Recorder|10_000|28_417|7_153| |Without TrieCache, Without PoV Recorder|100_000|31_359|45_839| |Without TrieCache, With PoV Recorder|100_000|32_932|48_393| |With TrieCache, Without PoV Recorder|100_000|20_255|493| |*With TrieCache, With PoV Recorder*, to be used|100_000|21_998|6_908| ## Results on reference hardware asset-hub-polkadot state | Setup | Batch size| Amortized cost of a key write(**ns**) | Amortized cost of a key read(**ns**)| |--------|--------|--------|--------| |Without TrieCache, Without PoV Recorder|1|102_239|56_209| |Without TrieCache, With PoV Recorder|1|106_659|54_256| |With TrieCache, Without PoV Recorder|1|85_419|608| |With TrieCache, With PoV Recorder|1|95_221|13_567| |Without TrieCache, Without PoV Recorder|1000|61_574|53_767| |Without TrieCache, With PoV Recorder|1000|64_770|66_162| |With TrieCache, Without PoV Recorder|1000|35_879|597| |With TrieCache, With PoV Recorder|1000|39_464|8_482| |Without TrieCache, Without PoV Recorder|10_000|62_465|58_236| |Without TrieCache, With PoV 
Recorder|10_000|65_082|95_118| |With TrieCache, Without PoV Recorder|10_000|32_259|601| |With TrieCache, With PoV Recorder|10_000|34_620|8_810| |Without TrieCache, Without PoV Recorder|100_000|43_794|69_157| |Without TrieCache, With PoV Recorder|100_000|45_060|66_343| |With TrieCache, Without PoV Recorder|100_000|25_327|596| |*With TrieCache, With PoV Recorder*, to be used|100_000|27_622|8_598| ## Results on my local machine with westend-assethub state. | Setup | Batch size| Amortized cost of a key write(**ns**) | Amortized cost of a key read(**ns**)| |--------|--------|--------|--------| |Without TrieCache, Without PoV Recorder|1| 55_443|27_510| |Without TrieCache, With PoV Recorder|1|143_189|105_103| |With TrieCache, Without PoV Recorder|1|37_519|370| |With TrieCache, With PoV Recorder|1|42_569|7_309| |Without TrieCache, Without PoV Recorder|1000| 29_364|25_150| |Without TrieCache, With PoV Recorder|1000|33_221|107_349| |With TrieCache, Without PoV Recorder|1000|18_355|370| |With TrieCache, With PoV Recorder|1000|19_883|4_063| |Without TrieCache, Without PoV Recorder|10_000| 28_336|27_765| |Without TrieCache, With PoV Recorder|10_000|29_673|62_392| |With TrieCache, Without PoV Recorder|10_000|15_102|370| |With TrieCache, With PoV Recorder|10_000|16_461|4_124| |Without TrieCache, Without PoV Recorder|100_000| 18_935|27_151| |Without TrieCache, With PoV Recorder|100_000|19_681|48_393| |With TrieCache, Without PoV Recorder|100_000|12_569|362| |*With TrieCache, With PoV Recorder*, to be used|100_000|13_469|3_895| Fixes: #7535 ## Todo: - [x] Run this benchmarks on reference hardware on configuration variant closest to the production environment. --------- Signed-off-by: Alexandru Gheorghe <alexandru.gheorghe@parity.io> Co-authored-by: Bastian Köcher <git@kchr.de>
1 parent e21471f commit 989802a

File tree

13 files changed

+217
-33
lines changed

13 files changed

+217
-33
lines changed

cumulus/polkadot-omni-node/lib/src/common/command.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -155,7 +155,8 @@ where
155155
let partial = T::new_partial(&config).map_err(sc_cli::Error::Service)?;
156156
let db = partial.backend.expose_db();
157157
let storage = partial.backend.expose_storage();
158+
let shared_trie_cache = partial.backend.expose_shared_trie_cache();
158159

159-
cmd.run(config, partial.client, db, storage)
160+
cmd.run(config, partial.client, db, storage, shared_trie_cache)
160161
}
161162
}

cumulus/polkadot-omni-node/lib/src/tests/benchmark_storage_works.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ fn benchmark_storage(db: &str, runtime: &str, base_path: &Path) -> ExitStatus {
5353
.arg("--weight-path")
5454
.arg(base_path)
5555
.args(["--state-version", "0"])
56+
.args(["--batch-size", "1"])
5657
.args(["--warmups", "0"])
5758
.args(["--add", "100", "--mul", "1.2", "--metric", "p75"])
5859
.status()

polkadot/cli/src/command.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -386,8 +386,9 @@ pub fn run() -> Result<()> {
386386
let (client, backend, _, _) = polkadot_service::new_chain_ops(&mut config)?;
387387
let db = backend.expose_db();
388388
let storage = backend.expose_storage();
389+
let shared_trie_cache = backend.expose_shared_trie_cache();
389390

390-
cmd.run(config, client.clone(), db, storage).map_err(Error::SubstrateCli)
391+
cmd.run(config, client.clone(), db, storage, shared_trie_cache).map_err(Error::SubstrateCli)
391392
}),
392393
BenchmarkCmd::Block(cmd) => runner.sync_run(|mut config| {
393394
let (client, _, _, _) = polkadot_service::new_chain_ops(&mut config)?;

polkadot/tests/benchmark_storage_works.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ fn benchmark_storage(db: &str, base_path: &Path) -> ExitStatus {
4646
.arg("--weight-path")
4747
.arg(base_path)
4848
.args(["--state-version", "0"])
49+
.args(["--batch-size", "1"])
4950
.args(["--warmups", "0"])
5051
.args(["--add", "100", "--mul", "1.2", "--metric", "p75"])
5152
.status()

prdoc/pr_7867.prdoc

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
title: benchmark/storage Make read/write benchmarks more accurate
2+
3+
doc:
4+
- audience: [Runtime Dev, Node Dev]
5+
description: |-
6+
Improve the benchmark accuracy of read/write costs by making sure for both
7+
reads and write we compute the amortized cost of a single key operation, by adding
8+
a batch functionality to make sure the cost of common operations like root computation
9+
is spread across multiple keys. Additionally, also add a pov-recorder flag, so that we
10+
are able to replicate the same environment as parachains do.
11+
12+
crates:
13+
- name: sc-client-db
14+
bump: major
15+
- name: frame-benchmarking-cli
16+
bump: major
17+
- name: polkadot-cli
18+
bump: major
19+
- name: polkadot-omni-node-lib
20+
bump: major
21+
- name: polkadot
22+
bump: patch
23+

substrate/bin/node/cli/src/command.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,8 +127,9 @@ pub fn run() -> Result<()> {
127127
let partial = new_partial(&config, None)?;
128128
let db = partial.backend.expose_db();
129129
let storage = partial.backend.expose_storage();
130+
let shared_trie_cache = partial.backend.expose_shared_trie_cache();
130131

131-
cmd.run(config, partial.client, db, storage)
132+
cmd.run(config, partial.client, db, storage, shared_trie_cache)
132133
},
133134
BenchmarkCmd::Overhead(cmd) => {
134135
// ensure that we keep the task manager alive

substrate/bin/node/cli/tests/benchmark_storage_works.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ fn benchmark_storage(db: &str, base_path: &Path) -> ExitStatus {
4747
.arg("--weight-path")
4848
.arg(base_path)
4949
.args(["--state-version", "1"])
50+
.args(["--batch-size", "1"])
5051
.args(["--warmups", "0"])
5152
.args(["--add", "100", "--mul", "1.2", "--metric", "p75"])
5253
.arg("--include-child-trees")

substrate/client/db/src/lib.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1193,6 +1193,16 @@ impl<Block: BlockT> Backend<Block> {
11931193
self.storage.clone()
11941194
}
11951195

1196+
/// Expose the shared trie cache that is used by this backend.
1197+
///
1198+
/// Should only be needed for benchmarking.
1199+
#[cfg(feature = "runtime-benchmarks")]
1200+
pub fn expose_shared_trie_cache(
1201+
&self,
1202+
) -> Option<sp_trie::cache::SharedTrieCache<HashingFor<Block>>> {
1203+
self.shared_trie_cache.clone()
1204+
}
1205+
11961206
fn from_database(
11971207
db: Arc<dyn Database<DbHash>>,
11981208
canonicalization_delay: u64,

substrate/utils/frame/benchmarking-cli/src/storage/cmd.rs

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use sc_cli::{CliConfiguration, DatabaseParams, PruningParams, Result, SharedPara
1919
use sc_client_api::{Backend as ClientBackend, StorageProvider, UsageProvider};
2020
use sc_client_db::DbHash;
2121
use sc_service::Configuration;
22+
use sp_api::CallApiAt;
2223
use sp_blockchain::HeaderBackend;
2324
use sp_database::{ColumnId, Database};
2425
use sp_runtime::traits::{Block as BlockT, HashingFor};
@@ -116,6 +117,25 @@ pub struct StorageParams {
116117
/// Include child trees in benchmark.
117118
#[arg(long)]
118119
pub include_child_trees: bool,
120+
121+
/// Disable PoV recorder.
122+
///
123+
/// The recorder has impact on performance when benchmarking with the TrieCache enabled.
124+
/// If the chain is recording a proof while building/importing a block, the pov recorder
125+
/// should be activated.
126+
///
127+
/// Hence, when generating weights for a parachain this should be activated and when generating
128+
/// weights for a standalone chain this should be deactivated.
129+
#[arg(long, default_value = "false")]
130+
pub disable_pov_recorder: bool,
131+
132+
/// The batch size for the write benchmark.
133+
///
134+
/// Since the write size needs to also include the cost of computing the storage root, which is
135+
/// done once at the end of the block, the batch size is used to simulate multiple writes in a
136+
/// block.
137+
#[arg(long, default_value_t = 100_000)]
138+
pub batch_size: usize,
119139
}
120140

121141
impl StorageCmd {
@@ -127,11 +147,15 @@ impl StorageCmd {
127147
client: Arc<C>,
128148
db: (Arc<dyn Database<DbHash>>, ColumnId),
129149
storage: Arc<dyn Storage<HashingFor<Block>>>,
150+
shared_trie_cache: Option<sp_trie::cache::SharedTrieCache<HashingFor<Block>>>,
130151
) -> Result<()>
131152
where
132153
BA: ClientBackend<Block>,
133154
Block: BlockT<Hash = DbHash>,
134-
C: UsageProvider<Block> + StorageProvider<Block, BA> + HeaderBackend<Block>,
155+
C: UsageProvider<Block>
156+
+ StorageProvider<Block, BA>
157+
+ HeaderBackend<Block>
158+
+ CallApiAt<Block>,
135159
{
136160
let mut template = TemplateData::new(&cfg, &self.params)?;
137161

@@ -140,7 +164,7 @@ impl StorageCmd {
140164

141165
if !self.params.skip_read {
142166
self.bench_warmup(&client)?;
143-
let record = self.bench_read(client.clone())?;
167+
let record = self.bench_read(client.clone(), shared_trie_cache.clone())?;
144168
if let Some(path) = &self.params.json_read_path {
145169
record.save_json(&cfg, path, "read")?;
146170
}
@@ -151,7 +175,7 @@ impl StorageCmd {
151175

152176
if !self.params.skip_write {
153177
self.bench_warmup(&client)?;
154-
let record = self.bench_write(client, db, storage)?;
178+
let record = self.bench_write(client, db, storage, shared_trie_cache)?;
155179
if let Some(path) = &self.params.json_write_path {
156180
record.save_json(&cfg, path, "write")?;
157181
}
@@ -197,11 +221,31 @@ impl StorageCmd {
197221

198222
for i in 0..self.params.warmups {
199223
info!("Warmup round {}/{}", i + 1, self.params.warmups);
224+
let mut child_nodes = Vec::new();
225+
200226
for key in keys.as_slice() {
201227
let _ = client
202228
.storage(hash, &key)
203229
.expect("Checked above to exist")
204230
.ok_or("Value unexpectedly empty");
231+
232+
if let Some(info) = self
233+
.params
234+
.include_child_trees
235+
.then(|| self.is_child_key(key.clone().0))
236+
.flatten()
237+
{
238+
// child tree key
239+
for ck in client.child_storage_keys(hash, info.clone(), None, None)? {
240+
child_nodes.push((ck.clone(), info.clone()));
241+
}
242+
}
243+
}
244+
for (key, info) in child_nodes.as_slice() {
245+
let _ = client
246+
.child_storage(hash, info, key)
247+
.expect("Checked above to exist")
248+
.ok_or("Value unexpectedly empty")?;
205249
}
206250
}
207251

substrate/utils/frame/benchmarking-cli/src/storage/read.rs

Lines changed: 66 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,13 @@
1515
// See the License for the specific language governing permissions and
1616
// limitations under the License.
1717

18-
use sc_cli::Result;
19-
use sc_client_api::{Backend as ClientBackend, StorageProvider, UsageProvider};
20-
use sp_runtime::traits::{Block as BlockT, Header as HeaderT};
21-
2218
use log::info;
2319
use rand::prelude::*;
20+
use sc_cli::{Error, Result};
21+
use sc_client_api::{Backend as ClientBackend, StorageProvider, UsageProvider};
22+
use sp_api::CallApiAt;
23+
use sp_runtime::traits::{Block as BlockT, HashingFor, Header as HeaderT};
24+
use sp_state_machine::{backend::AsTrieBackend, Backend};
2425
use std::{fmt::Debug, sync::Arc, time::Instant};
2526

2627
use super::cmd::StorageCmd;
@@ -29,9 +30,13 @@ use crate::shared::{new_rng, BenchRecord};
2930
impl StorageCmd {
3031
/// Benchmarks the time it takes to read a single Storage item.
3132
/// Uses the latest state that is available for the given client.
32-
pub(crate) fn bench_read<B, BA, C>(&self, client: Arc<C>) -> Result<BenchRecord>
33+
pub(crate) fn bench_read<B, BA, C>(
34+
&self,
35+
client: Arc<C>,
36+
_shared_trie_cache: Option<sp_trie::cache::SharedTrieCache<HashingFor<B>>>,
37+
) -> Result<BenchRecord>
3338
where
34-
C: UsageProvider<B> + StorageProvider<B, BA>,
39+
C: UsageProvider<B> + StorageProvider<B, BA> + CallApiAt<B>,
3540
B: BlockT + Debug,
3641
BA: ClientBackend<B>,
3742
<<B as BlockT>::Header as HeaderT>::Number: From<u32>,
@@ -49,6 +54,19 @@ impl StorageCmd {
4954
// Interesting part here:
5055
// Read all the keys in the database and measure the time it takes to access each.
5156
info!("Reading {} keys", keys.len());
57+
58+
// Read using the same TrieBackend and recorder for up to `batch_size` keys.
59+
// This would allow us to measure the amortized cost of reading a key.
60+
let recorder = (!self.params.disable_pov_recorder).then(|| Default::default());
61+
let mut state = client
62+
.state_at(best_hash)
63+
.map_err(|_err| Error::Input("State not found".into()))?;
64+
let mut as_trie_backend = state.as_trie_backend();
65+
let mut backend = sp_state_machine::TrieBackendBuilder::wrap(&as_trie_backend)
66+
.with_optional_recorder(recorder)
67+
.build();
68+
let mut read_in_batch = 0;
69+
5270
for key in keys.as_slice() {
5371
match (self.params.include_child_trees, self.is_child_key(key.clone().0)) {
5472
(true, Some(info)) => {
@@ -60,13 +78,31 @@ impl StorageCmd {
6078
_ => {
6179
// regular key
6280
let start = Instant::now();
63-
let v = client
64-
.storage(best_hash, &key)
81+
82+
let v = backend
83+
.storage(key.0.as_ref())
6584
.expect("Checked above to exist")
6685
.ok_or("Value unexpectedly empty")?;
67-
record.append(v.0.len(), start.elapsed())?;
86+
record.append(v.len(), start.elapsed())?;
6887
},
6988
}
89+
read_in_batch += 1;
90+
if read_in_batch >= self.params.batch_size {
91+
// Using a new recorder for every read vs using the same for the entire batch
92+
// produces significant different results. Since in the real use case we use a
93+
// single recorder per block, simulate the same behavior by creating a new
94+
// recorder every batch size, so that the amortized cost of reading a key is
95+
// measured in conditions closer to the real world.
96+
let recorder = (!self.params.disable_pov_recorder).then(|| Default::default());
97+
state = client
98+
.state_at(best_hash)
99+
.map_err(|_err| Error::Input("State not found".to_string()))?;
100+
as_trie_backend = state.as_trie_backend();
101+
backend = sp_state_machine::TrieBackendBuilder::wrap(&as_trie_backend)
102+
.with_optional_recorder(recorder)
103+
.build();
104+
read_in_batch = 0;
105+
}
70106
}
71107

72108
if self.params.include_child_trees {
@@ -75,11 +111,29 @@ impl StorageCmd {
75111
info!("Reading {} child keys", child_nodes.len());
76112
for (key, info) in child_nodes.as_slice() {
77113
let start = Instant::now();
78-
let v = client
79-
.child_storage(best_hash, info, key)
114+
let v = backend
115+
.child_storage(info, key.0.as_ref())
80116
.expect("Checked above to exist")
81117
.ok_or("Value unexpectedly empty")?;
82-
record.append(v.0.len(), start.elapsed())?;
118+
record.append(v.len(), start.elapsed())?;
119+
120+
read_in_batch += 1;
121+
if read_in_batch >= self.params.batch_size {
122+
// Using a new recorder for every read vs using the same for the entire batch
123+
// produces significant different results. Since in the real use case we use a
124+
// single recorder per block, simulate the same behavior by creating a new
125+
// recorder every batch size, so that the amortized cost of reading a key is
126+
// measured in conditions closer to the real world.
127+
let recorder = (!self.params.disable_pov_recorder).then(|| Default::default());
128+
state = client
129+
.state_at(best_hash)
130+
.map_err(|_err| Error::Input("State not found".to_string()))?;
131+
as_trie_backend = state.as_trie_backend();
132+
backend = sp_state_machine::TrieBackendBuilder::wrap(&as_trie_backend)
133+
.with_optional_recorder(recorder)
134+
.build();
135+
read_in_batch = 0;
136+
}
83137
}
84138
}
85139
Ok(record)

0 commit comments

Comments
 (0)