Skip to content

Commit f870b66

Browse files
authored
Rework Validator Client fallback mechanism (#4393)
* Rework Validator Client fallback mechanism
* Add CI workflow for fallback simulator
* Tie-break with sync distance for non-synced nodes
* Fix simulator
* Cleanup unused code
* More improvements
* Add IsOptimistic enum for readability
* Use configurable sync distance tiers
* Fix tests
* Combine status and health and improve logging
* Fix nodes not being marked as available
* Fix simulator
* Fix tests again
* Increase fallback simulator tolerance
* Add http api endpoint
* Fix todos and tests
* Update simulator
* Merge branch 'unstable' into vc-fallback
* Add suggestions
* Add id to ui endpoint
* Remove unnecessary clones
* Formatting
* Merge branch 'unstable' into vc-fallback
* Merge branch 'unstable' into vc-fallback
* Fix flag tests
* Merge branch 'unstable' into vc-fallback
* Merge branch 'unstable' into vc-fallback
* Fix conflicts
* Merge branch 'unstable' into vc-fallback
* Remove unnecessary pubs
* Simplify `compute_distance_tier` and reduce notifier awaits
* Use the more descriptive `user_index` instead of `id`
* Combine sync distance tolerance flags into one
* Merge branch 'unstable' into vc-fallback
* Merge branch 'unstable' into vc-fallback
* wip
* Use new simulator from unstable
* Fix cli text
* Remove leftover files
* Remove old commented code
* Merge branch 'unstable' into vc-fallback
* Update cli text
* Silence candidate errors when pre-genesis
* Merge branch 'unstable' into vc-fallback
* Merge branch 'unstable' into vc-fallback
* Retry on failure
* Merge branch 'unstable' into vc-fallback
* Merge branch 'unstable' into vc-fallback
* Remove disable_run_on_all
* Remove unused error variant
* Fix out of date comment
* Merge branch 'unstable' into vc-fallback
* Remove unnecessary as_u64
* Remove more out of date comments
* Use tokio RwLock and remove parking_lot
* Merge branch 'unstable' into vc-fallback
* Formatting
* Ensure nodes are still added to total when not available
* Allow VC to detect when BN comes online
* Fix ui endpoint
* Don't have block_service as an Option
* Merge branch 'unstable' into vc-fallback
* Clean up lifetimes and futures
* Revert "Don't have block_service as an Option" This reverts commit b5445a0.
* Merge branch 'unstable' into vc-fallback
* Merge branch 'unstable' into vc-fallback
* Improve rwlock sanitation using clones
* Merge branch 'unstable' into vc-fallback
* Drop read lock immediately by cloning the vec.
1 parent 17849b5 commit f870b66

24 files changed

+1314
-776
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

book/src/help_vc.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,22 @@ Options:
177177
Default is unlimited.
178178
179179
Flags:
180+
--beacon-nodes-sync-tolerances <SYNC_TOLERANCES>
181+
A comma-separated list of 3 values which sets the size of each sync
182+
distance range when determining the health of each connected beacon
183+
node. The first value determines the `Synced` range. If a connected
184+
beacon node is synced to within this number of slots it is considered
185+
'Synced'. The second value determines the `Small` sync distance range.
186+
This range starts immediately after the `Synced` range. The third
187+
value determines the `Medium` sync distance range. This range starts
188+
immediately after the `Small` range. Any sync distance value beyond
189+
that is considered `Large`. For example, a value of `8,8,48` would
190+
have ranges like the following: `Synced`: 0..=8 `Small`: 9..=16
191+
`Medium`: 17..=64 `Large`: 65.. These values are used to determine
192+
what ordering beacon node fallbacks are used in. Generally, `Synced`
193+
nodes are preferred over `Small` and so on. Nodes in the `Synced`
194+
range will tie-break based on their ordering in `--beacon-nodes`. This
195+
ensures the primary beacon node is prioritised. [default: 8,8,48]
180196
--builder-proposals
181197
If this flag is set, Lighthouse will query the Beacon Node for only
182198
block headers during proposals and will sign over headers. Useful for

common/eth2/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ store = { workspace = true }
2929
slashing_protection = { workspace = true }
3030
mediatype = "0.19.13"
3131
pretty_reqwest_error = { workspace = true }
32+
derivative = { workspace = true }
3233

3334
[dev-dependencies]
3435
tokio = { workspace = true }

common/eth2/src/lib.rs

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ pub mod types;
1616

1717
use self::mixin::{RequestAccept, ResponseOptional};
1818
use self::types::{Error as ResponseError, *};
19+
use derivative::Derivative;
1920
use futures::Stream;
2021
use futures_util::StreamExt;
2122
use lighthouse_network::PeerId;
@@ -117,7 +118,7 @@ impl fmt::Display for Error {
117118

118119
/// A struct to define a variety of different timeouts for different validator tasks to ensure
119120
/// proper fallback behaviour.
120-
#[derive(Clone)]
121+
#[derive(Clone, Debug, PartialEq, Eq)]
121122
pub struct Timeouts {
122123
pub attestation: Duration,
123124
pub attester_duties: Duration,
@@ -154,13 +155,17 @@ impl Timeouts {
154155

155156
/// A wrapper around `reqwest::Client` which provides convenience methods for interfacing with a
156157
/// Lighthouse Beacon Node HTTP server (`http_api`).
157-
#[derive(Clone)]
158+
#[derive(Clone, Debug, Derivative)]
159+
#[derivative(PartialEq)]
158160
pub struct BeaconNodeHttpClient {
161+
#[derivative(PartialEq = "ignore")]
159162
client: reqwest::Client,
160163
server: SensitiveUrl,
161164
timeouts: Timeouts,
162165
}
163166

167+
impl Eq for BeaconNodeHttpClient {}
168+
164169
impl fmt::Display for BeaconNodeHttpClient {
165170
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
166171
self.server.fmt(f)

lighthouse/tests/validator_client.rs

Lines changed: 31 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
1-
use validator_client::{config::DEFAULT_WEB3SIGNER_KEEP_ALIVE, ApiTopic, Config};
1+
use validator_client::{
2+
config::DEFAULT_WEB3SIGNER_KEEP_ALIVE, ApiTopic, BeaconNodeSyncDistanceTiers, Config,
3+
};
24

35
use crate::exec::CommandLineTestExec;
46
use bls::{Keypair, PublicKeyBytes};
@@ -12,7 +14,7 @@ use std::str::FromStr;
1214
use std::string::ToString;
1315
use std::time::Duration;
1416
use tempfile::TempDir;
15-
use types::Address;
17+
use types::{Address, Slot};
1618

1719
/// Returns the `lighthouse validator_client` command.
1820
fn base_cmd() -> Command {
@@ -511,7 +513,6 @@ fn monitoring_endpoint() {
511513
assert_eq!(api_conf.update_period_secs, Some(30));
512514
});
513515
}
514-
515516
#[test]
516517
fn disable_run_on_all_flag() {
517518
CommandLineTest::new()
@@ -572,6 +573,33 @@ fn broadcast_flag() {
572573
});
573574
}
574575

576+
/// Tests for validator fallback flags.
577+
#[test]
578+
fn beacon_nodes_sync_tolerances_flag_default() {
579+
CommandLineTest::new().run().with_config(|config| {
580+
assert_eq!(
581+
config.beacon_node_fallback.sync_tolerances,
582+
BeaconNodeSyncDistanceTiers::default()
583+
)
584+
});
585+
}
586+
#[test]
587+
fn beacon_nodes_sync_tolerances_flag() {
588+
CommandLineTest::new()
589+
.flag("beacon-nodes-sync-tolerances", Some("4,4,4"))
590+
.run()
591+
.with_config(|config| {
592+
assert_eq!(
593+
config.beacon_node_fallback.sync_tolerances,
594+
BeaconNodeSyncDistanceTiers {
595+
synced: Slot::new(4),
596+
small: Slot::new(8),
597+
medium: Slot::new(12),
598+
}
599+
);
600+
});
601+
}
602+
575603
#[test]
576604
#[should_panic(expected = "Unknown API topic")]
577605
fn wrong_broadcast_flag() {

testing/simulator/src/fallback_sim.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ const DENEB_FORK_EPOCH: u64 = 2;
2929
// This has potential to block CI so it should be set conservatively enough that spurious failures
3030
// don't become very common, but not so conservatively that regressions to the fallback mechanism
3131
// cannot be detected.
32-
const ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE: f64 = 85.0;
32+
const ACCEPTABLE_FALLBACK_ATTESTATION_HIT_PERCENTAGE: f64 = 95.0;
3333

3434
const SUGGESTED_FEE_RECIPIENT: [u8; 20] =
3535
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1];

validator_client/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ path = "src/lib.rs"
1010

1111
[dev-dependencies]
1212
tokio = { workspace = true }
13-
itertools = { workspace = true }
1413

1514
[dependencies]
1615
tree_hash = { workspace = true }
@@ -60,4 +59,5 @@ sysinfo = { workspace = true }
6059
system_health = { path = "../common/system_health" }
6160
logging = { workspace = true }
6261
strum = { workspace = true }
62+
itertools = { workspace = true }
6363
fdlimit = "0.3.0"

validator_client/src/attestation_service.rs

Lines changed: 79 additions & 101 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,8 @@
1-
use crate::beacon_node_fallback::{ApiTopic, BeaconNodeFallback, RequireSynced};
1+
use crate::beacon_node_fallback::{ApiTopic, BeaconNodeFallback};
22
use crate::{
33
duties_service::{DutiesService, DutyAndProof},
44
http_metrics::metrics,
55
validator_store::{Error as ValidatorStoreError, ValidatorStore},
6-
OfflineOnFailure,
76
};
87
use environment::RuntimeContext;
98
use futures::future::join_all;
@@ -339,21 +338,17 @@ impl<T: SlotClock + 'static, E: EthSpec> AttestationService<T, E> {
339338

340339
let attestation_data = self
341340
.beacon_nodes
342-
.first_success(
343-
RequireSynced::No,
344-
OfflineOnFailure::Yes,
345-
|beacon_node| async move {
346-
let _timer = metrics::start_timer_vec(
347-
&metrics::ATTESTATION_SERVICE_TIMES,
348-
&[metrics::ATTESTATIONS_HTTP_GET],
349-
);
350-
beacon_node
351-
.get_validator_attestation_data(slot, committee_index)
352-
.await
353-
.map_err(|e| format!("Failed to produce attestation data: {:?}", e))
354-
.map(|result| result.data)
355-
},
356-
)
341+
.first_success(|beacon_node| async move {
342+
let _timer = metrics::start_timer_vec(
343+
&metrics::ATTESTATION_SERVICE_TIMES,
344+
&[metrics::ATTESTATIONS_HTTP_GET],
345+
);
346+
beacon_node
347+
.get_validator_attestation_data(slot, committee_index)
348+
.await
349+
.map_err(|e| format!("Failed to produce attestation data: {:?}", e))
350+
.map(|result| result.data)
351+
})
357352
.await
358353
.map_err(|e| e.to_string())?;
359354

@@ -458,26 +453,21 @@ impl<T: SlotClock + 'static, E: EthSpec> AttestationService<T, E> {
458453
// Post the attestations to the BN.
459454
match self
460455
.beacon_nodes
461-
.request(
462-
RequireSynced::No,
463-
OfflineOnFailure::Yes,
464-
ApiTopic::Attestations,
465-
|beacon_node| async move {
466-
let _timer = metrics::start_timer_vec(
467-
&metrics::ATTESTATION_SERVICE_TIMES,
468-
&[metrics::ATTESTATIONS_HTTP_POST],
469-
);
470-
if fork_name.electra_enabled() {
471-
beacon_node
472-
.post_beacon_pool_attestations_v2(attestations, fork_name)
473-
.await
474-
} else {
475-
beacon_node
476-
.post_beacon_pool_attestations_v1(attestations)
477-
.await
478-
}
479-
},
480-
)
456+
.request(ApiTopic::Attestations, |beacon_node| async move {
457+
let _timer = metrics::start_timer_vec(
458+
&metrics::ATTESTATION_SERVICE_TIMES,
459+
&[metrics::ATTESTATIONS_HTTP_POST],
460+
);
461+
if fork_name.electra_enabled() {
462+
beacon_node
463+
.post_beacon_pool_attestations_v2(attestations, fork_name)
464+
.await
465+
} else {
466+
beacon_node
467+
.post_beacon_pool_attestations_v1(attestations)
468+
.await
469+
}
470+
})
481471
.await
482472
{
483473
Ok(()) => info!(
@@ -540,46 +530,38 @@ impl<T: SlotClock + 'static, E: EthSpec> AttestationService<T, E> {
540530

541531
let aggregated_attestation = &self
542532
.beacon_nodes
543-
.first_success(
544-
RequireSynced::No,
545-
OfflineOnFailure::Yes,
546-
|beacon_node| async move {
547-
let _timer = metrics::start_timer_vec(
548-
&metrics::ATTESTATION_SERVICE_TIMES,
549-
&[metrics::AGGREGATES_HTTP_GET],
550-
);
551-
if fork_name.electra_enabled() {
552-
beacon_node
553-
.get_validator_aggregate_attestation_v2(
554-
attestation_data.slot,
555-
attestation_data.tree_hash_root(),
556-
committee_index,
557-
)
558-
.await
559-
.map_err(|e| {
560-
format!("Failed to produce an aggregate attestation: {:?}", e)
561-
})?
562-
.ok_or_else(|| {
563-
format!("No aggregate available for {:?}", attestation_data)
564-
})
565-
.map(|result| result.data)
566-
} else {
567-
beacon_node
568-
.get_validator_aggregate_attestation_v1(
569-
attestation_data.slot,
570-
attestation_data.tree_hash_root(),
571-
)
572-
.await
573-
.map_err(|e| {
574-
format!("Failed to produce an aggregate attestation: {:?}", e)
575-
})?
576-
.ok_or_else(|| {
577-
format!("No aggregate available for {:?}", attestation_data)
578-
})
579-
.map(|result| result.data)
580-
}
581-
},
582-
)
533+
.first_success(|beacon_node| async move {
534+
let _timer = metrics::start_timer_vec(
535+
&metrics::ATTESTATION_SERVICE_TIMES,
536+
&[metrics::AGGREGATES_HTTP_GET],
537+
);
538+
if fork_name.electra_enabled() {
539+
beacon_node
540+
.get_validator_aggregate_attestation_v2(
541+
attestation_data.slot,
542+
attestation_data.tree_hash_root(),
543+
committee_index,
544+
)
545+
.await
546+
.map_err(|e| {
547+
format!("Failed to produce an aggregate attestation: {:?}", e)
548+
})?
549+
.ok_or_else(|| format!("No aggregate available for {:?}", attestation_data))
550+
.map(|result| result.data)
551+
} else {
552+
beacon_node
553+
.get_validator_aggregate_attestation_v1(
554+
attestation_data.slot,
555+
attestation_data.tree_hash_root(),
556+
)
557+
.await
558+
.map_err(|e| {
559+
format!("Failed to produce an aggregate attestation: {:?}", e)
560+
})?
561+
.ok_or_else(|| format!("No aggregate available for {:?}", attestation_data))
562+
.map(|result| result.data)
563+
}
564+
})
583565
.await
584566
.map_err(|e| e.to_string())?;
585567

@@ -637,30 +619,26 @@ impl<T: SlotClock + 'static, E: EthSpec> AttestationService<T, E> {
637619
let signed_aggregate_and_proofs_slice = signed_aggregate_and_proofs.as_slice();
638620
match self
639621
.beacon_nodes
640-
.first_success(
641-
RequireSynced::No,
642-
OfflineOnFailure::Yes,
643-
|beacon_node| async move {
644-
let _timer = metrics::start_timer_vec(
645-
&metrics::ATTESTATION_SERVICE_TIMES,
646-
&[metrics::AGGREGATES_HTTP_POST],
647-
);
648-
if fork_name.electra_enabled() {
649-
beacon_node
650-
.post_validator_aggregate_and_proof_v2(
651-
signed_aggregate_and_proofs_slice,
652-
fork_name,
653-
)
654-
.await
655-
} else {
656-
beacon_node
657-
.post_validator_aggregate_and_proof_v1(
658-
signed_aggregate_and_proofs_slice,
659-
)
660-
.await
661-
}
662-
},
663-
)
622+
.first_success(|beacon_node| async move {
623+
let _timer = metrics::start_timer_vec(
624+
&metrics::ATTESTATION_SERVICE_TIMES,
625+
&[metrics::AGGREGATES_HTTP_POST],
626+
);
627+
if fork_name.electra_enabled() {
628+
beacon_node
629+
.post_validator_aggregate_and_proof_v2(
630+
signed_aggregate_and_proofs_slice,
631+
fork_name,
632+
)
633+
.await
634+
} else {
635+
beacon_node
636+
.post_validator_aggregate_and_proof_v1(
637+
signed_aggregate_and_proofs_slice,
638+
)
639+
.await
640+
}
641+
})
664642
.await
665643
{
666644
Ok(()) => {

0 commit comments

Comments
 (0)