Skip to content

Commit 46eecd0

Browse files
committed
fix: auto-reconnect Bittensor client when WS connection dies; add health-check timer
Validators were silently stopping weight submission when the Bittensor WebSocket connection died ('background task closed'). The BlockSync internally reconnects, but the Subtensor and BittensorClient used for metagraph refresh and weight submission were left on the dead connection.

- Recreate bittensor_client_for_metagraph on BlockSyncEvent::Reconnected
- Recreate bittensor_client in metagraph refresh when old client is dead
- Add 5-min health-check timer: if no block events received, proactively reconnect Subtensor + recreate BittensorClient
- Better error classification: HotKeyNotRegisteredInSubNet, CommittingWeightsTooFast
- All reconnection events logged as warn/error for Sentry visibility
1 parent 63411f9 commit 46eecd0

File tree

1 file changed

+115
-3
lines changed

1 file changed

+115
-3
lines changed

bins/validator-node/src/main.rs

Lines changed: 115 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1255,6 +1255,8 @@ async fn main() -> Result<()> {
12551255
let mut storage_stats_interval = tokio::time::interval(Duration::from_secs(300));
12561256
let mut storage_flush_interval = tokio::time::interval(Duration::from_secs(5));
12571257
let mut background_tick_interval = tokio::time::interval(Duration::from_secs(12));
1258+
let mut bittensor_health_interval = tokio::time::interval(Duration::from_secs(300));
1259+
let mut last_block_event_time = std::time::Instant::now();
12581260
// Track last synced block per challenge for delta sync
12591261
let challenge_last_sync: Arc<
12601262
RwLock<std::collections::HashMap<platform_core::ChallengeId, u64>>,
@@ -1293,6 +1295,8 @@ async fn main() -> Result<()> {
12931295
None => std::future::pending().await,
12941296
}
12951297
} => {
1298+
last_block_event_time = std::time::Instant::now();
1299+
12961300
// Check if this block triggers weight submission (every WEIGHT_SET_BLOCK_INTERVAL blocks)
12971301
let is_weight_block = if let BlockSyncEvent::NewBlock { block_number, .. } = &event {
12981302
*block_number > 0
@@ -1303,6 +1307,7 @@ async fn main() -> Result<()> {
13031307

13041308
// Force metagraph refresh before weight submission
13051309
if is_weight_block || matches!(event, BlockSyncEvent::CommitWindowOpen { .. }) {
1310+
let mut refresh_ok = false;
13061311
if let Some(bittensor_client) = bittensor_client_for_metagraph.as_ref() {
13071312
match tokio::time::timeout(
13081313
Duration::from_secs(15),
@@ -1312,7 +1317,6 @@ async fn main() -> Result<()> {
13121317
info!("Pre-weight metagraph refresh: {} neurons", mg.n);
13131318
let our_hk = keypair.hotkey();
13141319
update_validator_set_from_metagraph(&mg, &validator_set, &chain_state, &valid_voters, &state_root_consensus, &state_manager, Some(&our_hk));
1315-
// Update shared UID map for real-time weight RPC
13161320
{
13171321
let mut uid_map = shared_uid_map.write();
13181322
uid_map.clear();
@@ -1323,12 +1327,59 @@ async fn main() -> Result<()> {
13231327
if let Some(sc) = subtensor_client.as_mut() {
13241328
sc.set_metagraph(mg);
13251329
}
1330+
refresh_ok = true;
13261331
}
13271332
Ok(Err(e)) => {
1328-
warn!("Pre-weight metagraph refresh failed: {}. Using cached.", e);
1333+
let err_str = format!("{}", e);
1334+
if err_str.contains("background task") || err_str.contains("connection closed") {
1335+
warn!("Bittensor client dead ({}), recreating...", e);
1336+
} else {
1337+
warn!("Pre-weight metagraph refresh failed: {}. Using cached.", e);
1338+
refresh_ok = true; // non-fatal, keep going with cached
1339+
}
13291340
}
13301341
Err(_) => {
13311342
warn!("Pre-weight metagraph refresh timed out (15s). Using cached.");
1343+
refresh_ok = true;
1344+
}
1345+
}
1346+
}
1347+
// Recreate bittensor client if dead or missing
1348+
if !refresh_ok {
1349+
warn!("Recreating Bittensor client for metagraph refresh");
1350+
match BittensorClient::new(&subtensor_endpoint_for_reconnect).await {
1351+
Ok(new_client) => {
1352+
let new_client = Arc::new(new_client);
1353+
match tokio::time::timeout(
1354+
Duration::from_secs(15),
1355+
sync_metagraph(&new_client, netuid),
1356+
).await {
1357+
Ok(Ok(mg)) => {
1358+
warn!("Metagraph refresh OK after client recreation: {} neurons", mg.n);
1359+
let our_hk = keypair.hotkey();
1360+
update_validator_set_from_metagraph(&mg, &validator_set, &chain_state, &valid_voters, &state_root_consensus, &state_manager, Some(&our_hk));
1361+
{
1362+
let mut uid_map = shared_uid_map.write();
1363+
uid_map.clear();
1364+
for (uid, neuron) in &mg.neurons {
1365+
uid_map.insert(neuron.hotkey.to_string(), *uid as u16);
1366+
}
1367+
}
1368+
if let Some(sc) = subtensor_client.as_mut() {
1369+
sc.set_metagraph(mg);
1370+
}
1371+
bittensor_client_for_metagraph = Some(new_client);
1372+
}
1373+
Ok(Err(e)) => {
1374+
error!("Metagraph refresh failed even after client recreation: {}", e);
1375+
}
1376+
Err(_) => {
1377+
error!("Metagraph refresh timed out even after client recreation");
1378+
}
1379+
}
1380+
}
1381+
Err(e) => {
1382+
error!("Failed to recreate Bittensor client: {}", e);
13321383
}
13331384
}
13341385
}
@@ -1356,6 +1407,7 @@ async fn main() -> Result<()> {
13561407
&mut last_weight_submission_epoch,
13571408
&subtensor_endpoint_for_reconnect,
13581409
&subtensor_state_path_for_reconnect,
1410+
&mut bittensor_client_for_metagraph,
13591411
).await;
13601412

13611413
// On first block after startup, compute weights immediately so
@@ -1403,6 +1455,7 @@ async fn main() -> Result<()> {
14031455
&mut last_weight_submission_epoch,
14041456
&subtensor_endpoint_for_reconnect,
14051457
&subtensor_state_path_for_reconnect,
1458+
&mut bittensor_client_for_metagraph,
14061459
).await;
14071460
}
14081461
}
@@ -2178,6 +2231,36 @@ async fn main() -> Result<()> {
21782231
}
21792232
}
21802233

2234+
// Bittensor connection health-check
2235+
_ = bittensor_health_interval.tick() => {
2236+
if !args.no_bittensor && block_rx.is_some() {
2237+
let secs_since_last = last_block_event_time.elapsed().as_secs();
2238+
if secs_since_last > 300 {
2239+
error!(
2240+
seconds_since_last_block = secs_since_last,
2241+
"No Bittensor block events received in {}s - connection likely dead, reconnecting Subtensor",
2242+
secs_since_last
2243+
);
2244+
// Reconnect subtensor for weight submission
2245+
try_reconnect_subtensor(
2246+
&mut subtensor,
2247+
&subtensor_endpoint_for_reconnect,
2248+
&subtensor_state_path_for_reconnect,
2249+
).await;
2250+
// Recreate bittensor client for metagraph
2251+
match BittensorClient::new(&subtensor_endpoint_for_reconnect).await {
2252+
Ok(new_client) => {
2253+
bittensor_client_for_metagraph = Some(Arc::new(new_client));
2254+
warn!("Bittensor client recreated after health-check failure");
2255+
}
2256+
Err(e) => {
2257+
error!("Failed to recreate Bittensor client during health-check: {}", e);
2258+
}
2259+
}
2260+
}
2261+
}
2262+
}
2263+
21812264
// Ctrl+C or SIGTERM
21822265
_ = &mut shutdown_signal => {
21832266
info!("Received shutdown signal, persisting state...");
@@ -4823,6 +4906,7 @@ async fn handle_block_event(
48234906
last_weight_submission_epoch: &mut u64,
48244907
subtensor_endpoint: &str,
48254908
subtensor_state_path: &Option<std::path::PathBuf>,
4909+
bittensor_client_for_metagraph: &mut Option<Arc<BittensorClient>>,
48264910
) {
48274911
match event {
48284912
BlockSyncEvent::NewBlock { block_number, .. } => {
@@ -5330,6 +5414,22 @@ async fn handle_block_event(
53305414
uids.clone(),
53315415
weights.clone(),
53325416
));
5417+
} else if err_str.contains("HotKeyNotRegisteredInSubNet") {
5418+
error!(
5419+
epoch = epoch,
5420+
mechanism_id = mechanism_id,
5421+
error = %err_str,
5422+
"Hotkey not registered on subnet - check registration status"
5423+
);
5424+
failed_mechanisms.push(*mechanism_id);
5425+
} else if err_str.contains("CommittingWeightsTooFast") {
5426+
warn!(
5427+
epoch = epoch,
5428+
mechanism_id = mechanism_id,
5429+
error = %err_str,
5430+
"Committing weights too fast (rate limited by chain), will retry next window"
5431+
);
5432+
failed_mechanisms.push(*mechanism_id);
53335433
} else {
53345434
error!(
53355435
epoch = epoch,
@@ -5483,8 +5583,20 @@ async fn handle_block_event(
54835583
warn!("Bittensor disconnected: {}", reason);
54845584
}
54855585
BlockSyncEvent::Reconnected => {
5486-
info!("Bittensor block sync reconnected - also reconnecting Subtensor weight submission client");
5586+
warn!("Bittensor block sync reconnected - reconnecting Subtensor and metagraph client");
54875587
try_reconnect_subtensor(subtensor, subtensor_endpoint, subtensor_state_path).await;
5588+
match BittensorClient::new(subtensor_endpoint).await {
5589+
Ok(new_client) => {
5590+
*bittensor_client_for_metagraph = Some(Arc::new(new_client));
5591+
warn!("Bittensor metagraph client recreated after reconnect");
5592+
}
5593+
Err(e) => {
5594+
error!(
5595+
"Failed to recreate Bittensor metagraph client after reconnect: {}",
5596+
e
5597+
);
5598+
}
5599+
}
54885600
}
54895601
}
54905602
}

0 commit comments

Comments
 (0)