@@ -1255,6 +1255,8 @@ async fn main() -> Result<()> {
12551255 let mut storage_stats_interval = tokio:: time:: interval ( Duration :: from_secs ( 300 ) ) ;
12561256 let mut storage_flush_interval = tokio:: time:: interval ( Duration :: from_secs ( 5 ) ) ;
12571257 let mut background_tick_interval = tokio:: time:: interval ( Duration :: from_secs ( 12 ) ) ;
1258+ let mut bittensor_health_interval = tokio:: time:: interval ( Duration :: from_secs ( 300 ) ) ;
1259+ let mut last_block_event_time = std:: time:: Instant :: now ( ) ;
12581260 // Track last synced block per challenge for delta sync
12591261 let challenge_last_sync: Arc <
12601262 RwLock < std:: collections:: HashMap < platform_core:: ChallengeId , u64 > > ,
@@ -1293,6 +1295,8 @@ async fn main() -> Result<()> {
12931295 None => std:: future:: pending( ) . await ,
12941296 }
12951297 } => {
1298+ last_block_event_time = std:: time:: Instant :: now( ) ;
1299+
12961300 // Check if this block triggers weight submission (every WEIGHT_SET_BLOCK_INTERVAL blocks)
12971301 let is_weight_block = if let BlockSyncEvent :: NewBlock { block_number, .. } = & event {
12981302 * block_number > 0
@@ -1303,6 +1307,7 @@ async fn main() -> Result<()> {
13031307
13041308 // Force metagraph refresh before weight submission
13051309 if is_weight_block || matches!( event, BlockSyncEvent :: CommitWindowOpen { .. } ) {
1310+ let mut refresh_ok = false ;
13061311 if let Some ( bittensor_client) = bittensor_client_for_metagraph. as_ref( ) {
13071312 match tokio:: time:: timeout(
13081313 Duration :: from_secs( 15 ) ,
@@ -1312,7 +1317,6 @@ async fn main() -> Result<()> {
13121317 info!( "Pre-weight metagraph refresh: {} neurons" , mg. n) ;
13131318 let our_hk = keypair. hotkey( ) ;
13141319 update_validator_set_from_metagraph( & mg, & validator_set, & chain_state, & valid_voters, & state_root_consensus, & state_manager, Some ( & our_hk) ) ;
1315- // Update shared UID map for real-time weight RPC
13161320 {
13171321 let mut uid_map = shared_uid_map. write( ) ;
13181322 uid_map. clear( ) ;
@@ -1323,12 +1327,59 @@ async fn main() -> Result<()> {
13231327 if let Some ( sc) = subtensor_client. as_mut( ) {
13241328 sc. set_metagraph( mg) ;
13251329 }
1330+ refresh_ok = true ;
13261331 }
13271332 Ok ( Err ( e) ) => {
1328- warn!( "Pre-weight metagraph refresh failed: {}. Using cached." , e) ;
1333+ let err_str = format!( "{}" , e) ;
1334+ if err_str. contains( "background task" ) || err_str. contains( "connection closed" ) {
1335+ warn!( "Bittensor client dead ({}), recreating..." , e) ;
1336+ } else {
1337+ warn!( "Pre-weight metagraph refresh failed: {}. Using cached." , e) ;
1338+ refresh_ok = true ; // non-fatal, keep going with cached
1339+ }
13291340 }
13301341 Err ( _) => {
13311342 warn!( "Pre-weight metagraph refresh timed out (15s). Using cached." ) ;
1343+ refresh_ok = true ;
1344+ }
1345+ }
1346+ }
1347+ // Recreate bittensor client if dead or missing
1348+ if !refresh_ok {
1349+ warn!( "Recreating Bittensor client for metagraph refresh" ) ;
1350+ match BittensorClient :: new( & subtensor_endpoint_for_reconnect) . await {
1351+ Ok ( new_client) => {
1352+ let new_client = Arc :: new( new_client) ;
1353+ match tokio:: time:: timeout(
1354+ Duration :: from_secs( 15 ) ,
1355+ sync_metagraph( & new_client, netuid) ,
1356+ ) . await {
1357+ Ok ( Ok ( mg) ) => {
1358+ warn!( "Metagraph refresh OK after client recreation: {} neurons" , mg. n) ;
1359+ let our_hk = keypair. hotkey( ) ;
1360+ update_validator_set_from_metagraph( & mg, & validator_set, & chain_state, & valid_voters, & state_root_consensus, & state_manager, Some ( & our_hk) ) ;
1361+ {
1362+ let mut uid_map = shared_uid_map. write( ) ;
1363+ uid_map. clear( ) ;
1364+ for ( uid, neuron) in & mg. neurons {
1365+ uid_map. insert( neuron. hotkey. to_string( ) , * uid as u16 ) ;
1366+ }
1367+ }
1368+ if let Some ( sc) = subtensor_client. as_mut( ) {
1369+ sc. set_metagraph( mg) ;
1370+ }
1371+ bittensor_client_for_metagraph = Some ( new_client) ;
1372+ }
1373+ Ok ( Err ( e) ) => {
1374+ error!( "Metagraph refresh failed even after client recreation: {}" , e) ;
1375+ }
1376+ Err ( _) => {
1377+ error!( "Metagraph refresh timed out even after client recreation" ) ;
1378+ }
1379+ }
1380+ }
1381+ Err ( e) => {
1382+ error!( "Failed to recreate Bittensor client: {}" , e) ;
13321383 }
13331384 }
13341385 }
@@ -1356,6 +1407,7 @@ async fn main() -> Result<()> {
13561407 & mut last_weight_submission_epoch,
13571408 & subtensor_endpoint_for_reconnect,
13581409 & subtensor_state_path_for_reconnect,
1410+ & mut bittensor_client_for_metagraph,
13591411 ) . await ;
13601412
13611413 // On first block after startup, compute weights immediately so
@@ -1403,6 +1455,7 @@ async fn main() -> Result<()> {
14031455 & mut last_weight_submission_epoch,
14041456 & subtensor_endpoint_for_reconnect,
14051457 & subtensor_state_path_for_reconnect,
1458+ & mut bittensor_client_for_metagraph,
14061459 ) . await ;
14071460 }
14081461 }
@@ -2178,6 +2231,36 @@ async fn main() -> Result<()> {
21782231 }
21792232 }
21802233
2234+ // Bittensor connection health-check
2235+ _ = bittensor_health_interval. tick( ) => {
2236+ if !args. no_bittensor && block_rx. is_some( ) {
2237+ let secs_since_last = last_block_event_time. elapsed( ) . as_secs( ) ;
2238+ if secs_since_last > 300 {
2239+ error!(
2240+ seconds_since_last_block = secs_since_last,
2241+ "No Bittensor block events received in {}s - connection likely dead, reconnecting Subtensor" ,
2242+ secs_since_last
2243+ ) ;
2244+ // Reconnect subtensor for weight submission
2245+ try_reconnect_subtensor(
2246+ & mut subtensor,
2247+ & subtensor_endpoint_for_reconnect,
2248+ & subtensor_state_path_for_reconnect,
2249+ ) . await ;
2250+ // Recreate bittensor client for metagraph
2251+ match BittensorClient :: new( & subtensor_endpoint_for_reconnect) . await {
2252+ Ok ( new_client) => {
2253+ bittensor_client_for_metagraph = Some ( Arc :: new( new_client) ) ;
2254+ warn!( "Bittensor client recreated after health-check failure" ) ;
2255+ }
2256+ Err ( e) => {
2257+ error!( "Failed to recreate Bittensor client during health-check: {}" , e) ;
2258+ }
2259+ }
2260+ }
2261+ }
2262+ }
2263+
21812264 // Ctrl+C or SIGTERM
21822265 _ = & mut shutdown_signal => {
21832266 info!( "Received shutdown signal, persisting state..." ) ;
@@ -4823,6 +4906,7 @@ async fn handle_block_event(
48234906 last_weight_submission_epoch : & mut u64 ,
48244907 subtensor_endpoint : & str ,
48254908 subtensor_state_path : & Option < std:: path:: PathBuf > ,
4909+ bittensor_client_for_metagraph : & mut Option < Arc < BittensorClient > > ,
48264910) {
48274911 match event {
48284912 BlockSyncEvent :: NewBlock { block_number, .. } => {
@@ -5330,6 +5414,22 @@ async fn handle_block_event(
53305414 uids. clone ( ) ,
53315415 weights. clone ( ) ,
53325416 ) ) ;
5417+ } else if err_str. contains ( "HotKeyNotRegisteredInSubNet" ) {
5418+ error ! (
5419+ epoch = epoch,
5420+ mechanism_id = mechanism_id,
5421+ error = %err_str,
5422+ "Hotkey not registered on subnet - check registration status"
5423+ ) ;
5424+ failed_mechanisms. push ( * mechanism_id) ;
5425+ } else if err_str. contains ( "CommittingWeightsTooFast" ) {
5426+ warn ! (
5427+ epoch = epoch,
5428+ mechanism_id = mechanism_id,
5429+ error = %err_str,
5430+ "Committing weights too fast (rate limited by chain), will retry next window"
5431+ ) ;
5432+ failed_mechanisms. push ( * mechanism_id) ;
53335433 } else {
53345434 error ! (
53355435 epoch = epoch,
@@ -5483,8 +5583,20 @@ async fn handle_block_event(
54835583 warn ! ( "Bittensor disconnected: {}" , reason) ;
54845584 }
54855585 BlockSyncEvent :: Reconnected => {
5486- info ! ( "Bittensor block sync reconnected - also reconnecting Subtensor weight submission client" ) ;
5586+ warn ! ( "Bittensor block sync reconnected - reconnecting Subtensor and metagraph client" ) ;
54875587 try_reconnect_subtensor ( subtensor, subtensor_endpoint, subtensor_state_path) . await ;
5588+ match BittensorClient :: new ( subtensor_endpoint) . await {
5589+ Ok ( new_client) => {
5590+ * bittensor_client_for_metagraph = Some ( Arc :: new ( new_client) ) ;
5591+ warn ! ( "Bittensor metagraph client recreated after reconnect" ) ;
5592+ }
5593+ Err ( e) => {
5594+ error ! (
5595+ "Failed to recreate Bittensor metagraph client after reconnect: {}" ,
5596+ e
5597+ ) ;
5598+ }
5599+ }
54885600 }
54895601 }
54905602}
0 commit comments