@@ -942,9 +942,9 @@ volatile int heartbeat_enabled;
942942uv_sem_t heartbeat_on_sem , // jl_heartbeat_enable -> thread
943943 heartbeat_off_sem ; // thread -> jl_heartbeat_enable
944944int heartbeat_interval_s ,
945- n_loss_reports ,
946- reset_reporting_s ;
947- int last_report_s , report_interval_s , n_reported ;
945+ tasks_after_n ,
946+ reset_tasks_after_n ;
947+ int tasks_showed , n_hbs_missed , n_hbs_recvd ;
948948_Atomic(int ) heartbeats ;
949949
950950JL_DLLEXPORT void jl_print_task_backtraces (int show_done ) JL_NOTSAFEPOINT ;
@@ -963,21 +963,19 @@ void jl_init_heartbeat(void)
963963
964964// enable/disable heartbeats
965965// heartbeat_s: interval within which jl_heartbeat() must be called
966- // n_reports: for one heartbeat loss interval, how many times to report
967- // reset_reporting_after_s: how long to wait after a heartbeat loss
968- // interval and a return to steady heartbeats, before resetting
969- // reporting behavior
966+ // show_tasks_after_n: number of heartbeats missed before printing task backtraces
967+ // reset_after_n: number of consecutive intervals with a heartbeat before task backtrace printing is re-armed
970968//
971969// When disabling heartbeats, the heartbeat thread must wake up,
972970// find out that heartbeats are now disabled, and reset. For now, we
973971// handle this by preventing re-enabling of heartbeats until this
974972// completes.
975- JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int n_reports ,
976- int reset_reporting_after_s )
973+ JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int show_tasks_after_n ,
974+ int reset_after_n )
977975{
978976 if (heartbeat_s <= 0 ) {
979977 heartbeat_enabled = 0 ;
980- heartbeat_interval_s = n_loss_reports = reset_reporting_s = 0 ;
978+ heartbeat_interval_s = tasks_after_n = reset_tasks_after_n = 0 ;
981979 }
982980 else {
983981 // must disable before enabling
@@ -991,10 +989,11 @@ JL_DLLEXPORT int jl_heartbeat_enable(int heartbeat_s, int n_reports,
991989
992990 jl_atomic_store_relaxed (& heartbeats , 0 );
993991 heartbeat_interval_s = heartbeat_s ;
994- n_loss_reports = n_reports ;
995- reset_reporting_s = reset_reporting_after_s ;
996- last_report_s = 0 ;
997- report_interval_s = heartbeat_interval_s ;
992+ tasks_after_n = show_tasks_after_n ;
993+ reset_tasks_after_n = reset_after_n ;
994+ tasks_showed = 0 ;
995+ n_hbs_missed = 0 ;
996+ n_hbs_recvd = 0 ;
998997 heartbeat_enabled = 1 ;
999998 uv_sem_post (& heartbeat_on_sem ); // wake the heartbeat thread
1000999 }
@@ -1030,44 +1029,42 @@ void sleep_for(int secs, int nsecs)
10301029uint8_t check_heartbeats (uint8_t gc_state )
10311030{
10321031 int hb = jl_atomic_exchange (& heartbeats , 0 );
1033- uint64_t curr_s = jl_hrtime () / 1e9 ;
10341032
10351033 if (hb <= 0 ) {
1036- // we didn't get a heartbeat in the last interval; should we report?
1037- if ( n_reported < n_loss_reports &&
1038- curr_s - last_report_s >= report_interval_s ) {
1039- jl_task_t * ct = jl_current_task ;
1040- jl_ptls_t ptls = ct -> ptls ;
1041-
1042- // exit GC-safe region to report then re-enter
1043- jl_gc_safe_leave ( ptls , gc_state );
1044- jl_safe_printf ( "==== heartbeat loss ====\n" );
1045- jl_print_task_backtraces ( 0 );
1046- gc_state = jl_gc_safe_enter ( ptls );
1047-
1048- // we've reported
1049- n_reported ++ ;
1050-
1051- // record the reporting time _after_ the report
1052- last_report_s = jl_hrtime () / 1e9 ;
1053-
1054- // double the reporting interval up to a maximum
1055- if ( report_interval_s < 60 * heartbeat_interval_s ) {
1056- report_interval_s *= 2 ;
1034+ // we didn't get a heartbeat
1035+ n_hbs_recvd = 0 ;
1036+ n_hbs_missed ++ ;
1037+
1038+ // if we've printed task backtraces already, do nothing
1039+ if (! tasks_showed ) {
1040+ // otherwise, at least show this message
1041+ jl_safe_printf ( "==== heartbeat loss (%ds) ====\n" ,
1042+ n_hbs_missed * heartbeat_interval_s );
1043+ // if we've missed enough heartbeats, print task backtraces
1044+ if ( n_hbs_missed >= tasks_after_n ) {
1045+ jl_task_t * ct = jl_current_task ;
1046+ jl_ptls_t ptls = ct -> ptls ;
1047+
1048+ // exit GC-safe region to report then re-enter
1049+ jl_gc_safe_leave ( ptls , gc_state );
1050+ jl_print_task_backtraces ( 0 ) ;
1051+ gc_state = jl_gc_safe_enter ( ptls );
1052+
1053+ // we printed task backtraces
1054+ tasks_showed = 1 ;
10571055 }
10581056 }
1059- // no heartbeats, don't change reporting state
1060- return gc_state ;
10611057 }
10621058 else {
1063- // we got a heartbeat; reset the report count
1064- n_reported = 0 ;
1065- }
1066-
1067- // reset the reporting interval only once we're steadily getting
1068- // heartbeats for the requested reset interval
1069- if (curr_s - reset_reporting_s > last_report_s ) {
1070- report_interval_s = heartbeat_interval_s ;
1059+ // got a heartbeat
1060+ n_hbs_recvd ++ ;
1061+ // if we'd printed task backtraces, check for reset
1062+ if (tasks_showed && n_hbs_recvd >= reset_tasks_after_n ) {
1063+ tasks_showed = 0 ;
1064+ jl_safe_printf ("==== heartbeats recovered (lost for %ds) ====\n" ,
1065+ n_hbs_missed * heartbeat_interval_s );
1066+ }
1067+ n_hbs_missed = 0 ;
10711068 }
10721069
10731070 return gc_state ;
@@ -1076,7 +1073,7 @@ uint8_t check_heartbeats(uint8_t gc_state)
10761073// heartbeat thread function
10771074void jl_heartbeat_threadfun (void * arg )
10781075{
1079- int s , ns = 1e9 - 1 , rs ;
1076+ int s = 59 , ns = 1e9 - 1 , rs ;
10801077 uint64_t t0 , tchb ;
10811078
10821079 // We need a TLS because backtraces are accumulated into ptls->bt_size
@@ -1134,8 +1131,8 @@ void jl_init_heartbeat(void)
11341131{
11351132}
11361133
1137- JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int n_reports ,
1138- int reset_reporting_after_s )
1134+ JL_DLLEXPORT int jl_heartbeat_enable (int heartbeat_s , int show_tasks_after_n ,
1135+ int reset_after_n )
11391136{
11401137 return -1 ;
11411138}
0 commit comments