@@ -724,6 +724,85 @@ void jl_unlock_stackwalk(int lockret)
     jl_unlock_profile_mach(1, lockret);
 }
 
+// assumes holding `jl_lock_profile_mach`
+void jl_profile_thread_mach(int tid)
+{
+    // if there is no space left, return early
+    if (jl_profile_is_buffer_full()) {
+        jl_profile_stop_timer();
+        return;
+    }
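+    // The atfork hooks below appear to be used to briefly take and release dyld's locks so the
+    // target thread is not suspended while holding them (see the dlsym-lock comments), which
+    // could otherwise deadlock later symbol lookups during unwinding.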
+    if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
+        _dyld_dlopen_atfork_prepare();
+    if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
+        _dyld_atfork_prepare(); // briefly acquire the dlsym lock
+    host_thread_state_t state;
+    int valid_thread = jl_thread_suspend_and_get_state2(tid, &state);
+    unw_context_t *uc = (unw_context_t*)&state;
+    if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
+        _dyld_atfork_parent(); // quickly release the dlsym lock
+    if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
+        _dyld_dlopen_atfork_parent();
+    if (!valid_thread)
+        return;
+    if (profile_running) {
+#ifdef LLVMLIBUNWIND
+        /*
+         * Unfortunately compact unwind info is incorrectly generated for quite a number of
+         * libraries by quite a large number of compilers. We can fall back to DWARF unwind info
+         * in some cases, but in quite a number of cases (especially libraries not compiled in debug
+         * mode, only the compact unwind info may be available). Even more unfortunately, there is no
+         * way to detect such bogus compact unwind info (other than noticing the resulting segfault).
+         * What we do here is ugly, but necessary until the compact unwind info situation improves.
+         * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
+         * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
+         * entry will always be correct, and the number of cases in which this is an issue is rather small.
+         * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
+         * and during stack unwinding we only ever read memory, but never write it.
+         */
+
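+        // forceDwarf appears to act as a small protocol with the segfault handler: 0 means try
+        // the compact unwind info, 1 means a fault was caught and we should retry with DWARF,
+        // -1 means give up on this sample, and -2 (restored below) means not currently unwinding.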
+        forceDwarf = 0;
+        unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
+
+        if (forceDwarf == 0) {
+            // Save the backtrace
+            profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+        }
+        else if (forceDwarf == 1) {
+            profile_bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+        }
+        else if (forceDwarf == -1) {
+            jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
+        }
+
+        forceDwarf = -2;
+#else
+        profile_bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)profile_bt_data_prof + profile_bt_size_cur, profile_bt_size_max - profile_bt_size_cur - 1, uc, NULL);
+#endif
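+        // Each sample is followed by a fixed block of metadata (thread id, task, cycle clock,
+        // sleep state) and terminated by two zeros, matching the META_OFFSET_* layout used
+        // elsewhere for this buffer.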
+        jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[tid];
+
+        // store threadid but add 1 as 0 is preserved to indicate end of block
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = ptls->tid + 1;
+
+        // store task id (never null)
+        profile_bt_data_prof[profile_bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
+
+        // store cpu cycle clock
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = cycleclock();
+
+        // store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
+
+        // Mark the end of this block with two 0's
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+        profile_bt_data_prof[profile_bt_size_cur++].uintptr = 0;
+    }
+    // We're done! Resume the thread.
+    jl_thread_resume(tid);
+}
+
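+// Forward declaration for the all-tasks sampling entry point; its definition is not part of
+// this hunk (presumably it lives with the shared POSIX profiling code, given the _unix suffix).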
+void jl_profile_task_unix(void);
+
 void *mach_profile_listener(void *arg)
 {
     (void)arg;
@@ -741,88 +820,21 @@ void *mach_profile_listener(void *arg)
         // sample each thread, round-robin style in reverse order
         // (so that thread zero gets notified last)
         int keymgr_locked = jl_lock_profile_mach(0);
-
         int nthreads = jl_atomic_load_acquire(&jl_n_threads);
-        int *randperm = profile_get_randperm(nthreads);
-        for (int idx = nthreads; idx-- > 0; ) {
-            // Stop the threads in the random or reverse round-robin order.
-            int i = randperm[idx];
-            // if there is no space left, break early
-            if (jl_profile_is_buffer_full()) {
-                jl_profile_stop_timer();
-                break;
-            }
-
-            if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
-                _dyld_dlopen_atfork_prepare();
-            if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
-                _dyld_atfork_prepare(); // briefly acquire the dlsym lock
-            host_thread_state_t state;
-            int valid_thread = jl_thread_suspend_and_get_state2(i, &state);
-            unw_context_t *uc = (unw_context_t*)&state;
-            if (_dyld_atfork_prepare != NULL && _dyld_atfork_parent != NULL)
-                _dyld_atfork_parent(); // quickly release the dlsym lock
-            if (_dyld_dlopen_atfork_prepare != NULL && _dyld_dlopen_atfork_parent != NULL)
-                _dyld_dlopen_atfork_parent();
-            if (!valid_thread)
-                continue;
-            if (running) {
-#ifdef LLVMLIBUNWIND
-                /*
-                 * Unfortunately compact unwind info is incorrectly generated for quite a number of
-                 * libraries by quite a large number of compilers. We can fall back to DWARF unwind info
-                 * in some cases, but in quite a number of cases (especially libraries not compiled in debug
-                 * mode, only the compact unwind info may be available). Even more unfortunately, there is no
-                 * way to detect such bogus compact unwind info (other than noticing the resulting segfault).
-                 * What we do here is ugly, but necessary until the compact unwind info situation improves.
-                 * We try to use the compact unwind info and if that results in a segfault, we retry with DWARF info.
-                 * Note that in a small number of cases this may result in bogus stack traces, but at least the topmost
-                 * entry will always be correct, and the number of cases in which this is an issue is rather small.
-                 * Other than that, this implementation is not incorrect as the other thread is paused while we are profiling
-                 * and during stack unwinding we only ever read memory, but never write it.
-                 */
-
-                forceDwarf = 0;
-                unw_getcontext(&profiler_uc); // will resume from this point if the next lines segfault at any point
-
-                if (forceDwarf == 0) {
-                    // Save the backtrace
-                    bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-                }
-                else if (forceDwarf == 1) {
-                    bt_size_cur += rec_backtrace_ctx_dwarf((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-                }
-                else if (forceDwarf == -1) {
-                    jl_safe_printf("WARNING: profiler attempt to access an invalid memory location\n");
-                }
-
-                forceDwarf = -2;
-#else
-                bt_size_cur += rec_backtrace_ctx((jl_bt_element_t*)bt_data_prof + bt_size_cur, bt_size_max - bt_size_cur - 1, uc, NULL);
-#endif
-                jl_ptls_t ptls = jl_atomic_load_relaxed(&jl_all_tls_states)[i];
-
-                // META_OFFSET_THREADID store threadid but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = ptls->tid + 1;
-
-                // META_OFFSET_TASKID store task id (never null)
-                bt_data_prof[bt_size_cur++].jlvalue = (jl_value_t*)jl_atomic_load_relaxed(&ptls->current_task);
-
-                // META_OFFSET_CPUCYCLECLOCK store cpu cycle clock
-                bt_data_prof[bt_size_cur++].uintptr = cycleclock();
-
-                // META_OFFSET_SLEEPSTATE store whether thread is sleeping but add 1 as 0 is preserved to indicate end of block
-                bt_data_prof[bt_size_cur++].uintptr = jl_atomic_load_relaxed(&ptls->sleep_check_state) + 1;
-
-                // Mark the end of this block with two 0's
-                bt_data_prof[bt_size_cur++].uintptr = 0;
-                bt_data_prof[bt_size_cur++].uintptr = 0;
+        if (profile_all_tasks) {
+            // Don't take the stackwalk lock here since it's already taken in `jl_rec_backtrace`
+            jl_profile_task_unix();
+        }
+        else {
+            int *randperm = profile_get_randperm(nthreads);
+            for (int idx = nthreads; idx-- > 0; ) {
+                // Stop the threads in random order.
+                int i = randperm[idx];
+                jl_profile_thread_mach(i);
             }
-            // We're done! Resume the thread.
-            jl_thread_resume(i);
         }
         jl_unlock_profile_mach(0, keymgr_locked);
-        if (running) {
+        if (profile_running) {
             jl_check_profile_autostop();
             // Reset the alarm
             kern_return_t ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
@@ -831,7 +843,8 @@ void *mach_profile_listener(void *arg)
     }
 }
 
-JL_DLLEXPORT int jl_profile_start_timer(void)
+
+JL_DLLEXPORT int jl_profile_start_timer(uint8_t all_tasks)
 {
     kern_return_t ret;
     if (!profile_started) {
@@ -860,7 +873,8 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
     timerprof.tv_sec = nsecprof/GIGA;
     timerprof.tv_nsec = nsecprof%GIGA;
 
-    running = 1;
+    profile_running = 1;
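+    // record which sampling mode was requested; the listener consults this on every alarm tick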
+    profile_all_tasks = all_tasks;
     // ensure the alarm is running
     ret = clock_alarm(clk, TIME_RELATIVE, timerprof, profile_port);
     HANDLE_MACH_ERROR("clock_alarm", ret);
@@ -870,5 +884,6 @@ JL_DLLEXPORT int jl_profile_start_timer(void)
 
 JL_DLLEXPORT void jl_profile_stop_timer(void)
 {
-    running = 0;
+    profile_running = 0;
+    profile_all_tasks = 0;
 }