Skip to content

Commit 6dd99b1

Browse files
committed
GC: report times spent in each step of sweeping (JuliaLang#56993)
We've been using this patch for a while (e.g., #176), and it has proven valuable for identifying the most expensive part of sweeping in a customer workload. Specifically, it highlighted that the madvise stage was the bottleneck. These metrics allowed us to effectively determine whether concurrent page sweeping would be beneficial.
1 parent 0d6bd8c commit 6dd99b1

File tree

3 files changed

+60
-43
lines changed

3 files changed

+60
-43
lines changed

base/timing.jl

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,9 @@ struct GC_Num
2424
mark_time ::Int64
2525
stack_pool_sweep_time ::Int64
2626
total_sweep_time ::Int64
27+
total_sweep_page_walk_time ::Int64
28+
total_sweep_madvise_time ::Int64
29+
total_sweep_free_mallocd_memory_time ::Int64
2730
total_mark_time ::Int64
2831
total_stack_pool_sweep_time::Int64
2932
last_full_sweep ::Int64

src/gc-interface.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ typedef struct {
4646
uint64_t mark_time;
4747
uint64_t stack_pool_sweep_time;
4848
uint64_t total_sweep_time;
49+
uint64_t total_sweep_page_walk_time;
50+
uint64_t total_sweep_madvise_time;
51+
uint64_t total_sweep_free_mallocd_memory_time;
4952
uint64_t total_mark_time;
5053
uint64_t total_stack_pool_sweep_time;
5154
uint64_t last_full_sweep;

src/gc-stock.c

Lines changed: 54 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -984,9 +984,12 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
984984
// sweep over all memory that is being used and not in a pool
985985
static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
986986
{
987+
uint64_t t_free_mallocd_memory_start = jl_hrtime();
987988
gc_sweep_foreign_objs();
988989
sweep_malloced_memory();
989990
sweep_big(ptls);
991+
uint64_t t_free_mallocd_memory_end = jl_hrtime();
992+
gc_num.total_sweep_free_mallocd_memory_time += t_free_mallocd_memory_end - t_free_mallocd_memory_start;
990993
jl_engine_sweep(gc_all_tls_states);
991994
}
992995

@@ -1300,66 +1303,74 @@ static void gc_sweep_pool(void)
13001303
}
13011304
}
13021305

1303-
// the actual sweeping
1304-
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
1305-
jl_ptls_t ptls = jl_current_task->ptls;
1306-
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
1307-
gc_sweep_pool_parallel(ptls);
1308-
gc_sweep_wait_for_all_pages();
1309-
1310-
// reset half-pages pointers
1311-
for (int t_i = 0; t_i < n_threads; t_i++) {
1312-
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1313-
if (ptls2 != NULL) {
1314-
ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
1315-
for (int i = 0; i < JL_GC_N_POOLS; i++) {
1316-
jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i];
1317-
p->newpages = NULL;
1306+
uint64_t t_page_walk_start = jl_hrtime();
1307+
{
1308+
// the actual sweeping
1309+
jl_gc_padded_page_stack_t *new_gc_allocd_scratch = (jl_gc_padded_page_stack_t *) calloc_s(n_threads * sizeof(jl_gc_padded_page_stack_t));
1310+
jl_ptls_t ptls = jl_current_task->ptls;
1311+
gc_sweep_wake_all_pages(ptls, new_gc_allocd_scratch);
1312+
gc_sweep_pool_parallel(ptls);
1313+
gc_sweep_wait_for_all_pages();
1314+
1315+
// reset half-pages pointers
1316+
for (int t_i = 0; t_i < n_threads; t_i++) {
1317+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1318+
if (ptls2 != NULL) {
1319+
ptls2->gc_tls.page_metadata_allocd = new_gc_allocd_scratch[t_i].stack;
1320+
for (int i = 0; i < JL_GC_N_POOLS; i++) {
1321+
jl_gc_pool_t *p = &ptls2->gc_tls.heap.norm_pools[i];
1322+
p->newpages = NULL;
1323+
}
13181324
}
13191325
}
1320-
}
13211326

1322-
// merge free lists
1323-
for (int t_i = 0; t_i < n_threads; t_i++) {
1324-
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1325-
if (ptls2 == NULL) {
1326-
continue;
1327-
}
1328-
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom);
1329-
while (pg != NULL) {
1330-
jl_gc_pagemeta_t *pg2 = pg->next;
1331-
if (pg->fl_begin_offset != UINT16_MAX) {
1332-
char *cur_pg = pg->data;
1333-
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
1334-
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
1335-
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
1336-
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
1327+
// merge free lists
1328+
for (int t_i = 0; t_i < n_threads; t_i++) {
1329+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1330+
if (ptls2 == NULL) {
1331+
continue;
1332+
}
1333+
jl_gc_pagemeta_t *pg = jl_atomic_load_relaxed(&ptls2->gc_tls.page_metadata_allocd.bottom);
1334+
while (pg != NULL) {
1335+
jl_gc_pagemeta_t *pg2 = pg->next;
1336+
if (pg->fl_begin_offset != UINT16_MAX) {
1337+
char *cur_pg = pg->data;
1338+
jl_taggedvalue_t *fl_beg = (jl_taggedvalue_t*)(cur_pg + pg->fl_begin_offset);
1339+
jl_taggedvalue_t *fl_end = (jl_taggedvalue_t*)(cur_pg + pg->fl_end_offset);
1340+
*pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = fl_beg;
1341+
pfl[t_i * JL_GC_N_POOLS + pg->pool_n] = &fl_end->next;
1342+
}
1343+
pg = pg2;
13371344
}
1338-
pg = pg2;
13391345
}
1340-
}
13411346

1342-
// null out terminal pointers of free lists
1343-
for (int t_i = 0; t_i < n_threads; t_i++) {
1344-
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1345-
if (ptls2 != NULL) {
1346-
for (int i = 0; i < JL_GC_N_POOLS; i++) {
1347-
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
1347+
// null out terminal pointers of free lists
1348+
for (int t_i = 0; t_i < n_threads; t_i++) {
1349+
jl_ptls_t ptls2 = gc_all_tls_states[t_i];
1350+
if (ptls2 != NULL) {
1351+
for (int i = 0; i < JL_GC_N_POOLS; i++) {
1352+
*pfl[t_i * JL_GC_N_POOLS + i] = NULL;
1353+
}
13481354
}
13491355
}
1350-
}
13511356

1352-
// cleanup
1353-
free(pfl);
1354-
free(new_gc_allocd_scratch);
1357+
// cleanup
1358+
free(pfl);
1359+
free(new_gc_allocd_scratch);
1360+
}
1361+
uint64_t t_page_walk_end = jl_hrtime();
1362+
gc_num.total_sweep_page_walk_time += t_page_walk_end - t_page_walk_start;
13551363

13561364
#ifdef _P64 // only enable concurrent sweeping on 64bit
13571365
// wake thread up to sweep concurrently
13581366
if (jl_n_sweepthreads > 0) {
13591367
uv_sem_post(&gc_sweep_assists_needed);
13601368
}
13611369
else {
1370+
uint64_t t_madvise_start = jl_hrtime();
13621371
gc_free_pages();
1372+
uint64_t t_madvise_end = jl_hrtime();
1373+
gc_num.total_sweep_madvise_time += t_madvise_end - t_madvise_start;
13631374
}
13641375
#else
13651376
gc_free_pages();

0 commit comments

Comments
 (0)