
Commit 1f66e0d

d-netto authored and kpamnany committed
Simplify sweeping of big values (JuliaLang#54936)
Simplifies the layout of the doubly linked list of big objects to make it a bit more canonical: let's just store a pointer to the previous element, instead of storing a "pointer to the next element of the previous element". This should make the implementation a bit easier to understand without incurring any memory overhead.

I ran the serial and multithreaded benchmarks from GCBenchmarks and this seems fairly close to performance neutral on my machine. We also ran our internal benchmarks on it at RAI and it looks fine from a correctness and performance point of view.

---------

Co-authored-by: Kiran Pamnany <[email protected]>
1 parent 9490ee3 commit 1f66e0d
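To illustrate the layout this commit moves to, here is a minimal, self-contained sketch of a sentinel-headed doubly linked list that stores a plain `prev` pointer. The `bigval_t` below is deliberately stripped down (the real struct also carries the object header, size, and GC bits), and `big_object_link`/`big_object_unlink` only mirror what the patch's `gc_big_object_link`/`gc_big_object_unlink` helpers are expected to do; treat it as an illustration, not the actual implementation.

#include <assert.h>
#include <stddef.h>

// Simplified stand-in for Julia's bigval_t; only the list fields matter here.
// In the real code the sentinel's header is set to `gc_bigval_sentinel_tag`.
typedef struct bigval_t {
    struct bigval_t *next;
    struct bigval_t *prev; // plain pointer to the previous node (the simplification in this commit)
    size_t sz;
} bigval_t;

// Insert `v` right after `list` (typically the sentinel), mirroring gc_big_object_link(list, v).
static void big_object_link(bigval_t *list, bigval_t *v)
{
    v->prev = list;
    v->next = list->next;
    if (v->next != NULL)
        v->next->prev = v;
    list->next = v;
}

// Remove `v` from whatever list it is on, mirroring gc_big_object_unlink(v).
// Because every list starts with a sentinel, `v->prev` is always non-NULL.
static void big_object_unlink(bigval_t *v)
{
    assert(v->prev != NULL);
    v->prev->next = v->next;
    if (v->next != NULL)
        v->next->prev = v->prev;
}

int main(void)
{
    bigval_t sentinel = {0};        // list head; never freed, never swept
    bigval_t a = {0}, b = {0};
    big_object_link(&sentinel, &a); // list: sentinel -> a
    big_object_link(&sentinel, &b); // list: sentinel -> b -> a
    big_object_unlink(&a);          // list: sentinel -> b
    assert(sentinel.next == &b && b.next == NULL && b.prev == &sentinel);
    return 0;
}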

4 files changed: +105 −100 lines


src/gc-debug.c

Lines changed: 4 additions & 4 deletions
@@ -132,7 +132,7 @@ static void clear_mark(int bits)
     }
     bigval_t *v;
     for (int i = 0; i < gc_n_threads; i++) {
-        v = gc_all_tls_states[i]->gc_tls.heap.big_objects;
+        v = gc_all_tls_states[i]->gc_tls.heap.young_generation_of_bigvals;
         while (v != NULL) {
             void *gcv = &v->header;
             if (!gc_verifying)
@@ -142,7 +142,7 @@ static void clear_mark(int bits)
         }
     }
 
-    v = big_objects_marked;
+    v = oldest_generation_of_bigvals;
     while (v != NULL) {
         void *gcv = &v->header;
         if (!gc_verifying)
@@ -965,15 +965,15 @@ void gc_stats_big_obj(void)
     size_t nused=0, nbytes=0, nused_old=0, nbytes_old=0;
     for (int t_i = 0; t_i < gc_n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
-        bigval_t *v = ptls2->gc_tls.heap.big_objects;
+        bigval_t *v = ptls2->gc_tls.heap.young_generation_of_bigvals;
         while (v != NULL) {
             if (gc_marked(v->bits.gc)) {
                 nused++;
                 nbytes += v->sz & ~3;
             }
             v = v->next;
         }
-        v = big_objects_marked;
+        v = oldest_generation_of_bigvals;
         while (v != NULL) {
             if (gc_marked(v->bits.gc)) {
                 nused_old++;
src/gc-tls.h

Lines changed: 2 additions & 2 deletions
@@ -31,8 +31,8 @@ typedef struct {
     struct _mallocarray_t *mallocarrays;
     struct _mallocarray_t *mafreelist;
 
-    // variables for tracking big objects
-    struct _bigval_t *big_objects;
+    // variable for tracking young (i.e. not in `GC_OLD_MARKED`/last generation) large objects
+    struct _bigval_t *young_generation_of_bigvals;
 
     // lower bound of the number of pointers inside remembered values
     int remset_nptr;

src/gc.c

Lines changed: 76 additions & 82 deletions
@@ -34,6 +34,8 @@ uv_cond_t gc_threads_cond;
 uv_sem_t gc_sweep_assists_needed;
 // Mutex used to coordinate entry of GC threads in the mark loop
 uv_mutex_t gc_queue_observer_lock;
+// Tag for sentinel nodes in bigval list
+uintptr_t gc_bigval_sentinel_tag;
 
 // Linked list of callback functions
 
@@ -150,7 +152,6 @@ JL_DLLEXPORT void jl_gc_set_cb_notify_gc_pressure(jl_gc_cb_notify_gc_pressure_t
 // is going to realloc the buffer (of its own list) or accessing the
 // list of another thread
 static jl_mutex_t finalizers_lock;
-static uv_mutex_t gc_cache_lock;
 
 // mutex for gc-heap-snapshot.
 jl_mutex_t heapsnapshot_lock;
@@ -201,8 +202,8 @@ JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) JL_NOTSAFEPOINT
     return jl_buff_tag;
 }
 
-// List of marked big objects. Not per-thread. Accessed only by master thread.
-bigval_t *big_objects_marked = NULL;
+// List of big objects in oldest generation (`GC_OLD_MARKED`). Not per-thread. Accessed only by master thread.
+bigval_t *oldest_generation_of_bigvals = NULL;
 
 // -- Finalization --
 // `ptls->finalizers` and `finalizer_list_marked` might have tagged pointers.
@@ -759,60 +760,25 @@ static int64_t t_start = 0; // Time GC starts;
 static int64_t last_trim_maxrss = 0;
 #endif
 
-static void gc_sync_cache_nolock(jl_ptls_t ptls, jl_gc_mark_cache_t *gc_cache) JL_NOTSAFEPOINT
+static void gc_sync_cache(jl_ptls_t ptls, jl_gc_mark_cache_t *gc_cache) JL_NOTSAFEPOINT
 {
-    const int nbig = gc_cache->nbig_obj;
-    for (int i = 0; i < nbig; i++) {
-        void *ptr = gc_cache->big_obj[i];
-        bigval_t *hdr = (bigval_t*)gc_ptr_clear_tag(ptr, 1);
-        gc_big_object_unlink(hdr);
-        if (gc_ptr_tag(ptr, 1)) {
-            gc_big_object_link(hdr, &ptls->gc_tls.heap.big_objects);
-        }
-        else {
-            // Move hdr from `big_objects` list to `big_objects_marked list`
-            gc_big_object_link(hdr, &big_objects_marked);
-        }
-    }
-    gc_cache->nbig_obj = 0;
     perm_scanned_bytes += gc_cache->perm_scanned_bytes;
     scanned_bytes += gc_cache->scanned_bytes;
     gc_cache->perm_scanned_bytes = 0;
    gc_cache->scanned_bytes = 0;
 }
 
-static void gc_sync_cache(jl_ptls_t ptls) JL_NOTSAFEPOINT
-{
-    uv_mutex_lock(&gc_cache_lock);
-    gc_sync_cache_nolock(ptls, &ptls->gc_tls.gc_cache);
-    uv_mutex_unlock(&gc_cache_lock);
-}
-
 // No other threads can be running marking at the same time
-static void gc_sync_all_caches_nolock(jl_ptls_t ptls)
+static void gc_sync_all_caches(jl_ptls_t ptls)
 {
     assert(gc_n_threads);
     for (int t_i = 0; t_i < gc_n_threads; t_i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[t_i];
         if (ptls2 != NULL)
-            gc_sync_cache_nolock(ptls, &ptls2->gc_tls.gc_cache);
+            gc_sync_cache(ptls, &ptls2->gc_tls.gc_cache);
     }
 }
 
-STATIC_INLINE void gc_queue_big_marked(jl_ptls_t ptls, bigval_t *hdr,
-                                       int toyoung) JL_NOTSAFEPOINT
-{
-    const int nentry = sizeof(ptls->gc_tls.gc_cache.big_obj) / sizeof(void*);
-    size_t nobj = ptls->gc_tls.gc_cache.nbig_obj;
-    if (__unlikely(nobj >= nentry)) {
-        gc_sync_cache(ptls);
-        nobj = 0;
-    }
-    uintptr_t v = (uintptr_t)hdr;
-    ptls->gc_tls.gc_cache.big_obj[nobj] = (void*)(toyoung ? (v | 1) : v);
-    ptls->gc_tls.gc_cache.nbig_obj = nobj + 1;
-}
-
 // Atomically set the mark bit for object and return whether it was previously unmarked
 FORCE_INLINE int gc_try_setmark_tag(jl_taggedvalue_t *o, uint8_t mark_mode) JL_NOTSAFEPOINT
 {
@@ -849,16 +815,14 @@ STATIC_INLINE void gc_setmark_big(jl_ptls_t ptls, jl_taggedvalue_t *o,
     bigval_t *hdr = bigval_header(o);
     if (mark_mode == GC_OLD_MARKED) {
         ptls->gc_tls.gc_cache.perm_scanned_bytes += hdr->sz;
-        gc_queue_big_marked(ptls, hdr, 0);
     }
     else {
         ptls->gc_tls.gc_cache.scanned_bytes += hdr->sz;
-        // We can't easily tell if the object is old or being promoted
-        // from the gc bits but if the `age` is `0` then the object
-        // must be already on a young list.
         if (mark_reset_age) {
+            assert(jl_atomic_load(&gc_n_threads_marking) == 0); // `mark_reset_age` is only used during single-threaded marking
             // Reset the object as if it was just allocated
-            gc_queue_big_marked(ptls, hdr, 1);
+            gc_big_object_unlink(hdr);
+            gc_big_object_link(ptls->gc_tls.heap.young_generation_of_bigvals, hdr);
         }
     }
 }
@@ -1023,7 +987,7 @@ STATIC_INLINE jl_value_t *jl_gc_big_alloc_inner(jl_ptls_t ptls, size_t sz)
     memset(v, 0xee, allocsz);
 #endif
     v->sz = allocsz;
-    gc_big_object_link(v, &ptls->gc_tls.heap.big_objects);
+    gc_big_object_link(ptls->gc_tls.heap.young_generation_of_bigvals, v);
     return jl_valueof(&v->header);
 }
 
@@ -1043,60 +1007,85 @@ jl_value_t *jl_gc_big_alloc_noinline(jl_ptls_t ptls, size_t sz) {
     return jl_gc_big_alloc_inner(ptls, sz);
 }
 
-// Sweep list rooted at *pv, removing and freeing any unmarked objects.
-// Return pointer to last `next` field in the culled list.
-static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT
+FORCE_INLINE void sweep_unlink_and_free(bigval_t *v) JL_NOTSAFEPOINT
+{
+    gc_big_object_unlink(v);
+    gc_num.freed += v->sz;
+#ifdef MEMDEBUG
+    memset(v, 0xbb, v->sz);
+#endif
+    gc_invoke_callbacks(jl_gc_cb_notify_external_free_t, gc_cblist_notify_external_free, (v));
+    jl_free_aligned(v);
+}
+
+static bigval_t *sweep_list_of_young_bigvals(bigval_t *young) JL_NOTSAFEPOINT
 {
-    bigval_t *v = *pv;
+    bigval_t *last_node = young;
+    bigval_t *v = young->next; // skip the sentinel
+    bigval_t *old = oldest_generation_of_bigvals;
+    int sweep_full = current_sweep_full; // don't load the global in the hot loop
     while (v != NULL) {
         bigval_t *nxt = v->next;
         int bits = v->bits.gc;
         int old_bits = bits;
         if (gc_marked(bits)) {
-            pv = &v->next;
             if (sweep_full || bits == GC_MARKED) {
                 bits = GC_OLD;
+                last_node = v;
+            }
+            else { // `bits == GC_OLD_MARKED`
+                assert(bits == GC_OLD_MARKED);
+                // reached oldest generation, move from young list to old list
+                gc_big_object_unlink(v);
+                gc_big_object_link(old, v);
             }
             v->bits.gc = bits;
         }
         else {
-            // Remove v from list and free it
-            *pv = nxt;
-            if (nxt)
-                nxt->prev = pv;
-            gc_num.freed += v->sz;
-#ifdef MEMDEBUG
-            memset(v, 0xbb, v->sz);
-#endif
-            gc_invoke_callbacks(jl_gc_cb_notify_external_free_t,
-                gc_cblist_notify_external_free, (v));
-            jl_free_aligned(v);
+            sweep_unlink_and_free(v);
         }
         gc_time_count_big(old_bits, bits);
         v = nxt;
     }
-    return pv;
+    return last_node;
+}
+
+static void sweep_list_of_oldest_bigvals(bigval_t *young) JL_NOTSAFEPOINT
+{
+    bigval_t *v = oldest_generation_of_bigvals->next; // skip the sentinel
+    while (v != NULL) {
+        bigval_t *nxt = v->next;
+        assert(v->bits.gc == GC_OLD_MARKED);
+        v->bits.gc = GC_OLD;
+        gc_time_count_big(GC_OLD_MARKED, GC_OLD);
+        v = nxt;
+    }
 }
 
-static void sweep_big(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
+static void sweep_big(jl_ptls_t ptls) JL_NOTSAFEPOINT
 {
     gc_time_big_start();
     assert(gc_n_threads);
+    bigval_t *last_node_in_my_list = NULL;
     for (int i = 0; i < gc_n_threads; i++) {
         jl_ptls_t ptls2 = gc_all_tls_states[i];
-        if (ptls2 != NULL)
-            sweep_big_list(sweep_full, &ptls2->gc_tls.heap.big_objects);
+        if (ptls2 != NULL) {
+            bigval_t *last_node = sweep_list_of_young_bigvals(ptls2->gc_tls.heap.young_generation_of_bigvals);
+            if (ptls == ptls2) {
+                last_node_in_my_list = last_node;
+            }
+        }
     }
-    if (sweep_full) {
-        bigval_t **last_next = sweep_big_list(sweep_full, &big_objects_marked);
-        // Move all survivors from big_objects_marked list to the big_objects list of this thread.
-        if (ptls->gc_tls.heap.big_objects)
-            ptls->gc_tls.heap.big_objects->prev = last_next;
-        *last_next = ptls->gc_tls.heap.big_objects;
-        ptls->gc_tls.heap.big_objects = big_objects_marked;
-        if (ptls->gc_tls.heap.big_objects)
-            ptls->gc_tls.heap.big_objects->prev = &ptls->gc_tls.heap.big_objects;
-        big_objects_marked = NULL;
+    if (current_sweep_full) {
+        sweep_list_of_oldest_bigvals(ptls->gc_tls.heap.young_generation_of_bigvals);
+        // move all nodes in `oldest_generation_of_bigvals` to my list of bigvals
+        assert(last_node_in_my_list != NULL);
+        assert(last_node_in_my_list->next == NULL);
+        last_node_in_my_list->next = oldest_generation_of_bigvals->next; // skip the sentinel
+        if (oldest_generation_of_bigvals->next != NULL) {
+            oldest_generation_of_bigvals->next->prev = last_node_in_my_list;
+        }
+        oldest_generation_of_bigvals->next = NULL;
    }
    gc_time_big_end();
 }
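As a standalone illustration of the full-sweep splice performed at the end of `sweep_big` above: the nodes behind the `oldest_generation_of_bigvals` sentinel are appended after the last surviving node of the calling thread's young list, and the old-generation list is left empty. The node type and `splice_oldest_after` helper below are simplified stand-ins invented for this sketch, not code from the patch.

#include <assert.h>
#include <stddef.h>

// Simplified doubly linked node; stands in for bigval_t in this sketch.
typedef struct node_t {
    struct node_t *next;
    struct node_t *prev;
} node_t;

// Append everything behind the `oldest` sentinel after `last_node_in_my_list`,
// mirroring the splice sweep_big() performs on a full sweep.
static void splice_oldest_after(node_t *last_node_in_my_list, node_t *oldest)
{
    assert(last_node_in_my_list != NULL);
    assert(last_node_in_my_list->next == NULL); // tail of the young list
    last_node_in_my_list->next = oldest->next;  // skip the sentinel
    if (oldest->next != NULL)
        oldest->next->prev = last_node_in_my_list;
    oldest->next = NULL;                        // old-generation list is now empty
}

int main(void)
{
    node_t young_sentinel = {0}, y1 = {0};
    node_t oldest_sentinel = {0}, o1 = {0}, o2 = {0};
    // young list: sentinel -> y1 (tail)
    young_sentinel.next = &y1; y1.prev = &young_sentinel;
    // oldest list: sentinel -> o1 -> o2
    oldest_sentinel.next = &o1; o1.prev = &oldest_sentinel;
    o1.next = &o2; o2.prev = &o1;

    splice_oldest_after(&y1, &oldest_sentinel);
    assert(y1.next == &o1 && o1.prev == &y1);
    assert(oldest_sentinel.next == NULL);
    return 0;
}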
@@ -1536,7 +1525,7 @@ STATIC_INLINE void gc_sweep_pool_page(gc_page_profiler_serializer_t *s, jl_gc_pa
 static void gc_sweep_other(jl_ptls_t ptls, int sweep_full) JL_NOTSAFEPOINT
 {
     sweep_malloced_arrays();
-    sweep_big(ptls, sweep_full);
+    sweep_big(ptls);
 }
 
 static void gc_pool_sync_nfree(jl_gc_pagemeta_t *pg, jl_taggedvalue_t *last) JL_NOTSAFEPOINT
@@ -3524,7 +3513,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection)
     // marking is over
 
     // Flush everything in mark cache
-    gc_sync_all_caches_nolock(ptls);
+    gc_sync_all_caches(ptls);
 
     int64_t live_sz_ub = live_bytes + actual_allocd;
     int64_t live_sz_est = scanned_bytes + perm_scanned_bytes;
@@ -3863,15 +3852,16 @@ void jl_init_thread_heap(jl_ptls_t ptls)
         small_arraylist_new(&heap->free_stacks[i], 0);
     heap->mallocarrays = NULL;
     heap->mafreelist = NULL;
-    heap->big_objects = NULL;
+    heap->young_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel
+    assert(gc_bigval_sentinel_tag != 0); // make sure the sentinel is initialized
+    heap->young_generation_of_bigvals->header = gc_bigval_sentinel_tag;
     arraylist_new(&heap->remset, 0);
     arraylist_new(&ptls->finalizers, 0);
     arraylist_new(&ptls->gc_tls.sweep_objs, 0);
 
     jl_gc_mark_cache_t *gc_cache = &ptls->gc_tls.gc_cache;
     gc_cache->perm_scanned_bytes = 0;
     gc_cache->scanned_bytes = 0;
-    gc_cache->nbig_obj = 0;
 
     // Initialize GC mark-queue
     jl_gc_markqueue_t *mq = &ptls->gc_tls.mark_queue;
@@ -3897,12 +3887,16 @@ void jl_gc_init(void)
     JL_MUTEX_INIT(&heapsnapshot_lock, "heapsnapshot_lock");
     JL_MUTEX_INIT(&finalizers_lock, "finalizers_lock");
     uv_mutex_init(&page_profile_lock);
-    uv_mutex_init(&gc_cache_lock);
     uv_mutex_init(&gc_perm_lock);
     uv_mutex_init(&gc_threads_lock);
     uv_cond_init(&gc_threads_cond);
     uv_sem_init(&gc_sweep_assists_needed, 0);
     uv_mutex_init(&gc_queue_observer_lock);
+    void *_addr = (void*)calloc_s(1); // dummy allocation to get the sentinel tag
+    uintptr_t addr = (uintptr_t)_addr;
+    gc_bigval_sentinel_tag = addr;
+    oldest_generation_of_bigvals = (bigval_t*)calloc_s(sizeof(bigval_t)); // sentinel
+    oldest_generation_of_bigvals->header = gc_bigval_sentinel_tag;
 
     jl_gc_init_page();
     jl_gc_debug_init();
