From fbc1ce5091c8eeabd759f8dd1b85457606f74d41 Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Mon, 12 Jun 2023 12:06:05 -0300 Subject: [PATCH] backport Gabriel's PR --- NEWS.md | 27 +------- contrib/generate_precompile.jl | 2 +- doc/src/devdocs/gc.md | 76 +++++++++++++++++++++ src/gc-debug.c | 19 +++++- src/gc-pages.c | 7 +- src/gc.c | 120 +++++++++++++++++++-------------- src/gc.h | 19 +++++- 7 files changed, 190 insertions(+), 80 deletions(-) create mode 100644 doc/src/devdocs/gc.md diff --git a/NEWS.md b/NEWS.md index a78249ed95182..e96c8f8b38528 100644 --- a/NEWS.md +++ b/NEWS.md @@ -38,30 +38,9 @@ Language changes Compiler/Runtime improvements ----------------------------- -* Bootstrapping time has been improved by about 25% ([#41794]). -* The LLVM-based compiler has been separated from the run-time library into a new library, - `libjulia-codegen`. It is loaded by default, so normal usage should see no changes. - In deployments that do not need the compiler (e.g. system images where all needed code - is precompiled), this library (and its LLVM dependency) can simply be excluded ([#41936]). -* Conditional type constraints are now be forwarded interprocedurally (i.e. propagated from caller to callee). - This allows inference to understand e.g. `Base.ifelse(isa(x, Int), x, 0)` returns `::Int`-value - even if the type of `x` is not known ([#42529]). -* Julia-level SROA (Scalar Replacement of Aggregates) has been improved: allowing elimination of - `getfield` calls with constant global fields ([#42355]), enabling elimination of mutable structs with - uninitialized fields ([#43208]), improving performance ([#43232]), and handling more nested `getfield` - calls ([#43239]). -* Abstract call sites can now be inlined or statically resolved as long as the call site has a single - matching method ([#43113]). -* Inference now tracks various effects such as side-effectful-ness and nothrow-ness on a per-specialization basis. - Code heavily dependent on constant propagation should see significant compile-time performance improvements and - certain cases (e.g. calls to uninlinable functions that are nevertheless effect free) should see runtime performance - improvements. Effects may be overwritten manually with the `Base.@assume_effects` macro ([#43852]). -* Precompilation (with explicit `precompile` directives or representative workloads) now saves more type-inferred code, - resulting in reduced time-to-first task for packages that use precompilation. This change also eliminates the - runtime performance degradation occasionally triggered by precompilation on older Julia versions. More specifically, - any newly-inferred method/type combinations needed by your package--regardless of where those methods were - defined--can now be cached in the precompile file, as long as they are inferrably called by a method owned by - your package ([#43990]). +* The `@pure` macro is now deprecated. Use `Base.@assume_effects :foldable` instead ([#48682]). +* The mark phase of the Garbage Collector is now multi-threaded ([#48600]). +* Updated GC heuristics to count allocated pages instead of individual objects ([#50144]). 
Command-line option changes --------------------------- diff --git a/contrib/generate_precompile.jl b/contrib/generate_precompile.jl index a10d195229cab..073ea09becd84 100644 --- a/contrib/generate_precompile.jl +++ b/contrib/generate_precompile.jl @@ -428,7 +428,7 @@ function generate_precompile_statements() print("Total ─────── "); Base.time_print(tot_time); println() print("Generation ── "); Base.time_print(gen_time); print(" "); show(IOContext(stdout, :compact=>true), gen_time / tot_time * 100); println("%") print("Execution ─── "); Base.time_print(include_time); print(" "); show(IOContext(stdout, :compact=>true), include_time / tot_time * 100); println("%") - + GC.gc(true) return end diff --git a/doc/src/devdocs/gc.md b/doc/src/devdocs/gc.md new file mode 100644 index 0000000000000..8a2190400a9e2 --- /dev/null +++ b/doc/src/devdocs/gc.md @@ -0,0 +1,76 @@ +# Garbage Collection in Julia + +## Introduction + +Julia has a serial, stop-the-world, generational, non-moving mark-sweep garbage collector. +Native objects are precisely scanned and foreign ones are conservatively marked. + +## Memory layout of objects and GC bits + +An opaque tag is stored at the front of GC-managed objects, and its lowest two bits are +used for garbage collection. The lowest bit is set for marked objects and the second +lowest bit stores age information (i.e. it is only set for old objects). + +Objects are aligned to a multiple of 4 bytes to ensure this pointer tagging is legal. + +## Pool allocation + +Sufficiently small objects (up to 2032 bytes) are allocated in per-thread object +pools. + +A three-level tree (analogous to a three-level page-table) is used to keep metadata +(e.g. whether a page has been allocated, whether it contains marked objects, the number of free objects, etc.) +about address ranges spanning at least one page. +Sweeping a pool-allocated object consists of inserting it back into the free list +maintained by its pool. + +## Malloc'd arrays and big objects + +Two lists are used to keep track of the remaining allocated objects: +one for sufficiently large malloc'd arrays (`mallocarray_t`) and one for +sufficiently large objects (`bigval_t`). + +Sweeping these objects consists of unlinking them from their list and calling `free` on the +corresponding address. + +## Generational and remembered sets + +Field writes into old objects trigger a write barrier if the written field +points to a young object and if a write barrier has not been triggered on the old object yet. +In this case, the old object being written to is enqueued into a remembered set, and +its mark bit is set to indicate that a write barrier has already been triggered on it. + +There is no explicit flag to determine whether a marking pass will scan the +entire heap or only the young objects and the remembered set. +The mark bits of the objects themselves are used to determine whether a full mark happens. +The mark-sweep algorithm follows this sequence of steps: + +- Objects in the remembered set have their GC mark bits reset +(these are set once the write barrier is triggered, as described above) and are enqueued. + +- Roots (e.g. thread locals) are enqueued. + +- The object graph is traversed and mark bits are set. + +- Object pools, malloc'd arrays and big objects are swept. On a full sweep, +the mark bits of all marked objects are reset. On a generational sweep, +only the mark bits of marked young objects are reset. + +- Mark bits of objects in the remembered set are set, +so we don't trigger the write barrier on them again.
+ +After these stages, old objects will be left with their mark bits set, +so that references from them are not explored in a subsequent generational collection. +This scheme eliminates the need to explicitly keep a flag to indicate a full mark +(though a flag to indicate a full sweep is necessary). + +## Heuristics + +GC heuristics tune the GC by changing the size of the allocation interval between garbage collections. + +The GC heuristics measure the heap size after a collection and schedule the next collection for when the heap has grown to twice that size, or to the maximum heap size, whichever is smaller. +The heuristics measure the heap size by counting the number of pages that are in use and the memory allocated through malloc. Previously we measured the heap size by counting +the live objects, but that does not take fragmentation into account, which could lead to bad decisions; it also meant that we used thread-local information (allocations) to make +decisions about a process-wide property (when to GC). Measuring pages makes the decision global. + +The GC will do full collections when the heap size reaches 80% of the maximum allowed size. diff --git a/src/gc-debug.c b/src/gc-debug.c index 6484c2bd8bd07..5007aa771c6a8 100644 --- a/src/gc-debug.c +++ b/src/gc-debug.c @@ -1,7 +1,10 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license #include "gc.h" +#include "julia.h" #include +#include +#include #include // re-include assert.h without NDEBUG, @@ -1403,7 +1406,7 @@ JL_DLLEXPORT void jl_enable_gc_logging(int enable) { gc_logging_enabled = enable; } -void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT { +void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT { if (!gc_logging_enabled) { return; } @@ -1412,6 +1415,20 @@ void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect full ? "full" : "incr", recollect ? "recollect" : "" ); + jl_safe_printf("Heap stats: bytes_mapped %.1f, bytes_decomitted %.1f, bytes_allocd %.1f\nbytes_freed %.1f, bytes_mallocd %.1f, malloc_bytes_freed %.1f\npages_perm_allocd %zu, heap_size %.1f, heap_target %.1f, live_bytes %.1f\n", + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mapped)/1e6, + jl_atomic_load_relaxed(&gc_heap_stats.bytes_decomitted)/1e6, + jl_atomic_load_relaxed(&gc_heap_stats.bytes_allocd)/1e6, + jl_atomic_load_relaxed(&gc_heap_stats.bytes_freed)/1e6, + jl_atomic_load_relaxed(&gc_heap_stats.bytes_mallocd)/1e6, + jl_atomic_load_relaxed(&gc_heap_stats.malloc_bytes_freed)/1e6, + jl_atomic_load_relaxed(&gc_heap_stats.pages_perm_allocd), + jl_atomic_load_relaxed(&gc_heap_stats.heap_size)/1e6, + jl_atomic_load_relaxed(&gc_heap_stats.heap_target)/1e6, + live_bytes/1e6 + + ); + jl_safe_printf("Fragmentation %.1f\n", (double)live_bytes/(double)jl_atomic_load_relaxed(&gc_heap_stats.heap_size)); } #ifdef __cplusplus diff --git a/src/gc-pages.c b/src/gc-pages.c index 454864d45c766..83efc274014f7 100644 --- a/src/gc-pages.c +++ b/src/gc-pages.c @@ -79,6 +79,7 @@ static char *jl_gc_try_alloc_pages(int pg_cnt) JL_NOTSAFEPOINT // round data pointer up to the nearest gc_page_data-aligned // boundary if mmap didn't already do so.
mem = (char*)gc_page_data(mem + GC_PAGE_SZ - 1); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mapped, pages_sz); return mem; } @@ -284,6 +285,8 @@ NOINLINE jl_gc_pagemeta_t *jl_gc_alloc_page(void) JL_NOTSAFEPOINT errno = last_errno; current_pg_count++; gc_final_count_page(current_pg_count); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_allocd, GC_PAGE_SZ); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, GC_PAGE_SZ); uv_mutex_unlock(&gc_perm_lock); return info.meta; } @@ -334,7 +337,7 @@ void jl_gc_free_page(void *p) JL_NOTSAFEPOINT #else madvise(p, decommit_size, MADV_DONTNEED); #endif - + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_decomitted, GC_PAGE_SZ); no_decommit: // new pages are now available starting at max of lb and pagetable_i32 if (memory_map.lb > info.pagetable_i32) @@ -344,6 +347,8 @@ void jl_gc_free_page(void *p) JL_NOTSAFEPOINT if (info.pagetable0->lb > info.pagetable0_i32) info.pagetable0->lb = info.pagetable0_i32; current_pg_count--; + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_freed, GC_PAGE_SZ); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -GC_PAGE_SZ); } #ifdef __cplusplus diff --git a/src/gc.c b/src/gc.c index 0f30e8305b78c..e8b2ca5c17fab 100644 --- a/src/gc.c +++ b/src/gc.c @@ -1,6 +1,7 @@ // This file is a part of Julia. License is MIT: https://julialang.org/license #include "gc.h" +#include "julia_atomics.h" #include "julia_gcext.h" #include "julia_assert.h" #ifdef __GLIBC__ @@ -171,6 +172,13 @@ static _Atomic(int) support_conservative_marking = 0; jl_gc_num_t gc_num = {0}; static size_t last_long_collect_interval; +gc_heapstatus_t gc_heap_stats = {0}; +int next_sweep_full = 0; +const uint64_t _jl_buff_tag[3] = {0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull, 0x4eadc0004eadc000ull}; // aka 0xHEADER00 +JL_DLLEXPORT uintptr_t jl_get_buff_tag(void) +{ + return jl_buff_tag; +} pagetable_t memory_map; @@ -230,6 +238,8 @@ STATIC_INLINE void jl_free_aligned(void *p) JL_NOTSAFEPOINT #else STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz); #if defined(_P64) || defined(__APPLE__) if (align <= 16) return malloc(sz); @@ -242,6 +252,9 @@ STATIC_INLINE void *jl_malloc_aligned(size_t sz, size_t align) STATIC_INLINE void *jl_realloc_aligned(void *d, size_t sz, size_t oldsz, size_t align) { + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_mallocd, sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, oldsz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, sz-oldsz); #if defined(_P64) || defined(__APPLE__) if (align <= 16) return realloc(d, sz); @@ -609,11 +622,19 @@ static void gc_sweep_foreign_objs(void) static int64_t last_gc_total_bytes = 0; #ifdef _P64 -#define default_collect_interval (5600*1024*sizeof(void*)) -static size_t max_collect_interval = 1250000000UL; +typedef uint64_t memsize_t; +static const size_t default_collect_interval = 5600 * 1024 * sizeof(void*); + +static size_t total_mem; +// We expose this to the user/ci as jl_gc_set_max_memory +static memsize_t max_total_memory = (memsize_t) 2 * 1024 * 1024 * 1024 * 1024 * 1024; #else -#define default_collect_interval (3200*1024*sizeof(void*)) -static size_t max_collect_interval = 500000000UL; +typedef uint32_t memsize_t; +static const size_t default_collect_interval = 3200 * 1024 * sizeof(void*); +// Work really hard to stay within 2GB +// Alternative is to risk running out of address space +// on 32 bit architectures. 
+static memsize_t max_total_memory = (memsize_t) 1024 * 1024 * 1024; // The new heuristics use all the heap, which makes it run out #endif // global variables for GC stats @@ -906,7 +927,7 @@ void jl_gc_force_mark_old(jl_ptls_t ptls, jl_value_t *v) JL_NOTSAFEPOINT static inline void maybe_collect(jl_ptls_t ptls) { - if (jl_atomic_load_relaxed(&ptls->gc_num.allocd) >= 0 || jl_gc_debug_check_other()) { + if (jl_atomic_load_relaxed(&gc_heap_stats.heap_size) >= jl_atomic_load_relaxed(&gc_heap_stats.heap_target) || jl_gc_debug_check_other()) { jl_gc_collect(JL_GC_AUTO); } else { @@ -1044,6 +1065,8 @@ static bigval_t **sweep_big_list(int sweep_full, bigval_t **pv) JL_NOTSAFEPOINT if (nxt) nxt->prev = pv; gc_num.freed += v->sz&~3; + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, v->sz&~3); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -(v->sz&~3)); #ifdef MEMDEBUG memset(v, 0xbb, v->sz&~3); #endif @@ -1159,6 +1182,8 @@ static void jl_gc_free_array(jl_array_t *a) JL_NOTSAFEPOINT jl_free_aligned(d); else free(d); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.malloc_bytes_freed, jl_array_nbytes(a)); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size, -jl_array_nbytes(a)); gc_num.freed += jl_array_nbytes(a); gc_num.freecall++; } @@ -3172,7 +3197,6 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) uint64_t mark_time = end_mark_time - start_mark_time; gc_num.mark_time = mark_time; gc_num.total_mark_time += mark_time; - int64_t actual_allocd = gc_num.since_sweep; // marking is over // 4. check for objects to finalize @@ -3213,12 +3237,7 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) // Flush everything in mark cache gc_sync_all_caches_nolock(ptls); - int64_t live_sz_ub = live_bytes + actual_allocd; - int64_t live_sz_est = scanned_bytes + perm_scanned_bytes; - int64_t estimate_freed = live_sz_ub - live_sz_est; - gc_verify(ptls); - gc_stats_all_pool(); gc_stats_big_obj(); objprofile_printall(); @@ -3227,28 +3246,23 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) if (!prev_sweep_full) promoted_bytes += perm_scanned_bytes - last_perm_scanned_bytes; // 5. next collection decision - int not_freed_enough = (collection == JL_GC_AUTO) && estimate_freed < (7*(actual_allocd/10)); - int nptr = 0; - for (int i = 0;i < jl_n_threads;i++) - nptr += jl_all_tls_states[i]->heap.remset_nptr; - - // many pointers in the intergen frontier => "quick" mark is not quick - int large_frontier = nptr*sizeof(void*) >= default_collect_interval; - int sweep_full = 0; + int remset_nptr = 0; + int sweep_full = next_sweep_full; int recollect = 0; + assert(gc_n_threads); + for (int i = 0; i < jl_n_threads; i++) { + jl_ptls_t ptls2 = jl_all_tls_states[i]; + if (ptls2 != NULL) + remset_nptr += ptls2->heap.remset_nptr; + } + (void)remset_nptr; //Use this information for something? - // update heuristics only if this GC was automatically triggered - if (collection == JL_GC_AUTO) { - if (not_freed_enough) { - gc_num.interval = gc_num.interval * 2; - } - if (large_frontier) { - sweep_full = 1; - } - if (gc_num.interval > max_collect_interval) { - sweep_full = 1; - gc_num.interval = max_collect_interval; - } + + // If the live data outgrows the suggested max_total_memory + // we keep going with minimum intervals and full gcs until + // we either free some space or get an OOM error. 
+ if (live_bytes > max_total_memory) { + sweep_full = 1; } if (gc_sweep_always_full) { sweep_full = 1; @@ -3276,6 +3290,14 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) gc_sweep_pool(sweep_full); if (sweep_full) gc_sweep_perm_alloc(); + + size_t heap_size = jl_atomic_load_relaxed(&gc_heap_stats.heap_size); + if (heap_size > max_total_memory*0.8) + next_sweep_full = 1; + else + next_sweep_full = 0; + size_t new_heap_target = 2 * heap_size > max_total_memory ? max_total_memory : 2 * heap_size; + jl_atomic_store_relaxed(&gc_heap_stats.heap_target, new_heap_target); JL_PROBE_GC_SWEEP_END(); uint64_t gc_end_time = jl_hrtime(); @@ -3319,30 +3341,20 @@ static int _jl_gc_collect(jl_ptls_t ptls, jl_gc_collection_t collection) } #endif - - _report_gc_finished(pause, gc_num.freed, sweep_full, recollect); - - gc_final_pause_end(t0, gc_end_time); - gc_time_sweep_pause(gc_end_time, actual_allocd, live_bytes, - estimate_freed, sweep_full); - gc_num.full_sweep += sweep_full; + _report_gc_finished(pause, gc_num.freed, sweep_full, recollect, live_bytes); uint64_t max_memory = last_live_bytes + gc_num.allocd; if (max_memory > gc_num.max_memory) { gc_num.max_memory = max_memory; } + gc_final_pause_end(gc_start_time, gc_end_time); + gc_time_sweep_pause(gc_end_time, allocd, live_bytes, + estimate_freed, sweep_full); + gc_num.full_sweep += sweep_full; gc_num.allocd = 0; last_live_bytes = live_bytes; live_bytes += -gc_num.freed + gc_num.since_sweep; - if (collection == JL_GC_AUTO) { - // If the current interval is larger than half the live data decrease the interval - int64_t half = live_bytes/2; - if (gc_num.interval > half) gc_num.interval = half; - // But never go below default - if (gc_num.interval < default_collect_interval) gc_num.interval = default_collect_interval; - } - gc_time_summary(sweep_full, t_start, gc_end_time, gc_num.freed, live_bytes, gc_num.interval, pause, gc_num.time_to_safepoint, @@ -3510,7 +3522,7 @@ void jl_gc_init(void) arraylist_new(&finalizer_list_marked, 0); arraylist_new(&to_finalize, 0); - + jl_atomic_store_relaxed(&gc_heap_stats.heap_target, default_collect_interval); gc_num.interval = default_collect_interval; last_long_collect_interval = default_collect_interval; gc_num.allocd = 0; @@ -3519,13 +3531,16 @@ void jl_gc_init(void) #ifdef _P64 // on a big memory machine, set max_collect_interval to totalmem / nthreads / 2 - uint64_t total_mem = uv_get_total_memory(); + total_mem = uv_get_total_memory(); uint64_t constrained_mem = uv_get_constrained_memory(); if (constrained_mem != 0) total_mem = constrained_mem; - size_t maxmem = total_mem / jl_n_threads / 2; - if (maxmem > max_collect_interval) - max_collect_interval = maxmem; + double percent; + if (total_mem < 128e9) + percent = total_mem * 2.34375e-12 + 0.3; // 30% at 0 gigs and 60% at 128 to not + else // overcommit too much on memory constrained devices + percent = 0.6; + max_total_memory = total_mem * percent; #endif jl_gc_mark_sp_t sp = {NULL, NULL, NULL, NULL}; gc_mark_loop(NULL, sp); @@ -3801,6 +3816,8 @@ static void *gc_perm_alloc_large(size_t sz, int zero, unsigned align, unsigned o #ifdef _OS_WINDOWS_ SetLastError(last_error); #endif + jl_atomic_fetch_add_relaxed(&gc_heap_stats.bytes_allocd,sz); + jl_atomic_fetch_add_relaxed(&gc_heap_stats.heap_size,sz); errno = last_errno; jl_may_leak(base); unsigned diff = (offset - base) % align; @@ -3843,6 +3860,7 @@ void *jl_gc_perm_alloc_nolock(size_t sz, int zero, unsigned align, unsigned offs errno = last_errno; if (__unlikely(pool == MAP_FAILED))
return NULL; + jl_atomic_fetch_add_relaxed(&gc_heap_stats.pages_perm_allocd, 1); #endif gc_perm_pool = (uintptr_t)pool; gc_perm_end = gc_perm_pool + GC_PERM_POOL_SIZE; diff --git a/src/gc.h b/src/gc.h index 8436a653dc32b..76ef2ba480a4d 100644 --- a/src/gc.h +++ b/src/gc.h @@ -9,6 +9,8 @@ #ifndef JL_GC_H #define JL_GC_H +#include +#include #include #include #include @@ -381,7 +383,19 @@ typedef struct { int ub; } pagetable_t; -#ifdef __clang_gcanalyzer__ +typedef struct { + _Atomic(size_t) bytes_mapped; + _Atomic(size_t) bytes_freed; + _Atomic(size_t) bytes_allocd; + _Atomic(size_t) bytes_decomitted; + _Atomic(size_t) bytes_mallocd; + _Atomic(size_t) malloc_bytes_freed; + _Atomic(size_t) pages_perm_allocd; + _Atomic(size_t) heap_size; + _Atomic(size_t) heap_target; +} gc_heapstatus_t; + +#ifdef __clang_gcanalyzer__ /* clang may not have __builtin_ffs */ unsigned ffs_u32(uint32_t bitvec) JL_NOTSAFEPOINT; #else STATIC_INLINE unsigned ffs_u32(uint32_t bitvec) @@ -396,6 +410,7 @@ extern bigval_t *big_objects_marked; extern arraylist_t finalizer_list_marked; extern arraylist_t to_finalize; extern int64_t lazy_freed_pages; +extern gc_heapstatus_t gc_heap_stats; STATIC_INLINE bigval_t *bigval_header(jl_taggedvalue_t *o) JL_NOTSAFEPOINT { @@ -717,7 +732,7 @@ void gc_count_pool(void); size_t jl_array_nbytes(jl_array_t *a) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_enable_gc_logging(int enable); -void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect) JL_NOTSAFEPOINT; +void _report_gc_finished(uint64_t pause, uint64_t freed, int full, int recollect, int64_t live_bytes) JL_NOTSAFEPOINT; #ifdef __cplusplus }
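
For readers who want the net effect of the heuristic changes without tracing the diff: the collection trigger in maybe_collect now compares the page-and-malloc heap size against a target, the target is reset after each sweep to twice the post-sweep heap size (capped at the memory limit), and crossing 80% of the limit forces the next sweep to be a full one. Below is a minimal standalone C sketch of that rule, not the patched Julia code; the heap_stats_t type, the helper names, and the constants in main are illustrative only.

/* Standalone sketch of the heap-target heuristic introduced by this patch.
 * Illustrative only: type and function names are made up, not Julia's. */
#include <stdint.h>
#include <stdio.h>

typedef struct {
    uint64_t heap_size;   /* pages in use + malloc'd bytes, maintained by the GC */
    uint64_t heap_target; /* collect when heap_size reaches this value */
    int next_sweep_full;  /* whether the next collection sweeps the full heap */
} heap_stats_t;

/* Mirrors maybe_collect(): trigger a collection when the heap hits the target. */
static int should_collect(const heap_stats_t *s)
{
    return s->heap_size >= s->heap_target;
}

/* Mirrors the post-sweep bookkeeping added to _jl_gc_collect(). */
static void update_after_sweep(heap_stats_t *s, uint64_t max_total_memory)
{
    s->next_sweep_full = s->heap_size > max_total_memory * 0.8;
    uint64_t target = 2 * s->heap_size;
    s->heap_target = target > max_total_memory ? max_total_memory : target;
}

int main(void)
{
    heap_stats_t s = {.heap_size = 900u << 20, .heap_target = 512u << 20};
    uint64_t max_total_memory = 1024ull << 20; /* pretend the limit is 1 GiB */
    if (should_collect(&s)) {
        /* ... mark & sweep would run here and shrink heap_size ... */
        s.heap_size = 300u << 20;
        update_after_sweep(&s, max_total_memory);
    }
    printf("next target: %llu MB, full sweep next: %d\n",
           (unsigned long long)(s.heap_target >> 20), s.next_sweep_full);
    return 0;
}

The design choice mirrored here is the one the new devdocs describe: the decision is based on a global, post-sweep measurement of the heap rather than on per-thread allocation counters.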
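
That heuristic only works because the patch makes every pool page and every malloc'd block update a shared set of counters (gc_heap_stats) with relaxed atomics. Here is a compact sketch of that accounting using C11 stdatomic; the real code uses Julia's jl_atomic_fetch_add_relaxed wrappers and the counters live in gc_heapstatus_t, so treat the names and the page-size constant below as stand-ins.

/* Sketch of the global heap accounting added by this patch: pages and malloc'd
 * bytes feed one shared heap_size counter. Assumes C11 <stdatomic.h>. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

#define GC_PAGE_SZ (16 * 1024) /* stand-in for Julia's GC page size */

static _Atomic size_t heap_size;    /* what the collection trigger looks at */
static _Atomic size_t bytes_allocd; /* pages handed out to object pools */
static _Atomic size_t bytes_freed;  /* pages returned to the OS */

/* cf. the jl_gc_alloc_page hunk: count the page, not the objects on it. */
static void note_page_alloc(void)
{
    atomic_fetch_add_explicit(&bytes_allocd, GC_PAGE_SZ, memory_order_relaxed);
    atomic_fetch_add_explicit(&heap_size, GC_PAGE_SZ, memory_order_relaxed);
}

/* cf. the jl_gc_free_page hunk. */
static void note_page_free(void)
{
    atomic_fetch_add_explicit(&bytes_freed, GC_PAGE_SZ, memory_order_relaxed);
    atomic_fetch_sub_explicit(&heap_size, GC_PAGE_SZ, memory_order_relaxed);
}

/* cf. jl_malloc_aligned / jl_gc_free_array: malloc'd data counts too. */
static void note_malloc(size_t sz)
{
    atomic_fetch_add_explicit(&heap_size, sz, memory_order_relaxed);
}

int main(void)
{
    note_page_alloc();
    note_malloc(4096);
    note_page_free();
    printf("heap_size now %zu bytes\n",
           atomic_load_explicit(&heap_size, memory_order_relaxed));
    return 0;
}

Because the counters are process-wide, a page freed on one thread offsets a page allocated on another, which is exactly the "measuring pages makes the decision global" point from the new devdocs.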
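
The "Generational and remembered sets" section of the new devdocs is easy to misread, so a small illustration may help: an old object is pushed onto the remembered set at most once, and its mark bit is what records that the barrier already fired. The sketch below is a simplification of that description only; the struct, the fixed-size remembered set, and the helper names are invented for the example and do not correspond to Julia's actual object layout or write barrier.

/* Sketch of the generational write barrier described in doc/src/devdocs/gc.md.
 * All types and helpers are illustrative. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef struct obj {
    bool old;          /* object survived a previous collection */
    bool mark;         /* mark bit; doubles as "already remembered" for old objects */
    struct obj *field;
} obj_t;

#define REMSET_CAP 1024
static obj_t *remset[REMSET_CAP];
static size_t remset_len;

/* parent->field = child, with the write barrier in front of the store. */
static void write_field(obj_t *parent, obj_t *child)
{
    if (parent->old && child && !child->old && !parent->mark) {
        /* old -> young edge and no barrier fired yet: remember the parent */
        if (remset_len < REMSET_CAP)
            remset[remset_len++] = parent;
        parent->mark = true; /* so the barrier is not triggered on it again */
    }
    parent->field = child;
}

int main(void)
{
    obj_t old_parent = {.old = true}, young_child = {0};
    write_field(&old_parent, &young_child); /* old -> young: barrier fires once */
    write_field(&old_parent, &young_child); /* already remembered: barrier skipped */
    printf("remembered %zu object(s)\n", remset_len); /* prints 1 */
    return 0;
}

During the next mark phase the remembered objects get their mark bits reset and are re-enqueued, as the devdocs' step list describes, which is why the barrier itself can stay this cheap.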