Skip to content

Commit f772c9a

Browse files
committed
Request 64-byte alignment instead of 16-byte alignment for large objects.
This aligns the requested memory to cache-line boundaries, which should improve the performance of loads into registers.
1 parent bd7adbf commit f772c9a

File tree

3 files changed

+44
-40
lines changed

3 files changed

+44
-40
lines changed

src/array.c

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,6 @@ extern "C" {
1818

1919
#define JL_ARRAY_ALIGN(jl_value, nbytes) LLT_ALIGN(jl_value, nbytes)
2020

21-
2221
// array constructors ---------------------------------------------------------
2322

2423
static inline int store_unboxed(jl_value_t *el_type)
@@ -74,13 +73,13 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims,
7473
}
7574

7675
int ndimwords = jl_array_ndimwords(ndims);
77-
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
76+
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
7877
if (tot <= ARRAY_INLINE_NBYTES) {
7978
if (isunboxed && elsz >= 4)
80-
tsz = JL_ARRAY_ALIGN(tsz, 16); // align data area 16
79+
tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align data area
8180
size_t doffs = tsz;
8281
tsz += tot;
83-
tsz = JL_ARRAY_ALIGN(tsz, 16); // align whole object 16
82+
tsz = JL_ARRAY_ALIGN(tsz, JL_SMALL_BYTE_ALIGNMENT); // align whole object
8483
a = (jl_array_t*)jl_gc_allocobj(tsz);
8584
jl_set_typeof(a, atype);
8685
a->flags.how = 0;
@@ -90,7 +89,7 @@ static jl_array_t *_new_array_(jl_value_t *atype, uint32_t ndims, size_t *dims,
9089
}
9190
}
9291
else {
93-
tsz = JL_ARRAY_ALIGN(tsz, 16); // align whole object 16
92+
tsz = JL_ARRAY_ALIGN(tsz, JL_CACHE_BYTE_ALIGNMENT); // align whole object
9493
a = (jl_array_t*)jl_gc_allocobj(tsz);
9594
JL_GC_PUSH1(&a);
9695
jl_set_typeof(a, atype);
@@ -157,7 +156,7 @@ JL_DLLEXPORT jl_array_t *jl_reshape_array(jl_value_t *atype, jl_array_t *data,
157156
size_t ndims = jl_nfields(dims);
158157

159158
int ndimwords = jl_array_ndimwords(ndims);
160-
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t) + sizeof(void*), 16);
159+
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t) + sizeof(void*), JL_SMALL_BYTE_ALIGNMENT);
161160
a = (jl_array_t*)jl_gc_allocobj(tsz);
162161
jl_set_typeof(a, atype);
163162
a->flags.pooled = tsz <= GC_MAX_SZCLASS;
@@ -233,7 +232,7 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array_1d(jl_value_t *atype, void *data,
233232
elsz = sizeof(void*);
234233

235234
int ndimwords = jl_array_ndimwords(1);
236-
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
235+
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
237236
a = (jl_array_t*)jl_gc_allocobj(tsz);
238237
jl_set_typeof(a, atype);
239238
a->flags.pooled = tsz <= GC_MAX_SZCLASS;
@@ -284,7 +283,7 @@ JL_DLLEXPORT jl_array_t *jl_ptr_to_array(jl_value_t *atype, void *data,
284283
elsz = sizeof(void*);
285284

286285
int ndimwords = jl_array_ndimwords(ndims);
287-
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), 16);
286+
int tsz = JL_ARRAY_ALIGN(sizeof(jl_array_t) + ndimwords*sizeof(size_t), JL_CACHE_BYTE_ALIGNMENT);
288287
a = (jl_array_t*)jl_gc_allocobj(tsz);
289288
jl_set_typeof(a, atype);
290289
a->flags.pooled = tsz <= GC_MAX_SZCLASS;

src/gc.c

Lines changed: 34 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,13 @@ typedef struct _bigval_t {
135135
size_t sz;
136136
uintptr_t age : 2;
137137
};
138+
#ifdef _P64 // Add padding so that char data[] below is 64-byte aligned
139+
// (8 pointers of 8 bytes each) - (4 other pointers in struct)
140+
void *_padding[8 - 4];
141+
#else
142+
// (16 pointers of 4 bytes each) - (4 other pointers in struct)
143+
void *_padding[16 - 4];
144+
#endif
138145
//struct buff_t <>;
139146
union {
140147
uintptr_t header;
@@ -146,7 +153,7 @@ typedef struct _bigval_t {
146153
#if !defined(_COMPILER_MICROSOFT_)
147154
int _dummy[0];
148155
#endif
149-
// must be 16-aligned here, in 32 & 64b
156+
// must be 64-byte aligned here, in 32 & 64 bit modes
150157
char data[];
151158
} bigval_t;
152159

@@ -171,7 +178,7 @@ typedef struct _pool_t {
171178

172179
#define GC_PAGE_LG2 14 // log2(size of a page)
173180
#define GC_PAGE_SZ (1 << GC_PAGE_LG2) // 16k
174-
#define GC_PAGE_OFFSET (16 - (sizeof_jl_taggedvalue_t % 16))
181+
#define GC_PAGE_OFFSET (JL_SMALL_BYTE_ALIGNMENT - (sizeof_jl_taggedvalue_t % JL_SMALL_BYTE_ALIGNMENT))
175182

176183
// pool page metadata
177184
typedef struct _gcpage_t {
@@ -437,15 +444,8 @@ static int jl_gc_finalizers_inhibited; // don't run finalizers during codegen #1
437444

438445
// malloc wrappers, aligned allocation
439446

440-
#if defined(_P64) || defined(__APPLE__)
441-
#define malloc_a16(sz) malloc(sz)
442-
#define realloc_a16(p, sz, oldsz) realloc((p), (sz))
443-
#define free_a16(p) free(p)
444-
#else
445-
#define malloc_a16(sz) jl_malloc_aligned(sz, 16)
446-
#define realloc_a16(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, 16)
447-
#define free_a16(p) jl_free_aligned(p)
448-
#endif
447+
#define malloc_cache_align(sz) jl_malloc_aligned(sz, JL_CACHE_BYTE_ALIGNMENT)
448+
#define realloc_cache_align(p, sz, oldsz) jl_realloc_aligned(p, sz, oldsz, JL_CACHE_BYTE_ALIGNMENT)
449449

450450
static void schedule_finalization(void *o, void *f)
451451
{
@@ -1011,10 +1011,10 @@ static NOINLINE void *alloc_big(size_t sz)
10111011
{
10121012
maybe_collect();
10131013
size_t offs = offsetof(bigval_t, header);
1014-
size_t allocsz = LLT_ALIGN(sz + offs, 16);
1014+
size_t allocsz = LLT_ALIGN(sz + offs, JL_CACHE_BYTE_ALIGNMENT);
10151015
if (allocsz < sz) // overflow in adding offs, size was "negative"
10161016
jl_throw(jl_memory_exception);
1017-
bigval_t *v = (bigval_t*)malloc_a16(allocsz);
1017+
bigval_t *v = (bigval_t*)malloc_cache_align(allocsz);
10181018
if (v == NULL)
10191019
jl_throw(jl_memory_exception);
10201020
jl_atomic_fetch_add(&allocd_bytes, allocsz);
@@ -1074,7 +1074,7 @@ static bigval_t **sweep_big_list(int sweep_mask, bigval_t **pv)
10741074
#ifdef MEMDEBUG
10751075
memset(v, 0xbb, v->sz&~3);
10761076
#endif
1077-
free_a16(v);
1077+
jl_free_aligned(v);
10781078
big_freed++;
10791079
}
10801080
big_total++;
@@ -1141,7 +1141,7 @@ static void jl_gc_free_array(jl_array_t *a)
11411141
if (a->flags.how == 2) {
11421142
char *d = (char*)a->data - a->offset*a->elsize;
11431143
if (a->flags.isaligned)
1144-
free_a16(d);
1144+
jl_free_aligned(d);
11451145
else
11461146
free(d);
11471147
freed_bytes += array_nbytes(a);
@@ -2500,7 +2500,7 @@ void *reallocb(void *b, size_t sz)
25002500
if (allocsz < sz) // overflow in adding offs, size was "negative"
25012501
jl_throw(jl_memory_exception);
25022502
bigval_t *bv = bigval_header(buff);
2503-
bv = (bigval_t*)realloc_a16(bv, allocsz, bv->sz&~3);
2503+
bv = (bigval_t*)realloc_cache_align(bv, allocsz, bv->sz&~3);
25042504
if (bv == NULL)
25052505
jl_throw(jl_memory_exception);
25062506
return &bv->data[0];
@@ -2539,7 +2539,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_0w(void)
25392539

25402540
JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void)
25412541
{
2542-
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*), 16);
2542+
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*), JL_SMALL_BYTE_ALIGNMENT);
25432543
void *tag = NULL;
25442544
#ifdef MEMDEBUG
25452545
tag = alloc_big(sz);
@@ -2552,7 +2552,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_1w(void)
25522552

25532553
JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void)
25542554
{
2555-
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 2, 16);
2555+
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 2, JL_SMALL_BYTE_ALIGNMENT);
25562556
void *tag = NULL;
25572557
#ifdef MEMDEBUG
25582558
tag = alloc_big(sz);
@@ -2565,7 +2565,7 @@ JL_DLLEXPORT jl_value_t *jl_gc_alloc_2w(void)
25652565

25662566
JL_DLLEXPORT jl_value_t *jl_gc_alloc_3w(void)
25672567
{
2568-
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 3, 16);
2568+
const int sz = LLT_ALIGN(sizeof_jl_taggedvalue_t + sizeof(void*) * 3, JL_SMALL_BYTE_ALIGNMENT);
25692569
void *tag = NULL;
25702570
#ifdef MEMDEBUG
25712571
tag = alloc_big(sz);
@@ -2612,7 +2612,7 @@ jl_thread_heap_t *jl_mk_thread_heap(void)
26122612
#ifdef JULIA_ENABLE_THREADING
26132613
// Cache-aligned malloc
26142614
jl_thread_heap =
2615-
(jl_thread_heap_t*)jl_malloc_aligned(sizeof(jl_thread_heap_t), 64);
2615+
(jl_thread_heap_t*)jl_malloc_aligned(sizeof(jl_thread_heap_t), JL_CACHE_BYTE_ALIGNMENT);
26162616
#endif
26172617
FOR_CURRENT_HEAP () {
26182618
const int *szc = sizeclasses;
@@ -2781,6 +2781,7 @@ static void big_obj_stats(void)
27812781

27822782
JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
27832783
{
2784+
sz += JL_SMALL_BYTE_ALIGNMENT;
27842785
maybe_collect();
27852786
allocd_bytes += sz;
27862787
gc_num.malloc++;
@@ -2792,6 +2793,7 @@ JL_DLLEXPORT void *jl_gc_counted_malloc(size_t sz)
27922793

27932794
JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
27942795
{
2796+
nm += JL_SMALL_BYTE_ALIGNMENT;
27952797
maybe_collect();
27962798
allocd_bytes += nm*sz;
27972799
gc_num.malloc++;
@@ -2804,15 +2806,15 @@ JL_DLLEXPORT void *jl_gc_counted_calloc(size_t nm, size_t sz)
28042806
JL_DLLEXPORT void jl_gc_counted_free(void *p, size_t sz)
28052807
{
28062808
free(p);
2807-
freed_bytes += sz;
2809+
freed_bytes += sz + JL_SMALL_BYTE_ALIGNMENT;
28082810
gc_num.freecall++;
28092811
}
28102812

2811-
JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old,
2812-
size_t sz)
2813+
JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old, size_t sz)
28132814
{
2815+
old += JL_SMALL_BYTE_ALIGNMENT;
2816+
sz += JL_SMALL_BYTE_ALIGNMENT;
28142817
maybe_collect();
2815-
28162818
if (sz < old)
28172819
freed_bytes += (old - sz);
28182820
else
@@ -2826,7 +2828,7 @@ JL_DLLEXPORT void *jl_gc_counted_realloc_with_old_size(void *p, size_t old,
28262828

28272829
JL_DLLEXPORT void *jl_malloc(size_t sz)
28282830
{
2829-
int64_t *p = (int64_t *)jl_gc_counted_malloc(sz + 16);
2831+
int64_t *p = (int64_t *)jl_gc_counted_malloc(sz);
28302832
p[0] = sz;
28312833
return (void *)(p + 2);
28322834
}
@@ -2835,7 +2837,7 @@ JL_DLLEXPORT void *jl_calloc(size_t nm, size_t sz)
28352837
{
28362838
int64_t *p;
28372839
size_t nmsz = nm*sz;
2838-
p = (int64_t *)jl_gc_counted_calloc(nmsz + 16, 1);
2840+
p = (int64_t *)jl_gc_counted_calloc(nmsz, 1);
28392841
p[0] = nmsz;
28402842
return (void *)(p + 2);
28412843
}
@@ -2844,27 +2846,27 @@ JL_DLLEXPORT void jl_free(void *p)
28442846
{
28452847
int64_t *pp = (int64_t *)p - 2;
28462848
size_t sz = pp[0];
2847-
jl_gc_counted_free(pp, sz + 16);
2849+
jl_gc_counted_free(pp, sz);
28482850
}
28492851

28502852
JL_DLLEXPORT void *jl_realloc(void *p, size_t sz)
28512853
{
28522854
int64_t *pp = (int64_t *)p - 2;
28532855
size_t szold = pp[0];
2854-
int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold + 16, sz + 16);
2856+
int64_t *pnew = (int64_t *)jl_gc_counted_realloc_with_old_size(pp, szold, sz);
28552857
pnew[0] = sz;
28562858
return (void *)(pnew + 2);
28572859
}
28582860

28592861
JL_DLLEXPORT void *jl_gc_managed_malloc(size_t sz)
28602862
{
28612863
maybe_collect();
2862-
size_t allocsz = LLT_ALIGN(sz, 16);
2864+
size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
28632865
if (allocsz < sz) // overflow in adding offs, size was "negative"
28642866
jl_throw(jl_memory_exception);
28652867
allocd_bytes += allocsz;
28662868
gc_num.malloc++;
2867-
void *b = malloc_a16(allocsz);
2869+
void *b = malloc_cache_align(allocsz);
28682870
if (b == NULL)
28692871
jl_throw(jl_memory_exception);
28702872
return b;
@@ -2875,7 +2877,7 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
28752877
{
28762878
maybe_collect();
28772879

2878-
size_t allocsz = LLT_ALIGN(sz, 16);
2880+
size_t allocsz = LLT_ALIGN(sz, JL_CACHE_BYTE_ALIGNMENT);
28792881
if (allocsz < sz) // overflow in adding offs, size was "negative"
28802882
jl_throw(jl_memory_exception);
28812883

@@ -2891,7 +2893,7 @@ JL_DLLEXPORT void *jl_gc_managed_realloc(void *d, size_t sz, size_t oldsz,
28912893

28922894
void *b;
28932895
if (isaligned)
2894-
b = realloc_a16(d, allocsz, oldsz);
2896+
b = realloc_cache_align(d, allocsz, oldsz);
28952897
else
28962898
b = realloc(d, allocsz);
28972899
if (b == NULL)

src/julia_internal.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -499,6 +499,9 @@ STATIC_INLINE void jl_free_aligned(void *p)
499499
}
500500
#endif
501501

502+
#define JL_SMALL_BYTE_ALIGNMENT 16
503+
#define JL_CACHE_BYTE_ALIGNMENT 64
504+
502505
#ifdef __cplusplus
503506
}
504507
#endif

0 commit comments

Comments
 (0)