Skip to content
This repository was archived by the owner on Oct 11, 2018. It is now read-only.

Commit 5c03017

Browse files
committed
Support for NUMA interleave policy
Summary: Feature: NUMA support. Credits for research and implementation: Jeremy Cole and Davi Arnaut.

This patch provides two mysqld_safe startup options:
* flush-caches: Flush and purge buffers/caches
* numa-interleave: Run mysqld with its memory interleaved on all CPUs

It also provides a config option:
* innodb_buffer_pool_populate: pre-allocation of buffer pool memory at startup:
  -- Use MAP_POPULATE if supported (Linux 2.6.23 and higher)
  -- Otherwise, force pre-allocation using memset

Test Plan: mtr
Reviewers: steaphan, pivanof
Reviewed By: pivanof
CC: MarkCallaghan, jtolmer, jeremycole, flamingcow, andrew-ford, pengt, CalvinSun
Differential Revision: https://reviews.facebook.net/D16965
1 parent 44ea8ea commit 5c03017

14 files changed

Lines changed: 173 additions & 10 deletions
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
CALL mtr.add_suppression(".* Forcing preallocation by faulting in pages.");
2+
SELECT @@GLOBAL.innodb_buffer_pool_populate;
3+
@@GLOBAL.innodb_buffer_pool_populate
4+
1
5+
1 Expected
6+
SET @@GLOBAL.innodb_buffer_pool_populate=0;
7+
ERROR HY000: Variable 'innodb_buffer_pool_populate' is a read only variable
8+
Expected error 'Read only variable'
9+
SELECT @@GLOBAL.innodb_buffer_pool_populate;
10+
@@GLOBAL.innodb_buffer_pool_populate
11+
1
12+
1 Expected
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
--innodb-buffer-pool-populate=true
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
--source include/have_innodb.inc
2+
3+
CALL mtr.add_suppression(".* Forcing preallocation by faulting in pages.");
4+
5+
# Display current value of innodb_buffer_pool_populate
6+
SELECT @@GLOBAL.innodb_buffer_pool_populate;
7+
--echo 1 Expected
8+
9+
# Variable should be read-only
10+
--error ER_INCORRECT_GLOBAL_LOCAL_VAR
11+
SET @@GLOBAL.innodb_buffer_pool_populate=0;
12+
--echo Expected error 'Read only variable'
13+
14+
SELECT @@GLOBAL.innodb_buffer_pool_populate;
15+
--echo 1 Expected
16+

scripts/mysqld_safe.sh

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ MYSQLD=
1717
niceness=0
1818
mysqld_ld_preload=
1919
mysqld_ld_library_path=
20+
flush_caches=0
21+
numa_interleave=0
2022

2123
# Initial logging status: error log is not open, and not using syslog
2224
logging=init
@@ -82,6 +84,9 @@ Usage: $0 [OPTIONS]
8284
--syslog Log messages to syslog with 'logger'
8385
--skip-syslog Log messages to error log (default)
8486
--syslog-tag=TAG Pass -t "mysqld-TAG" to 'logger'
87+
--flush-caches Flush and purge buffers/caches
88+
--numa-interleave Run mysqld with its memory interleaved
89+
on all CPUs
8590
8691
All other options are passed to the mysqld program.
8792
@@ -227,6 +232,8 @@ parse_arguments() {
227232
--skip-syslog) want_syslog=0 ;;
228233
--syslog-tag=*) syslog_tag="$val" ;;
229234
--timezone=*) TZ="$val"; export TZ; ;;
235+
--flush-caches) flush_caches=1 ;;
236+
--numa-interleave) numa_interleave=1 ;;
230237

231238
--help) usage ;;
232239

@@ -739,6 +746,41 @@ mysqld daemon not started"
739746
fi
740747
fi
741748

749+
#
# Optionally flush file system buffers and drop the kernel caches
# before mysqld is started (--flush-caches, Linux only).
#

if @TARGET_LINUX@ && test $flush_caches -eq 1
then
  # Step 1: write dirty file system buffers out with sync.
  if my_which sync > /dev/null 2>&1
  then
    # sync is not expected to fail, so a failure is only reported
    # and startup continues.
    if ! sync
    then
      log_error "sync failed, check if sync is properly installed"
    fi
  else
    log_error "sync command not found, required for --flush-caches"
    exit 1
  fi

  # Step 2: purge page cache, dentries and inodes through sysctl.
  if my_which sysctl > /dev/null 2>&1
  then
    if ! sysctl -q -w vm.drop_caches=3
    then
      log_error "sysctl failed, check the error message for details"
      exit 1
    fi
  else
    log_error "sysctl command not found, required for --flush-caches"
    exit 1
  fi
elif test $flush_caches -eq 1
then
  # The option was requested on a non-Linux build of this script.
  log_error "--flush-caches is not supported on this platform"
  exit 1
fi
783+
742784
#
743785
# Uncomment the following lines if you want all tables to be automatically
744786
# checked and repaired during startup. You should add sensible key_buffer
@@ -759,6 +801,31 @@ fi
759801

760802
cmd="`mysqld_ld_preload_text`$NOHUP_NICENESS"
761803

804+
#
# Set mysqld's memory interleave policy (--numa-interleave, Linux only).
#
# When requested, the launch command is prefixed with
# "numactl --interleave=all". Every failure below aborts startup:
# silently launching without the requested policy, or with a numactl
# that is known not to work, would hide the problem from the operator.
#

if @TARGET_LINUX@ && test $numa_interleave -eq 1
then
  # Locate numactl, ensure it exists.
  if ! my_which numactl > /dev/null 2>&1
  then
    log_error "numactl command not found, required for --numa-interleave"
    exit 1
  # Probe with a no-op command to make sure the policy can be applied.
  elif ! numactl --interleave=all true
  then
    log_error "numactl failed, check if numactl is properly installed"
    # Do not fall through: the same numactl invocation would be used
    # to launch mysqld and would fail in the same way.
    exit 1
  fi

  # Launch mysqld with numactl.
  cmd="$cmd numactl --interleave=all"
elif test $numa_interleave -eq 1
then
  # The option was requested on a non-Linux build of this script.
  log_error "--numa-interleave is not supported on this platform"
  exit 1
fi
828+
762829
for i in "$ledir/$MYSQLD" "$defaults" "--basedir=$MY_BASEDIR_VERSION" \
763830
"--datadir=$DATADIR" "--plugin-dir=$plugin_dir" "$USER_OPTION"
764831
do

storage/innobase/buf/buf0buf.cc

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1024,7 +1024,8 @@ buf_chunk_init(
10241024
/*===========*/
10251025
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
10261026
buf_chunk_t* chunk, /*!< out: chunk of buffers */
1027-
ulint mem_size) /*!< in: requested size in bytes */
1027+
ulint mem_size, /*!< in: requested size in bytes */
1028+
ibool populate) /*!< in: virtual page preallocation */
10281029
{
10291030
buf_block_t* block;
10301031
byte* frame;
@@ -1038,7 +1039,7 @@ buf_chunk_init(
10381039
+ (UNIV_PAGE_SIZE - 1), UNIV_PAGE_SIZE);
10391040

10401041
chunk->mem_size = mem_size;
1041-
chunk->mem = os_mem_alloc_large(&chunk->mem_size);
1042+
chunk->mem = os_mem_alloc_large(&chunk->mem_size, populate);
10421043

10431044
if (UNIV_UNLIKELY(chunk->mem == NULL)) {
10441045

@@ -1236,6 +1237,7 @@ buf_pool_init_instance(
12361237
/*===================*/
12371238
buf_pool_t* buf_pool, /*!< in: buffer pool instance */
12381239
ulint buf_pool_size, /*!< in: size in bytes */
1240+
ibool populate, /*!< in: virtual page preallocation */
12391241
ulint instance_no) /*!< in: id of the instance */
12401242
{
12411243
ulint i;
@@ -1258,7 +1260,7 @@ buf_pool_init_instance(
12581260

12591261
UT_LIST_INIT(buf_pool->free);
12601262

1261-
if (!buf_chunk_init(buf_pool, chunk, buf_pool_size)) {
1263+
if (!buf_chunk_init(buf_pool, chunk, buf_pool_size, populate)) {
12621264
mem_free(chunk);
12631265
mem_free(buf_pool);
12641266

@@ -1379,6 +1381,7 @@ dberr_t
13791381
buf_pool_init(
13801382
/*==========*/
13811383
ulint total_size, /*!< in: size of the total pool in bytes */
1384+
ibool populate, /*!< in: virtual page preallocation */
13821385
ulint n_instances) /*!< in: number of instances */
13831386
{
13841387
ulint i;
@@ -1394,7 +1397,7 @@ buf_pool_init(
13941397
for (i = 0; i < n_instances; i++) {
13951398
buf_pool_t* ptr = &buf_pool_ptr[i];
13961399

1397-
if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {
1400+
if (buf_pool_init_instance(ptr, size, populate, i) != DB_SUCCESS) {
13981401

13991402
/* Free all the instances created so far. */
14001403
buf_pool_free(i);

storage/innobase/handler/ha_innodb.cc

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15895,6 +15895,12 @@ static MYSQL_SYSVAR_ULONG(doublewrite_batch_size, srv_doublewrite_batch_size,
1589515895
NULL, NULL, 120, 1, 127, 0);
1589615896
#endif /* defined UNIV_DEBUG || defined UNIV_PERF_DEBUG */
1589715897

15898+
static MYSQL_SYSVAR_BOOL(buffer_pool_populate, srv_buf_pool_populate,
15899+
PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
15900+
"Preallocate (pre-fault) the page frames required for the mapping "
15901+
"established by the buffer pool memory region. Disabled by default.",
15902+
NULL, NULL, FALSE);
15903+
1589815904
static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
1589915905
PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1590015906
"Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
@@ -16368,6 +16374,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
1636816374
MYSQL_SYSVAR(api_bk_commit_interval),
1636916375
MYSQL_SYSVAR(autoextend_increment),
1637016376
MYSQL_SYSVAR(buffer_pool_size),
16377+
MYSQL_SYSVAR(buffer_pool_populate),
1637116378
MYSQL_SYSVAR(buffer_pool_instances),
1637216379
MYSQL_SYSVAR(buffer_pool_filename),
1637316380
MYSQL_SYSVAR(buffer_pool_dump_now),

storage/innobase/include/buf0buf.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,7 @@ dberr_t
229229
buf_pool_init(
230230
/*=========*/
231231
ulint size, /*!< in: Size of the total pool in bytes */
232+
ibool populate, /*!< in: Force virtual page preallocation */
232233
ulint n_instances); /*!< in: Number of instances */
233234
/********************************************************************//**
234235
Frees the buffer pool at shutdown. This must not be invoked before

storage/innobase/include/os0proc.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,8 @@ UNIV_INTERN
5858
void*
5959
os_mem_alloc_large(
6060
/*===============*/
61-
ulint* n); /*!< in/out: number of bytes */
61+
ulint* n, /*!< in/out: number of bytes */
62+
ibool populate); /*!< in: virtual page preallocation */
6263
/****************************************************************//**
6364
Frees large pages memory. */
6465
UNIV_INTERN

storage/innobase/include/srv0srv.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,7 @@ extern my_bool srv_use_sys_malloc;
276276
extern ibool srv_use_sys_malloc;
277277
#endif /* UNIV_HOTBACKUP */
278278
extern ulint srv_buf_pool_size; /*!< requested size in bytes */
279+
extern my_bool srv_buf_pool_populate; /*!< virtual page preallocation */
279280
extern ulint srv_buf_pool_instances; /*!< requested number of buffer pool instances */
280281
extern ulong srv_n_page_hash_locks; /*!< number of locks to
281282
protect buf_pool->page_hash */

storage/innobase/os/os0proc.cc

Lines changed: 53 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@ Created 9/30/1995 Heikki Tuuri
3232
#include "ut0mem.h"
3333
#include "ut0byte.h"
3434

35+
/* Linux release version */
36+
#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
37+
#include <string.h> /* strverscmp() */
38+
#include <sys/utsname.h> /* uname() */
39+
#endif
40+
3541
/* FreeBSD for example has only MAP_ANON, Linux has MAP_ANONYMOUS and
3642
MAP_ANON but MAP_ANON is marked as deprecated */
3743
#if defined(MAP_ANONYMOUS)
@@ -40,6 +46,13 @@ MAP_ANON but MAP_ANON is marked as deprecated */
4046
#define OS_MAP_ANON MAP_ANON
4147
#endif
4248

49+
/* Linux's MAP_POPULATE */
50+
#if defined(MAP_POPULATE)
51+
#define OS_MAP_POPULATE MAP_POPULATE
52+
#else
53+
#define OS_MAP_POPULATE 0
54+
#endif
55+
4356
UNIV_INTERN ibool os_use_large_pages;
4457
/* Large page size. This may be a boot-time option on some platforms */
4558
UNIV_INTERN ulint os_large_page_size;
@@ -62,14 +75,32 @@ os_proc_get_number(void)
6275
#endif
6376
}
6477

78+
/****************************************************************//**
79+
Retrieve and compare operating system release.
80+
@return TRUE if the OS release is equal to, or later than release. */
81+
UNIV_INTERN
82+
ibool
83+
os_compare_release(
84+
/*===============*/
85+
const char* release) /*!< in: OS release */
86+
{
87+
#if defined(UNIV_LINUX) && defined(_GNU_SOURCE)
88+
struct utsname name;
89+
return(uname(&name) == 0 && strverscmp(name.release, release) >= 0);
90+
#else
91+
return(FALSE);
92+
#endif
93+
}
94+
6595
/****************************************************************//**
6696
Allocates large pages memory.
6797
@return allocated memory */
6898
UNIV_INTERN
6999
void*
70100
os_mem_alloc_large(
71101
/*===============*/
72-
ulint* n) /*!< in/out: number of bytes */
102+
ulint* n, /*!< in/out: number of bytes */
103+
ibool populate) /*!< in: virtual page preallocation */
73104
{
74105
void* ptr;
75106
ulint size;
@@ -155,7 +186,8 @@ os_mem_alloc_large(
155186
ut_ad(ut_is_2pow(size));
156187
size = *n = ut_2pow_round(*n + (size - 1), size);
157188
ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
158-
MAP_PRIVATE | OS_MAP_ANON, -1, 0);
189+
MAP_PRIVATE | OS_MAP_ANON |
190+
(populate ? OS_MAP_POPULATE : 0), -1, 0);
159191
if (UNIV_UNLIKELY(ptr == (void*) -1)) {
160192
fprintf(stderr, "InnoDB: mmap(%lu bytes) failed;"
161193
" errno %lu\n",
@@ -168,6 +200,25 @@ os_mem_alloc_large(
168200
UNIV_MEM_ALLOC(ptr, size);
169201
}
170202
#endif
203+
204+
#if OS_MAP_ANON && OS_MAP_POPULATE
205+
/* MAP_POPULATE is only supported for private mappings
206+
since Linux 2.6.23. */
207+
populate = populate && !os_compare_release("2.6.23");
208+
209+
if (ptr && populate) {
210+
fprintf(stderr, "InnoDB: Warning: mmap(MAP_POPULATE) "
211+
"is not supported for private mappings. "
212+
"Forcing preallocation by faulting in pages.\n");
213+
}
214+
#endif
215+
216+
/* Initialize the entire buffer to force the allocation
217+
of physical memory page frames. */
218+
if (ptr && populate) {
219+
memset(ptr, '\0', size);
220+
}
221+
171222
return(ptr);
172223
}
173224

0 commit comments

Comments
 (0)