From c616dcf7b0ddfd42620f13c08157fa08592dc717 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 10 Jun 2025 20:50:58 +0800 Subject: [PATCH 1/6] Add SAFE option to SHUTDOWN to reject shutdown in unsafe situations Add SAFE option to SHUTDOWN. If we passed SAFE, the SHUTDOWN will refuse to shutdown if it is not safe to shutdown. Like if myself is a voting primary, it will refuse to shutdown. This avoids the situation where a replica suddenly becomes the primary when we shutting down the replica, or shutting down a primary node by mistake and then causing the cluster to down. Add SAFE option to SHUTDOWN command. Add safe option to shutdown-on-sigint and shutdown-on-sigterm. Note that safe cannot prevent force, in the case of force, safe will print the relevant logs and do the force shutdown. Signed-off-by: Binbin --- src/cluster.h | 1 + src/cluster_legacy.c | 2 +- src/commands.def | 8 +++++++- src/commands/shutdown.json | 7 +++++++ src/config.c | 3 ++- src/db.c | 4 +++- src/server.c | 12 ++++++++++++ src/server.h | 10 +++++----- valkey.conf | 5 +++++ 9 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/cluster.h b/src/cluster.h index ae20ab048d..cd50803655 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -92,6 +92,7 @@ int getClusterSize(void); int getMyShardSlotCount(void); int clusterNodePending(clusterNode *node); int clusterNodeIsPrimary(clusterNode *n); +int clusterNodeIsVotingPrimary(clusterNode *n); char **getClusterNodesList(size_t *numnodes); char *clusterNodeIp(clusterNode *node, client *c); int clusterNodeIsReplica(clusterNode *node); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index bb2b6cac21..5a3e2a7b00 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -130,7 +130,7 @@ void clusterCommandFlushslot(client *c); /* Only primaries that own slots have voting rights. * Returns 1 if the node has voting rights, otherwise returns 0. */ -static inline int clusterNodeIsVotingPrimary(clusterNode *n) { +int clusterNodeIsVotingPrimary(clusterNode *n) { return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots; } diff --git a/src/commands.def b/src/commands.def index 7585f56f32..7274d205ed 100644 --- a/src/commands.def +++ b/src/commands.def @@ -7917,11 +7917,17 @@ struct COMMAND_ARG SHUTDOWN_abort_selector_save_selector_block_save_selector_Sub {MAKE_ARG("save",ARG_TYPE_PURE_TOKEN,-1,"SAVE",NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; +/* SHUTDOWN abort_selector save_selector_block force_safe_selector argument table */ +struct COMMAND_ARG SHUTDOWN_abort_selector_save_selector_block_force_safe_selector_Subargs[] = { +{MAKE_ARG("force",ARG_TYPE_PURE_TOKEN,-1,"FORCE",NULL,"7.0.0",CMD_ARG_NONE,0,NULL)}, +{MAKE_ARG("safe",ARG_TYPE_PURE_TOKEN,-1,"SAVE",NULL,"9.0.0",CMD_ARG_NONE,0,NULL)}, +}; + /* SHUTDOWN abort_selector save_selector_block argument table */ struct COMMAND_ARG SHUTDOWN_abort_selector_save_selector_block_Subargs[] = { {MAKE_ARG("save-selector",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,2,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_save_selector_Subargs}, {MAKE_ARG("now",ARG_TYPE_PURE_TOKEN,-1,"NOW",NULL,"7.0.0",CMD_ARG_OPTIONAL,0,NULL)}, -{MAKE_ARG("force",ARG_TYPE_PURE_TOKEN,-1,"FORCE",NULL,"7.0.0",CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("force-safe-selector",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,2,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_force_safe_selector_Subargs}, }; /* SHUTDOWN abort_selector argument table */ diff --git a/src/commands/shutdown.json b/src/commands/shutdown.json index 8c88da34a0..2de56388df 100644 --- a/src/commands/shutdown.json +++ b/src/commands/shutdown.json @@ -61,6 +61,13 @@ "token": "FORCE", "optional": true, "since": "7.0.0" + }, + { + "name": "safe", + "type": "pure-token", + "token": "SAFE", + "optional": true, + "since": "9.0.0" } ] }, diff --git a/src/config.c b/src/config.c index 717ed62af5..864003f586 100644 --- a/src/config.c +++ b/src/config.c @@ -98,11 +98,12 @@ configEnum aof_fsync_enum[] = { {NULL, 0}}; configEnum shutdown_on_sig_enum[] = { - {"default", 0}, + {"default", SHUTDOWN_NOFLAGS}, {"save", SHUTDOWN_SAVE}, {"nosave", SHUTDOWN_NOSAVE}, {"now", SHUTDOWN_NOW}, {"force", SHUTDOWN_FORCE}, + {"safe", SHUTDOWN_SAFE}, {NULL, 0}}; configEnum repl_diskless_load_enum[] = { diff --git a/src/db.c b/src/db.c index 90aff4a048..7797b07aa7 100644 --- a/src/db.c +++ b/src/db.c @@ -1314,7 +1314,7 @@ void typeCommand(client *c) { addReplyStatus(c, getObjectTypeName(o)); } -/* SHUTDOWN [[NOSAVE | SAVE] [NOW] [FORCE] | ABORT] */ +/* SHUTDOWN [[NOSAVE | SAVE] [NOW] [FORCE] [SAFE] | ABORT] */ void shutdownCommand(client *c) { int flags = SHUTDOWN_NOFLAGS; int abort = 0; @@ -1329,6 +1329,8 @@ void shutdownCommand(client *c) { flags |= SHUTDOWN_FORCE; } else if (!strcasecmp(c->argv[i]->ptr, "abort")) { abort = 1; + } else if (!strcasecmp(c->argv[i]->ptr, "safe")) { + flags |= SHUTDOWN_SAFE; } else { addReplyErrorObject(c, shared.syntaxerr); return; diff --git a/src/server.c b/src/server.c index a700898048..6012c7f423 100644 --- a/src/server.c +++ b/src/server.c @@ -4553,6 +4553,7 @@ int finishShutdown(void) { int save = server.shutdown_flags & SHUTDOWN_SAVE; int nosave = server.shutdown_flags & SHUTDOWN_NOSAVE; int force = server.shutdown_flags & SHUTDOWN_FORCE; + int safe = server.shutdown_flags & SHUTDOWN_SAFE; /* Log a warning for each replica that is lagging. */ listIter replicas_iter; @@ -4575,6 +4576,17 @@ int finishShutdown(void) { num_replicas); } + if (safe && server.cluster_enabled && clusterNodeIsVotingPrimary(getMyClusterNode())) { + if (force) { + serverLog(LL_WARNING, "I am a voting primary, shutting down may cause the cluster to down. Exit anyway."); + } else { + serverLog(LL_WARNING, "I am a voting primary, shutting down may cause the cluster to down, can't exit."); + if (server.supervised_mode == SUPERVISED_SYSTEMD) + serverCommunicateSystemd("I am a voting primary, shutting down may cause the cluster to down, can't exit.\n"); + goto error; + } + } + /* Kill all the Lua debugger forked sessions. */ ldbKillForkedSessions(); diff --git a/src/server.h b/src/server.h index 2153c56327..e514d1b2c5 100644 --- a/src/server.h +++ b/src/server.h @@ -571,11 +571,11 @@ typedef enum { /* SHUTDOWN flags */ #define SHUTDOWN_NOFLAGS 0 /* No flags. */ -#define SHUTDOWN_SAVE 1 /* Force SAVE on SHUTDOWN even if no save \ - points are configured. */ -#define SHUTDOWN_NOSAVE 2 /* Don't SAVE on SHUTDOWN. */ -#define SHUTDOWN_NOW 4 /* Don't wait for replicas to catch up. */ -#define SHUTDOWN_FORCE 8 /* Don't let errors prevent shutdown. */ +#define SHUTDOWN_SAVE (1 << 0) /* Force SAVE on SHUTDOWN even if no save points are configured. */ +#define SHUTDOWN_NOSAVE (1 << 1) /* Don't SAVE on SHUTDOWN. */ +#define SHUTDOWN_NOW (1 << 2) /* Don't wait for replicas to catch up. */ +#define SHUTDOWN_FORCE (1 << 3) /* Don't let errors prevent shutdown. */ +#define SHUTDOWN_SAFE (1 << 4) /* Shutdown only when safe. */ /* Command call flags, see call() function */ #define CMD_CALL_NONE 0 diff --git a/valkey.conf b/valkey.conf index e9dbee0b29..38c696ef5b 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1705,6 +1705,11 @@ aof-timestamp-enabled no # nosave: Prevents DB saving operation even if one or more save points are configured. # now: Skips waiting for lagging replicas. # force: Ignores any errors that would normally prevent the server from exiting. +# safe: Shutdown only when safe. Note that safe cannot prevent force, in the case of +# force, safe will print the relevant logs. The definition of safe may be different +# in different modes. Here are the definitions: +# 1. In cluster mode, it is unsafe to shutdown a primary with slots, and may cause +# the cluster to go down. # # Any combination of values is allowed as long as "save" and "nosave" are not set simultaneously. # Example: "nosave force now" From b964d43190a1402a101e0426ef23e747d9f43fed Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 13 Jun 2025 18:10:58 +0800 Subject: [PATCH 2/6] Fix format and add dummy test Signed-off-by: Binbin --- src/server.h | 2 +- tests/unit/cluster/shutdown.tcl | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 tests/unit/cluster/shutdown.tcl diff --git a/src/server.h b/src/server.h index e514d1b2c5..59b1f804c7 100644 --- a/src/server.h +++ b/src/server.h @@ -570,7 +570,7 @@ typedef enum { #define UNIT_MILLISECONDS 1 /* SHUTDOWN flags */ -#define SHUTDOWN_NOFLAGS 0 /* No flags. */ +#define SHUTDOWN_NOFLAGS 0 /* No flags. */ #define SHUTDOWN_SAVE (1 << 0) /* Force SAVE on SHUTDOWN even if no save points are configured. */ #define SHUTDOWN_NOSAVE (1 << 1) /* Don't SAVE on SHUTDOWN. */ #define SHUTDOWN_NOW (1 << 2) /* Don't wait for replicas to catch up. */ diff --git a/tests/unit/cluster/shutdown.tcl b/tests/unit/cluster/shutdown.tcl new file mode 100644 index 0000000000..a5cd473668 --- /dev/null +++ b/tests/unit/cluster/shutdown.tcl @@ -0,0 +1,15 @@ +start_cluster 3 0 {tags {external:skip cluster shutdown}} { + test "Test shutdown safe is work" { + assert_error {ERR Errors trying to SHUTDOWN*} {R 0 shutdown safe} + assert_error {ERR Errors trying to SHUTDOWN*} {R 1 shutdown safe} + assert_error {ERR Errors trying to SHUTDOWN*} {R 2 shutdown safe} + + catch {R 0 shutdown safe force} + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } + } +} From f8b6417a28880b7bf9f260346f386e23289682f9 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 19 Jun 2025 22:39:46 +0800 Subject: [PATCH 3/6] Text review from jsoref and zuiderkwast Signed-off-by: Binbin --- src/server.c | 6 +++--- tests/unit/cluster/shutdown.tcl | 2 +- valkey.conf | 6 +++--- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/server.c b/src/server.c index 6012c7f423..47e0c01cf3 100644 --- a/src/server.c +++ b/src/server.c @@ -4578,11 +4578,11 @@ int finishShutdown(void) { if (safe && server.cluster_enabled && clusterNodeIsVotingPrimary(getMyClusterNode())) { if (force) { - serverLog(LL_WARNING, "I am a voting primary, shutting down may cause the cluster to down. Exit anyway."); + serverLog(LL_WARNING, "This is a voting primary. Shutting down may cause the cluster to go down. Exit anyway."); } else { - serverLog(LL_WARNING, "I am a voting primary, shutting down may cause the cluster to down, can't exit."); + serverLog(LL_WARNING, "This is a voting primary. Shutting down may cause the cluster to go down. Can't exit."); if (server.supervised_mode == SUPERVISED_SYSTEMD) - serverCommunicateSystemd("I am a voting primary, shutting down may cause the cluster to down, can't exit.\n"); + serverCommunicateSystemd("This is a voting primary. Shutting down may cause the cluster to go down. Can't exit.\n"); goto error; } } diff --git a/tests/unit/cluster/shutdown.tcl b/tests/unit/cluster/shutdown.tcl index a5cd473668..f9ed7be986 100644 --- a/tests/unit/cluster/shutdown.tcl +++ b/tests/unit/cluster/shutdown.tcl @@ -1,5 +1,5 @@ start_cluster 3 0 {tags {external:skip cluster shutdown}} { - test "Test shutdown safe is work" { + test "Test shutdown safe is working" { assert_error {ERR Errors trying to SHUTDOWN*} {R 0 shutdown safe} assert_error {ERR Errors trying to SHUTDOWN*} {R 1 shutdown safe} assert_error {ERR Errors trying to SHUTDOWN*} {R 2 shutdown safe} diff --git a/valkey.conf b/valkey.conf index 38c696ef5b..9dd686f192 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1705,11 +1705,11 @@ aof-timestamp-enabled no # nosave: Prevents DB saving operation even if one or more save points are configured. # now: Skips waiting for lagging replicas. # force: Ignores any errors that would normally prevent the server from exiting. -# safe: Shutdown only when safe. Note that safe cannot prevent force, in the case of +# safe: Shut down only when safe. Note that safe cannot prevent force, in the case of # force, safe will print the relevant logs. The definition of safe may be different # in different modes. Here are the definitions: -# 1. In cluster mode, it is unsafe to shutdown a primary with slots, and may cause -# the cluster to go down. +# * In cluster mode, it is unsafe to shutdown a primary with slots, and may cause +# the cluster to go down. # # Any combination of values is allowed as long as "save" and "nosave" are not set simultaneously. # Example: "nosave force now" From c0ffc15f0e8da2c30d648a09170caefc65e8e1d5 Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 19 Jun 2025 22:48:25 +0800 Subject: [PATCH 4/6] Update valkey.conf MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Viktor Söderqvist Signed-off-by: Binbin --- valkey.conf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/valkey.conf b/valkey.conf index 9dd686f192..a6b1c45600 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1708,7 +1708,7 @@ aof-timestamp-enabled no # safe: Shut down only when safe. Note that safe cannot prevent force, in the case of # force, safe will print the relevant logs. The definition of safe may be different # in different modes. Here are the definitions: -# * In cluster mode, it is unsafe to shutdown a primary with slots, and may cause +# * In cluster mode, it is unsafe to shut down a primary with slots, and may cause # the cluster to go down. # # Any combination of values is allowed as long as "save" and "nosave" are not set simultaneously. From 7bae7d5d22b8ac8a355b6aa8426688e7967cd255 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 30 Jun 2025 13:04:40 +0800 Subject: [PATCH 5/6] update comment Signed-off-by: Binbin --- src/server.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/server.c b/src/server.c index 47e0c01cf3..91e1fe21fe 100644 --- a/src/server.c +++ b/src/server.c @@ -4456,6 +4456,12 @@ void closeListeningSockets(int unlink_unix_socket) { * - SHUTDOWN_FORCE: Ignore errors writing AOF and RDB files on disk, which * would normally prevent a shutdown. * + * - SHUTDOWN_SAFE: Shut down only when safe. Note that safe cannot prevent force, + * in the case of force, safe will print the relevant logs. The definition of + * safe may be different in different modes. Here are the definitions: + * * In cluster mode, it is unsafe to shut down a primary with slots, and may + * cause the cluster to go down. + * * Unless SHUTDOWN_NOW is set and if any replicas are lagging behind, C_ERR is * returned and server.shutdown_mstime is set to a timestamp to allow a grace * period for the replicas to catch up. This is checked and handled by From bd72a8da8f4a9eeae5c97b805ef52cb07b81c341 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 1 Jul 2025 10:30:11 +0800 Subject: [PATCH 6/6] update commands.c Signed-off-by: Binbin --- src/commands.def | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/commands.def b/src/commands.def index 7274d205ed..bc03feb733 100644 --- a/src/commands.def +++ b/src/commands.def @@ -7917,22 +7917,17 @@ struct COMMAND_ARG SHUTDOWN_abort_selector_save_selector_block_save_selector_Sub {MAKE_ARG("save",ARG_TYPE_PURE_TOKEN,-1,"SAVE",NULL,NULL,CMD_ARG_NONE,0,NULL)}, }; -/* SHUTDOWN abort_selector save_selector_block force_safe_selector argument table */ -struct COMMAND_ARG SHUTDOWN_abort_selector_save_selector_block_force_safe_selector_Subargs[] = { -{MAKE_ARG("force",ARG_TYPE_PURE_TOKEN,-1,"FORCE",NULL,"7.0.0",CMD_ARG_NONE,0,NULL)}, -{MAKE_ARG("safe",ARG_TYPE_PURE_TOKEN,-1,"SAVE",NULL,"9.0.0",CMD_ARG_NONE,0,NULL)}, -}; - /* SHUTDOWN abort_selector save_selector_block argument table */ struct COMMAND_ARG SHUTDOWN_abort_selector_save_selector_block_Subargs[] = { {MAKE_ARG("save-selector",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,2,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_save_selector_Subargs}, {MAKE_ARG("now",ARG_TYPE_PURE_TOKEN,-1,"NOW",NULL,"7.0.0",CMD_ARG_OPTIONAL,0,NULL)}, -{MAKE_ARG("force-safe-selector",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,2,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_force_safe_selector_Subargs}, +{MAKE_ARG("force",ARG_TYPE_PURE_TOKEN,-1,"FORCE",NULL,"7.0.0",CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("safe",ARG_TYPE_PURE_TOKEN,-1,"SAFE",NULL,"9.0.0",CMD_ARG_OPTIONAL,0,NULL)}, }; /* SHUTDOWN abort_selector argument table */ struct COMMAND_ARG SHUTDOWN_abort_selector_Subargs[] = { -{MAKE_ARG("save-selector-block",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_Subargs}, +{MAKE_ARG("save-selector-block",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,4,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_Subargs}, {MAKE_ARG("abort",ARG_TYPE_PURE_TOKEN,-1,"ABORT",NULL,"7.0.0",CMD_ARG_NONE,0,NULL)}, };