diff --git a/src/cluster.h b/src/cluster.h index ae20ab048d..cd50803655 100644 --- a/src/cluster.h +++ b/src/cluster.h @@ -92,6 +92,7 @@ int getClusterSize(void); int getMyShardSlotCount(void); int clusterNodePending(clusterNode *node); int clusterNodeIsPrimary(clusterNode *n); +int clusterNodeIsVotingPrimary(clusterNode *n); char **getClusterNodesList(size_t *numnodes); char *clusterNodeIp(clusterNode *node, client *c); int clusterNodeIsReplica(clusterNode *node); diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index bb2b6cac21..5a3e2a7b00 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -130,7 +130,7 @@ void clusterCommandFlushslot(client *c); /* Only primaries that own slots have voting rights. * Returns 1 if the node has voting rights, otherwise returns 0. */ -static inline int clusterNodeIsVotingPrimary(clusterNode *n) { +int clusterNodeIsVotingPrimary(clusterNode *n) { return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots; } diff --git a/src/commands.def b/src/commands.def index 7585f56f32..bc03feb733 100644 --- a/src/commands.def +++ b/src/commands.def @@ -7922,11 +7922,12 @@ struct COMMAND_ARG SHUTDOWN_abort_selector_save_selector_block_Subargs[] = { {MAKE_ARG("save-selector",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,2,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_save_selector_Subargs}, {MAKE_ARG("now",ARG_TYPE_PURE_TOKEN,-1,"NOW",NULL,"7.0.0",CMD_ARG_OPTIONAL,0,NULL)}, {MAKE_ARG("force",ARG_TYPE_PURE_TOKEN,-1,"FORCE",NULL,"7.0.0",CMD_ARG_OPTIONAL,0,NULL)}, +{MAKE_ARG("safe",ARG_TYPE_PURE_TOKEN,-1,"SAFE",NULL,"9.0.0",CMD_ARG_OPTIONAL,0,NULL)}, }; /* SHUTDOWN abort_selector argument table */ struct COMMAND_ARG SHUTDOWN_abort_selector_Subargs[] = { -{MAKE_ARG("save-selector-block",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_Subargs}, +{MAKE_ARG("save-selector-block",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,4,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_Subargs}, {MAKE_ARG("abort",ARG_TYPE_PURE_TOKEN,-1,"ABORT",NULL,"7.0.0",CMD_ARG_NONE,0,NULL)}, }; diff --git a/src/commands/shutdown.json b/src/commands/shutdown.json index 8c88da34a0..2de56388df 100644 --- a/src/commands/shutdown.json +++ b/src/commands/shutdown.json @@ -61,6 +61,13 @@ "token": "FORCE", "optional": true, "since": "7.0.0" + }, + { + "name": "safe", + "type": "pure-token", + "token": "SAFE", + "optional": true, + "since": "9.0.0" } ] }, diff --git a/src/config.c b/src/config.c index 717ed62af5..864003f586 100644 --- a/src/config.c +++ b/src/config.c @@ -98,11 +98,12 @@ configEnum aof_fsync_enum[] = { {NULL, 0}}; configEnum shutdown_on_sig_enum[] = { - {"default", 0}, + {"default", SHUTDOWN_NOFLAGS}, {"save", SHUTDOWN_SAVE}, {"nosave", SHUTDOWN_NOSAVE}, {"now", SHUTDOWN_NOW}, {"force", SHUTDOWN_FORCE}, + {"safe", SHUTDOWN_SAFE}, {NULL, 0}}; configEnum repl_diskless_load_enum[] = { diff --git a/src/db.c b/src/db.c index 90aff4a048..7797b07aa7 100644 --- a/src/db.c +++ b/src/db.c @@ -1314,7 +1314,7 @@ void typeCommand(client *c) { addReplyStatus(c, getObjectTypeName(o)); } -/* SHUTDOWN [[NOSAVE | SAVE] [NOW] [FORCE] | ABORT] */ +/* SHUTDOWN [[NOSAVE | SAVE] [NOW] [FORCE] [SAFE] | ABORT] */ void shutdownCommand(client *c) { int flags = SHUTDOWN_NOFLAGS; int abort = 0; @@ -1329,6 +1329,8 @@ void shutdownCommand(client *c) { flags |= SHUTDOWN_FORCE; } else if (!strcasecmp(c->argv[i]->ptr, "abort")) { abort = 1; + } else if (!strcasecmp(c->argv[i]->ptr, "safe")) { + flags |= SHUTDOWN_SAFE; } else { addReplyErrorObject(c, shared.syntaxerr); return; diff --git a/src/server.c b/src/server.c index a700898048..91e1fe21fe 100644 --- a/src/server.c +++ b/src/server.c @@ -4456,6 +4456,12 @@ void closeListeningSockets(int unlink_unix_socket) { * - SHUTDOWN_FORCE: Ignore errors writing AOF and RDB files on disk, which * would normally prevent a shutdown. * + * - SHUTDOWN_SAFE: Shut down only when safe. Note that safe cannot prevent force, + * in the case of force, safe will print the relevant logs. The definition of + * safe may be different in different modes. Here are the definitions: + * * In cluster mode, it is unsafe to shut down a primary with slots, and may + * cause the cluster to go down. + * * Unless SHUTDOWN_NOW is set and if any replicas are lagging behind, C_ERR is * returned and server.shutdown_mstime is set to a timestamp to allow a grace * period for the replicas to catch up. This is checked and handled by @@ -4553,6 +4559,7 @@ int finishShutdown(void) { int save = server.shutdown_flags & SHUTDOWN_SAVE; int nosave = server.shutdown_flags & SHUTDOWN_NOSAVE; int force = server.shutdown_flags & SHUTDOWN_FORCE; + int safe = server.shutdown_flags & SHUTDOWN_SAFE; /* Log a warning for each replica that is lagging. */ listIter replicas_iter; @@ -4575,6 +4582,17 @@ int finishShutdown(void) { num_replicas); } + if (safe && server.cluster_enabled && clusterNodeIsVotingPrimary(getMyClusterNode())) { + if (force) { + serverLog(LL_WARNING, "This is a voting primary. Shutting down may cause the cluster to go down. Exit anyway."); + } else { + serverLog(LL_WARNING, "This is a voting primary. Shutting down may cause the cluster to go down. Can't exit."); + if (server.supervised_mode == SUPERVISED_SYSTEMD) + serverCommunicateSystemd("This is a voting primary. Shutting down may cause the cluster to go down. Can't exit.\n"); + goto error; + } + } + /* Kill all the Lua debugger forked sessions. */ ldbKillForkedSessions(); diff --git a/src/server.h b/src/server.h index 2153c56327..59b1f804c7 100644 --- a/src/server.h +++ b/src/server.h @@ -570,12 +570,12 @@ typedef enum { #define UNIT_MILLISECONDS 1 /* SHUTDOWN flags */ -#define SHUTDOWN_NOFLAGS 0 /* No flags. */ -#define SHUTDOWN_SAVE 1 /* Force SAVE on SHUTDOWN even if no save \ - points are configured. */ -#define SHUTDOWN_NOSAVE 2 /* Don't SAVE on SHUTDOWN. */ -#define SHUTDOWN_NOW 4 /* Don't wait for replicas to catch up. */ -#define SHUTDOWN_FORCE 8 /* Don't let errors prevent shutdown. */ +#define SHUTDOWN_NOFLAGS 0 /* No flags. */ +#define SHUTDOWN_SAVE (1 << 0) /* Force SAVE on SHUTDOWN even if no save points are configured. */ +#define SHUTDOWN_NOSAVE (1 << 1) /* Don't SAVE on SHUTDOWN. */ +#define SHUTDOWN_NOW (1 << 2) /* Don't wait for replicas to catch up. */ +#define SHUTDOWN_FORCE (1 << 3) /* Don't let errors prevent shutdown. */ +#define SHUTDOWN_SAFE (1 << 4) /* Shutdown only when safe. */ /* Command call flags, see call() function */ #define CMD_CALL_NONE 0 diff --git a/tests/unit/cluster/shutdown.tcl b/tests/unit/cluster/shutdown.tcl new file mode 100644 index 0000000000..f9ed7be986 --- /dev/null +++ b/tests/unit/cluster/shutdown.tcl @@ -0,0 +1,15 @@ +start_cluster 3 0 {tags {external:skip cluster shutdown}} { + test "Test shutdown safe is working" { + assert_error {ERR Errors trying to SHUTDOWN*} {R 0 shutdown safe} + assert_error {ERR Errors trying to SHUTDOWN*} {R 1 shutdown safe} + assert_error {ERR Errors trying to SHUTDOWN*} {R 2 shutdown safe} + + catch {R 0 shutdown safe force} + wait_for_condition 1000 50 { + [CI 1 cluster_state] eq {fail} && + [CI 2 cluster_state] eq {fail} + } else { + fail "Cluster doesn't fail" + } + } +} diff --git a/valkey.conf b/valkey.conf index e9dbee0b29..a6b1c45600 100644 --- a/valkey.conf +++ b/valkey.conf @@ -1705,6 +1705,11 @@ aof-timestamp-enabled no # nosave: Prevents DB saving operation even if one or more save points are configured. # now: Skips waiting for lagging replicas. # force: Ignores any errors that would normally prevent the server from exiting. +# safe: Shut down only when safe. Note that safe cannot prevent force, in the case of +# force, safe will print the relevant logs. The definition of safe may be different +# in different modes. Here are the definitions: +# * In cluster mode, it is unsafe to shut down a primary with slots, and may cause +# the cluster to go down. # # Any combination of values is allowed as long as "save" and "nosave" are not set simultaneously. # Example: "nosave force now"