Skip to content

Commit 1e05724

Browse files
Add SAFE option to SHUTDOWN to reject shutdown in unsafe situations (#2195)
Add SAFE option to SHUTDOWN. If we passed SAFE, the SHUTDOWN will refuse to shutdown if it is not safe to shutdown. Like if myself is a voting primary, it will refuse to shutdown. This avoids the situation where a replica suddenly becomes the primary when we shutting down the replica, or shutting down a primary node by mistake and then causing the cluster to down. Add SAFE option to SHUTDOWN command. Add safe option to shutdown-on-sigint and shutdown-on-sigterm. Note that SAFE cannot prevent FORCE, in the case of FORCE, SAFE will print the relevant logs and do the FORCE shutdown, we allow this combination. Signed-off-by: Binbin <binloveplay1314@qq.com> Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
1 parent fef4dbf commit 1e05724

10 files changed

Lines changed: 60 additions & 10 deletions

File tree

src/cluster.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ int getClusterSize(void);
9292
int getMyShardSlotCount(void);
9393
int clusterNodePending(clusterNode *node);
9494
int clusterNodeIsPrimary(clusterNode *n);
95+
int clusterNodeIsVotingPrimary(clusterNode *n);
9596
char **getClusterNodesList(size_t *numnodes);
9697
char *clusterNodeIp(clusterNode *node, client *c);
9798
int clusterNodeIsReplica(clusterNode *node);

src/cluster_legacy.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ void clusterCommandFlushslot(client *c);
130130

131131
/* Only primaries that own slots have voting rights.
132132
* Returns 1 if the node has voting rights, otherwise returns 0. */
133-
static inline int clusterNodeIsVotingPrimary(clusterNode *n) {
133+
int clusterNodeIsVotingPrimary(clusterNode *n) {
134134
return (n->flags & CLUSTER_NODE_PRIMARY) && n->numslots;
135135
}
136136

src/commands.def

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7925,11 +7925,12 @@ struct COMMAND_ARG SHUTDOWN_abort_selector_save_selector_block_Subargs[] = {
79257925
{MAKE_ARG("save-selector",ARG_TYPE_ONEOF,-1,NULL,NULL,NULL,CMD_ARG_OPTIONAL,2,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_save_selector_Subargs},
79267926
{MAKE_ARG("now",ARG_TYPE_PURE_TOKEN,-1,"NOW",NULL,"7.0.0",CMD_ARG_OPTIONAL,0,NULL)},
79277927
{MAKE_ARG("force",ARG_TYPE_PURE_TOKEN,-1,"FORCE",NULL,"7.0.0",CMD_ARG_OPTIONAL,0,NULL)},
7928+
{MAKE_ARG("safe",ARG_TYPE_PURE_TOKEN,-1,"SAFE",NULL,"9.0.0",CMD_ARG_OPTIONAL,0,NULL)},
79287929
};
79297930

79307931
/* SHUTDOWN abort_selector argument table */
79317932
struct COMMAND_ARG SHUTDOWN_abort_selector_Subargs[] = {
7932-
{MAKE_ARG("save-selector-block",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,3,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_Subargs},
7933+
{MAKE_ARG("save-selector-block",ARG_TYPE_BLOCK,-1,NULL,NULL,NULL,CMD_ARG_NONE,4,NULL),.subargs=SHUTDOWN_abort_selector_save_selector_block_Subargs},
79337934
{MAKE_ARG("abort",ARG_TYPE_PURE_TOKEN,-1,"ABORT",NULL,"7.0.0",CMD_ARG_NONE,0,NULL)},
79347935
};
79357936

src/commands/shutdown.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,13 @@
6161
"token": "FORCE",
6262
"optional": true,
6363
"since": "7.0.0"
64+
},
65+
{
66+
"name": "safe",
67+
"type": "pure-token",
68+
"token": "SAFE",
69+
"optional": true,
70+
"since": "9.0.0"
6471
}
6572
]
6673
},

src/config.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,11 +98,12 @@ configEnum aof_fsync_enum[] = {
9898
{NULL, 0}};
9999

100100
configEnum shutdown_on_sig_enum[] = {
101-
{"default", 0},
101+
{"default", SHUTDOWN_NOFLAGS},
102102
{"save", SHUTDOWN_SAVE},
103103
{"nosave", SHUTDOWN_NOSAVE},
104104
{"now", SHUTDOWN_NOW},
105105
{"force", SHUTDOWN_FORCE},
106+
{"safe", SHUTDOWN_SAFE},
106107
{NULL, 0}};
107108

108109
configEnum repl_diskless_load_enum[] = {

src/db.c

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1325,7 +1325,7 @@ void typeCommand(client *c) {
13251325
addReplyStatus(c, getObjectTypeName(o));
13261326
}
13271327

1328-
/* SHUTDOWN [[NOSAVE | SAVE] [NOW] [FORCE] | ABORT] */
1328+
/* SHUTDOWN [[NOSAVE | SAVE] [NOW] [FORCE] [SAFE] | ABORT] */
13291329
void shutdownCommand(client *c) {
13301330
int flags = SHUTDOWN_NOFLAGS;
13311331
int abort = 0;
@@ -1340,6 +1340,8 @@ void shutdownCommand(client *c) {
13401340
flags |= SHUTDOWN_FORCE;
13411341
} else if (!strcasecmp(c->argv[i]->ptr, "abort")) {
13421342
abort = 1;
1343+
} else if (!strcasecmp(c->argv[i]->ptr, "safe")) {
1344+
flags |= SHUTDOWN_SAFE;
13431345
} else {
13441346
addReplyErrorObject(c, shared.syntaxerr);
13451347
return;

src/server.c

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4493,6 +4493,12 @@ void closeListeningSockets(int unlink_unix_socket) {
44934493
* - SHUTDOWN_FORCE: Ignore errors writing AOF and RDB files on disk, which
44944494
* would normally prevent a shutdown.
44954495
*
4496+
* - SHUTDOWN_SAFE: Shut down only when safe. Note that safe cannot prevent force,
4497+
* in the case of force, safe will print the relevant logs. The definition of
4498+
* safe may be different in different modes. Here are the definitions:
4499+
* * In cluster mode, it is unsafe to shut down a primary with slots, and may
4500+
* cause the cluster to go down.
4501+
*
44964502
* Unless SHUTDOWN_NOW is set and if any replicas are lagging behind, C_ERR is
44974503
* returned and server.shutdown_mstime is set to a timestamp to allow a grace
44984504
* period for the replicas to catch up. This is checked and handled by
@@ -4590,6 +4596,7 @@ int finishShutdown(void) {
45904596
int save = server.shutdown_flags & SHUTDOWN_SAVE;
45914597
int nosave = server.shutdown_flags & SHUTDOWN_NOSAVE;
45924598
int force = server.shutdown_flags & SHUTDOWN_FORCE;
4599+
int safe = server.shutdown_flags & SHUTDOWN_SAFE;
45934600

45944601
/* Log a warning for each replica that is lagging. */
45954602
listIter replicas_iter;
@@ -4612,6 +4619,17 @@ int finishShutdown(void) {
46124619
num_replicas);
46134620
}
46144621

4622+
if (safe && server.cluster_enabled && clusterNodeIsVotingPrimary(getMyClusterNode())) {
4623+
if (force) {
4624+
serverLog(LL_WARNING, "This is a voting primary. Shutting down may cause the cluster to go down. Exit anyway.");
4625+
} else {
4626+
serverLog(LL_WARNING, "This is a voting primary. Shutting down may cause the cluster to go down. Can't exit.");
4627+
if (server.supervised_mode == SUPERVISED_SYSTEMD)
4628+
serverCommunicateSystemd("This is a voting primary. Shutting down may cause the cluster to go down. Can't exit.\n");
4629+
goto error;
4630+
}
4631+
}
4632+
46154633
/* Kill all the Lua debugger forked sessions. */
46164634
ldbKillForkedSessions();
46174635

src/server.h

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -570,12 +570,12 @@ typedef enum {
570570
#define UNIT_MILLISECONDS 1
571571

572572
/* SHUTDOWN flags */
573-
#define SHUTDOWN_NOFLAGS 0 /* No flags. */
574-
#define SHUTDOWN_SAVE 1 /* Force SAVE on SHUTDOWN even if no save \
575-
points are configured. */
576-
#define SHUTDOWN_NOSAVE 2 /* Don't SAVE on SHUTDOWN. */
577-
#define SHUTDOWN_NOW 4 /* Don't wait for replicas to catch up. */
578-
#define SHUTDOWN_FORCE 8 /* Don't let errors prevent shutdown. */
573+
#define SHUTDOWN_NOFLAGS 0 /* No flags. */
574+
#define SHUTDOWN_SAVE (1 << 0) /* Force SAVE on SHUTDOWN even if no save points are configured. */
575+
#define SHUTDOWN_NOSAVE (1 << 1) /* Don't SAVE on SHUTDOWN. */
576+
#define SHUTDOWN_NOW (1 << 2) /* Don't wait for replicas to catch up. */
577+
#define SHUTDOWN_FORCE (1 << 3) /* Don't let errors prevent shutdown. */
578+
#define SHUTDOWN_SAFE (1 << 4) /* Shutdown only when safe. */
579579

580580
/* Command call flags, see call() function */
581581
#define CMD_CALL_NONE 0

tests/unit/cluster/shutdown.tcl

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
start_cluster 3 0 {tags {external:skip cluster shutdown}} {
2+
test "Test shutdown safe is working" {
3+
assert_error {ERR Errors trying to SHUTDOWN*} {R 0 shutdown safe}
4+
assert_error {ERR Errors trying to SHUTDOWN*} {R 1 shutdown safe}
5+
assert_error {ERR Errors trying to SHUTDOWN*} {R 2 shutdown safe}
6+
7+
catch {R 0 shutdown safe force}
8+
wait_for_condition 1000 50 {
9+
[CI 1 cluster_state] eq {fail} &&
10+
[CI 2 cluster_state] eq {fail}
11+
} else {
12+
fail "Cluster doesn't fail"
13+
}
14+
}
15+
}

valkey.conf

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1705,6 +1705,11 @@ aof-timestamp-enabled no
17051705
# nosave: Prevents DB saving operation even if one or more save points are configured.
17061706
# now: Skips waiting for lagging replicas.
17071707
# force: Ignores any errors that would normally prevent the server from exiting.
1708+
# safe: Shut down only when safe. Note that safe cannot prevent force, in the case of
1709+
# force, safe will print the relevant logs. The definition of safe may be different
1710+
# in different modes. Here are the definitions:
1711+
# * In cluster mode, it is unsafe to shut down a primary with slots, and may cause
1712+
# the cluster to go down.
17081713
#
17091714
# Any combination of values is allowed as long as "save" and "nosave" are not set simultaneously.
17101715
# Example: "nosave force now"

0 commit comments

Comments
 (0)