From f138c1b7d1bef47a73dea772d5faad875bc901bf Mon Sep 17 00:00:00 2001 From: Ran Shidlansik Date: Sun, 25 May 2025 20:16:20 +0300 Subject: [PATCH 01/35] Only mark the client reprocessing flag when unblocked on keys (#2109) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When we refactored the blocking framework we introduced the client reprocessing infrastructure. In cases where the client was blocked on keys, it will attempt to reprocess the command. One challenge was to keep track of the command timeout, since we are reprocessing and do not want to re-register the client with a fresh timeout each time. The solution was to consider the client reprocessing flag when the client is blockedOnKeys: ``` if (!c->flag.reprocessing_command) { /* If the client is re-processing the command, we do not set the timeout * because we need to retain the client's original timeout. */ c->bstate.timeout = timeout; } ``` However, this introduced a new issue. There are cases where the client will have consecutive blocking of different types, for example: ``` CLIENT PAUSE 10000 ALL BZPOPMAX zset 1 ``` would have the client blocked on the zset endlessly if nothing is written to it. **Credits to @uriyage for locating this with his fuzzer testing** The suggested solution is to only flag the client when it is specifically unblocked on keys. 
--------- Signed-off-by: Ran Shidlansik Co-authored-by: Binbin (cherry picked from commit d00fb8e713d651f7c42a36f052a4fd12a9637e0e) Signed-off-by: Viktor Söderqvist --- src/blocked.c | 5 ++++- src/server.c | 13 ++----------- src/server.h | 5 ++--- tests/unit/type/list.tcl | 23 +++++++++++++++++++++++ 4 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index 3496b21d40..0632713fab 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -374,7 +374,7 @@ void blockForKeys(client *c, int btype, robj **keys, int numkeys, mstime_t timeo list *l; int j; - if (!c->flag.reprocessing_command) { + if (!c->flag.reexecuting_command) { /* If the client is re-processing the command, we do not set the timeout * because we need to retain the client's original timeout. */ c->bstate.timeout = timeout; @@ -648,6 +648,7 @@ static void unblockClientOnKey(client *c, robj *key) { * we need to re process the command again */ if (c->flag.pending_command) { c->flag.pending_command = 0; + c->flag.reexecuting_command = 1; /* We want the command processing and the unblock handler (see RM_Call 'K' option) * to run atomically, this is why we must enter the execution unit here before * running the command, and exit the execution unit after calling the unblock handler (if exists). @@ -666,6 +667,8 @@ static void unblockClientOnKey(client *c, robj *key) { } exitExecutionUnit(); afterCommand(c); + /* Clear the reexecuting_command flag after the proc is executed. */ + c->flag.reexecuting_command = 0; server.current_client = old_client; } } diff --git a/src/server.c b/src/server.c index eb9cf04e82..c0e792c6f2 100644 --- a/src/server.c +++ b/src/server.c @@ -3498,7 +3498,7 @@ void call(client *c, int flags) { * and a client which is reprocessing command again (after being unblocked). * Blocked clients can be blocked in different places and not always it means the call() function has been * called. 
For example this is required for avoiding double logging to monitors.*/ - int reprocessing_command = flags & CMD_CALL_REPROCESSING; + int reprocessing_command = c->flag.reexecuting_command ? 1 : 0; /* Initialization: clear the flags that must be set by the command on * demand, and initialize the array for additional commands propagation. */ @@ -3526,20 +3526,12 @@ void call(client *c, int flags) { * In case of blocking commands, the flag will be un-set only after successfully * re-processing and unblock the client.*/ c->flag.executing_command = 1; - - /* Setting the CLIENT_REPROCESSING_COMMAND flag so that during the actual - * processing of the command proc, the client is aware that it is being - * re-processed. */ - if (reprocessing_command) c->flag.reprocessing_command = 1; - + monotime monotonic_start = 0; if (monotonicGetType() == MONOTONIC_CLOCK_HW) monotonic_start = getMonotonicUs(); c->cmd->proc(c); - /* Clear the CLIENT_REPROCESSING_COMMAND flag after the proc is executed. */ - if (reprocessing_command) c->flag.reprocessing_command = 0; - exitExecutionUnit(); /* In case client is blocked after trying to execute the command, @@ -4162,7 +4154,6 @@ int processCommand(client *c) { addReply(c, shared.queued); } else { int flags = CMD_CALL_FULL; - if (client_reprocessing_command) flags |= CMD_CALL_REPROCESSING; call(c, flags); if (listLength(server.ready_keys) && !isInsideYieldingLongCommand()) handleClientsBlockedOnKeys(); } diff --git a/src/server.h b/src/server.h index f2fce0d328..3ca5b4e0ac 100644 --- a/src/server.h +++ b/src/server.h @@ -565,8 +565,7 @@ typedef enum { #define CMD_CALL_NONE 0 #define CMD_CALL_PROPAGATE_AOF (1 << 0) #define CMD_CALL_PROPAGATE_REPL (1 << 1) -#define CMD_CALL_REPROCESSING (1 << 2) -#define CMD_CALL_FROM_MODULE (1 << 3) /* From RM_Call */ +#define CMD_CALL_FROM_MODULE (1 << 2) /* From RM_Call */ #define CMD_CALL_PROPAGATE (CMD_CALL_PROPAGATE_AOF | CMD_CALL_PROPAGATE_REPL) #define CMD_CALL_FULL (CMD_CALL_PROPAGATE) @@ -1222,7 
+1221,7 @@ typedef struct ClientFlags { from the Module. */ uint64_t module_prevent_aof_prop : 1; /* Module client do not want to propagate to AOF */ uint64_t module_prevent_repl_prop : 1; /* Module client do not want to propagate to replica */ - uint64_t reprocessing_command : 1; /* The client is re-processing the command. */ + uint64_t reexecuting_command : 1; /* The client is re-executing the command. */ uint64_t replication_done : 1; /* Indicate that replication has been done on the client */ uint64_t authenticated : 1; /* Indicate a client has successfully authenticated */ uint64_t ever_authenticated : 1; /* Indicate a client was ever successfully authenticated during it's lifetime */ diff --git a/tests/unit/type/list.tcl b/tests/unit/type/list.tcl index 83a93bffbf..0aa1a1f9f1 100644 --- a/tests/unit/type/list.tcl +++ b/tests/unit/type/list.tcl @@ -2430,4 +2430,27 @@ foreach {pop} {BLPOP BLMPOP_RIGHT} { close_replication_stream $repl } {} {needs:repl} + test "Blocking timeout following PAUSE should honor the timeout" { + # cleanup first + r del mylist + + # create a test client + set rd [valkey_deferring_client] + + # first PAUSE all writes for a very long time + r client pause 10000000000000 write + + # block a client on the list + $rd BLPOP mylist 1 + wait_for_blocked_clients_count 1 + + # now unpause the writes + r client unpause + + # client should time-out + wait_for_blocked_clients_count 0 + + $rd close + } + } ;# stop servers From f7f6749e9e2d516ea570c558ede92b85b46b82d7 Mon Sep 17 00:00:00 2001 From: uriyage <78144248+uriyage@users.noreply.github.com> Date: Sun, 25 May 2025 16:36:34 +0300 Subject: [PATCH 02/35] Fix bad slot used in sharded pubsub unsubscribe (#2137) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit fixes two issues in `pubsubUnsubscribeChannel` that could lead to memory corruption: 1. 
When calculating the slot for a channel, we were using getKeySlot() which might use the current_client's slot if available. This is problematic when a client kills another client (e.g., via CLIENT KILL command) as the slot won't match the channel's actual slot. 2. The `found` variable was not initialized to `NULL`, causing the serverAssert to potentially pass incorrectly when the hashtable lookup failed, leading to memory corruption in subsequent operations. The fix: - Calculate the slot directly from the channel name using keyHashSlot() instead of relying on the current client's slot - Initialize 'found' to NULL Added a test case that reproduces the issue by having one client kill another client that is subscribed to a sharded pubsub channel during a transaction. Crash log (After initializing the variable 'found' to null, without initialization, memory corruption could occur): ``` VALKEY BUG REPORT START: Cut & paste starting from here === 59707:M 24 May 2025 23:10:40.429 # === ASSERTION FAILED CLIENT CONTEXT === 59707:M 24 May 2025 23:10:40.429 # client->flags = 108086391057154048 59707:M 24 May 2025 23:10:40.429 # client->conn = fd=11 59707:M 24 May 2025 23:10:40.429 # client->argc = 0 59707:M 24 May 2025 23:10:40.429 # === RECURSIVE ASSERTION FAILED === 59707:M 24 May 2025 23:10:40.429 # ==> pubsub.c:348 'found' is not true ------ STACK TRACE ------ Backtrace: 0 valkey-server 0x0000000104974054 _serverAssertWithInfo + 112 1 valkey-server 0x000000010496c7fc pubsubUnsubscribeChannel + 268 2 valkey-server 0x000000010496cea0 pubsubUnsubscribeAllChannelsInternal + 216 3 valkey-server 0x000000010496c2e0 pubsubUnsubscribeShardAllChannels + 76 4 valkey-server 0x000000010496c1d4 freeClientPubSubData + 60 5 valkey-server 0x00000001048f3cbc freeClient + 792 6 valkey-server 0x0000000104900870 clientKillCommand + 356 7 valkey-server 0x00000001048d1790 call + 428 8 valkey-server 0x000000010496ef4c execCommand + 872 9 valkey-server 0x00000001048d1790 call + 428 10 
valkey-server 0x00000001048d3a44 processCommand + 5056 11 valkey-server 0x00000001048fdc20 processCommandAndResetClient + 64 12 valkey-server 0x00000001048fdeac processInputBuffer + 276 13 valkey-server 0x00000001048f2ff0 readQueryFromClient + 148 14 valkey-server 0x0000000104a182e8 callHandler + 60 15 valkey-server 0x0000000104a1731c connSocketEventHandler + 488 16 valkey-server 0x00000001048b5e80 aeProcessEvents + 820 17 valkey-server 0x00000001048b6598 aeMain + 64 18 valkey-server 0x00000001048dcecc main + 4084 19 dyld 0x0000000186b34274 start + 2840 ```` --------- Signed-off-by: Uri Yagelnik (cherry picked from commit bd5dcb2819cd35f2e346f94ed26a8e1182f5c245) Signed-off-by: Viktor Söderqvist --- src/pubsub.c | 4 +++- tests/unit/cluster/sharded-pubsub.tcl | 25 +++++++++++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/src/pubsub.c b/src/pubsub.c index 5b037b5721..ad3429f8b4 100644 --- a/src/pubsub.c +++ b/src/pubsub.c @@ -307,7 +307,9 @@ int pubsubUnsubscribeChannel(client *c, robj *channel, int notify, pubsubtype ty retval = 1; /* Remove the client from the channel -> clients list hash table */ if (server.cluster_enabled && type.shard) { - slot = getKeySlot(channel->ptr); + /* Using keyHashSlot directly because we can't rely on the current_client's slot via getKeySlot() here, + * as it might differ from the channel's slot. 
*/ + slot = keyHashSlot(channel->ptr, (int)sdslen(channel->ptr)); } de = kvstoreDictFind(*type.serverPubSubChannels, slot, channel); serverAssertWithInfo(c, NULL, de != NULL); diff --git a/tests/unit/cluster/sharded-pubsub.tcl b/tests/unit/cluster/sharded-pubsub.tcl index b5b19ff481..4bbaab20c2 100644 --- a/tests/unit/cluster/sharded-pubsub.tcl +++ b/tests/unit/cluster/sharded-pubsub.tcl @@ -53,4 +53,29 @@ start_cluster 1 1 {tags {external:skip cluster}} { catch {[$replica EXEC]} err assert_match {EXECABORT*} $err } + + test "SSUBSCRIBE client killed during transaction" { + # Create two clients + set rd1 [valkey_deferring_client $primary_id] + + # Get client 1 ID + $rd1 client id + set rd1_id [$rd1 read] + # Client1 subscribes to a shard channel + $rd1 ssubscribe channel0 + + # Wait for the subscription to be acknowledged + assert_equal {ssubscribe channel0 1} [$rd1 read] + + # Client2 starts a transaction + assert_equal {OK} [$primary multi] + + # sets a key so that its slot will be set to the slot of that key. + assert_equal {QUEUED} [$primary set k v] + # Kill client1 inside client2's transaction + assert_equal {QUEUED} [$primary client kill id $rd1_id] + + # Execute the transaction + assert_equal {OK 1} [$primary exec] "Transaction execution should return OK and kill count" + } } \ No newline at end of file From 31b9985fc728577ec4855f63f1095258ae5aa0a5 Mon Sep 17 00:00:00 2001 From: Jacob Murphy Date: Tue, 27 May 2025 17:56:10 +0000 Subject: [PATCH 03/35] Free module context even if there was no content written in auxsave2 (#2132) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a module saves to the Aux metadata with aux_save2 callback, the module context is not freed if the module didn't save anything in the callback. https://github.com/valkey-io/valkey/blob/8.1.1/src/rdb.c#L1277-L1284. 
Note that we return 0, however we should be doing the cleanup done on https://github.com/valkey-io/valkey/blob/8.1.1/src/rdb.c#L1300-L1303 still. Fixes #2125 Signed-off-by: Jacob Murphy (cherry picked from commit 258213ff7e305b7bd77549577712084cd2d9e512) Signed-off-by: Viktor Söderqvist --- src/rdb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/rdb.c b/src/rdb.c index cc131cfa61..af68b67e78 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1267,6 +1267,10 @@ ssize_t rdbSaveSingleModuleAux(rio *rdb, int when, moduleType *mt) { * to allow loading this RDB if the module is not present. */ sdsfree(io.pre_flush_buffer); io.pre_flush_buffer = NULL; + if (io.ctx) { + moduleFreeContext(io.ctx); + zfree(io.ctx); + } return 0; } } else { From 6010dc520ca832c6153d2687e6a6b244d66e924c Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 10 Jun 2025 10:29:25 +0800 Subject: [PATCH 04/35] CLIENT UNBLOCK shouldn't be able to unpause paused clients (#2117) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a client is blocked by something like `CLIENT PAUSE`, we should not allow `CLIENT UNBLOCK timeout` to unblock it, since some blocking types do not have the timeout callback, it will trigger a panic in the core, people should use `CLIENT UNPAUSE` to unblock it. Also using `CLIENT UNBLOCK error` is not right, it will return an UNBLOCKED error to the command, people don't expect a `SET` command to get an error. So in this commit, in these cases, we will return 0 to `CLIENT UNBLOCK` to indicate the unblock failed. The reason is that we assume that if a command doesn't expect to be timedout, it also doesn't expect to be unblocked by `CLIENT UNBLOCK`. The old behavior of the following command will trigger panic in timeout and get UNBLOCKED error in error. Under the new behavior, client unblock will get the result of 0. 
``` client 1> client pause 100000 write client 2> set x x client 1> client unblock 2 timeout or client 1> client unblock 2 error ``` Potentially breaking change, previously allowed `CLIENT UNBLOCK error`. Fixes #2111. Signed-off-by: Binbin (cherry picked from commit 3bc40be6cdef24f9db8a30e5f8659ea5c404027a) Signed-off-by: Viktor Söderqvist --- src/blocked.c | 16 ++++++++++++++++ src/networking.c | 5 +++-- src/server.h | 1 + tests/unit/pause.tcl | 29 +++++++++++++++++++++++++++++ 4 files changed, 49 insertions(+), 2 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index 0632713fab..e9e3ad119f 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -227,6 +227,22 @@ void unblockClient(client *c, int queue_for_reprocessing) { if (queue_for_reprocessing) queueClientForReprocessing(c); } +/* Check if the specified client can be safely timed out using + * unblockClientOnTimeout(). */ +int blockedClientMayTimeout(client *c) { + if (c->bstate.btype == BLOCKED_MODULE) { + return moduleBlockedClientMayTimeout(c); + } + + if (c->bstate.btype == BLOCKED_LIST || + c->bstate.btype == BLOCKED_ZSET || + c->bstate.btype == BLOCKED_STREAM || + c->bstate.btype == BLOCKED_WAIT) { + return 1; + } + return 0; +} + /* This function gets called when a blocked client timed out in order to * send it a reply of some kind. After this function is called, * unblockClient() will be called with the same client as argument. 
*/ diff --git a/src/networking.c b/src/networking.c index ab2df89a59..1fb8bf239e 100644 --- a/src/networking.c +++ b/src/networking.c @@ -3775,11 +3775,12 @@ NULL } if (getLongLongFromObjectOrReply(c, c->argv[2], &id, NULL) != C_OK) return; struct client *target = lookupClientByID(id); - /* Note that we never try to unblock a client blocked on a module command, which + /* Note that we never try to unblock a client blocked on a module command, + * or a client blocked by CLIENT PAUSE or some other blocking type which * doesn't have a timeout callback (even in the case of UNBLOCK ERROR). * The reason is that we assume that if a command doesn't expect to be timedout, * it also doesn't expect to be unblocked by CLIENT UNBLOCK */ - if (target && target->flag.blocked && moduleBlockedClientMayTimeout(target)) { + if (target && target->flag.blocked && blockedClientMayTimeout(target)) { if (unblock_error) unblockClientOnError(target, "-UNBLOCKED client unblocked via CLIENT UNBLOCK"); else diff --git a/src/server.h b/src/server.h index 3ca5b4e0ac..f4636f88fe 100644 --- a/src/server.h +++ b/src/server.h @@ -3672,6 +3672,7 @@ void unblockClient(client *c, int queue_for_reprocessing); void unblockClientOnTimeout(client *c); void unblockClientOnError(client *c, const char *err_str); void queueClientForReprocessing(client *c); +int blockedClientMayTimeout(client *c); void replyToBlockedClientTimedOut(client *c); int getTimeoutFromObjectOrReply(client *c, robj *object, mstime_t *timeout, int unit); void disconnectAllBlockedClients(void); diff --git a/tests/unit/pause.tcl b/tests/unit/pause.tcl index 121c603324..63ff0b4e7a 100644 --- a/tests/unit/pause.tcl +++ b/tests/unit/pause.tcl @@ -405,7 +405,36 @@ start_server {tags {"pause network"}} { r client unpause assert_equal [r randomkey] {} + } + + test "CLIENT UNBLOCK is not allow to unblock client blocked by CLIENT PAUSE" { + set rd1 [valkey_deferring_client] + set rd2 [valkey_deferring_client] + $rd1 client id + $rd2 client id + 
set client_id1 [$rd1 read] + set client_id2 [$rd2 read] + + r del mylist + r client pause 100000 write + $rd1 blpop mylist 0 + $rd2 blpop mylist 0 + wait_for_blocked_clients_count 2 50 100 + # This used to trigger a panic. + assert_equal 0 [r client unblock $client_id1 timeout] + # THis used to return a UNBLOCKED error. + assert_equal 0 [r client unblock $client_id2 error] + + # After the unpause, it must be able to unblock the client. + r client unpause + assert_equal 1 [r client unblock $client_id1 timeout] + assert_equal 1 [r client unblock $client_id2 error] + assert_equal {} [$rd1 read] + assert_error "UNBLOCKED*" {$rd2 read} + + $rd1 close + $rd2 close } # Make sure we unpause at the end From 5ebadd9fbe9597a3cd181c53117e8266592bba2e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Tue, 27 May 2025 20:31:12 +0200 Subject: [PATCH 05/35] Detect SSL_new() returning NULL in outgoing connections (#2140) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When creating an outgoing TLS connection, we don't check if `SSL_new()` returned NULL. Without this patch, the check was done only for incoming connections in `connCreateAcceptedTLS()`. This patch moves the check to `createTLSConnection()` which is used both for incoming and outgoing connections. This check makes sure we fail the connection before going any further, e.g. when `connCreate()` is followed by `connConnect()`, the latter returns `C_ERR` which is commonly detected where outgoing connections are established, such where a replica connects to a primary. 
```c int connectWithPrimary(void) { server.repl_transfer_s = connCreate(connTypeOfReplication()); if (connConnect(server.repl_transfer_s, server.primary_host, server.primary_port, server.bind_source_addr, server.repl_mptcp, syncWithPrimary) == C_ERR) { serverLog(LL_WARNING, "Unable to connect to PRIMARY: %s", connGetLastError(server.repl_transfer_s)); connClose(server.repl_transfer_s); server.repl_transfer_s = NULL; return C_ERR; } ``` For a more thorough explanation, see https://github.com/valkey-io/valkey/issues/1939#issuecomment-2912177877. Might fix #1939. Signed-off-by: Viktor Söderqvist (cherry picked from commit 17e66863a5f9d6065544d0b3d655ba8f40081570) Signed-off-by: Viktor Söderqvist --- src/tls.c | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/src/tls.c b/src/tls.c index a1f101d9c5..f5ceeadfaa 100644 --- a/src/tls.c +++ b/src/tls.c @@ -456,6 +456,14 @@ typedef struct tls_connection { size_t last_failed_write_data_len; } tls_connection; +/* Fetch the latest OpenSSL error and store it in the connection */ +static void updateTLSError(tls_connection *conn) { + conn->c.last_errno = 0; + if (conn->ssl_error) zfree(conn->ssl_error); + conn->ssl_error = zmalloc(512); + ERR_error_string_n(ERR_get_error(), conn->ssl_error, 512); +} + static connection *createTLSConnection(int client_side) { SSL_CTX *ctx = valkey_tls_ctx; if (client_side && valkey_tls_client_ctx) ctx = valkey_tls_client_ctx; @@ -464,6 +472,10 @@ static connection *createTLSConnection(int client_side) { conn->c.fd = -1; conn->c.iovcnt = IOV_MAX; conn->ssl = SSL_new(ctx); + if (!conn->ssl) { + updateTLSError(conn); + conn->c.state = CONN_STATE_ERROR; + } return (connection *)conn; } @@ -471,14 +483,6 @@ static connection *connCreateTLS(void) { return createTLSConnection(1); } -/* Fetch the latest OpenSSL error and store it in the connection */ -static void updateTLSError(tls_connection *conn) { - conn->c.last_errno = 0; - if 
(conn->ssl_error) zfree(conn->ssl_error); - conn->ssl_error = zmalloc(512); - ERR_error_string_n(ERR_get_error(), conn->ssl_error, 512); -} - /* Create a new TLS connection that is already associated with * an accepted underlying file descriptor. * @@ -492,14 +496,9 @@ static connection *connCreateAcceptedTLS(int fd, void *priv) { int require_auth = *(int *)priv; tls_connection *conn = (tls_connection *)createTLSConnection(0); conn->c.fd = fd; + if (conn->c.state == CONN_STATE_ERROR) return (connection *)conn; conn->c.state = CONN_STATE_ACCEPTING; - if (!conn->ssl) { - updateTLSError(conn); - conn->c.state = CONN_STATE_ERROR; - return (connection *)conn; - } - switch (require_auth) { case TLS_CLIENT_AUTH_NO: SSL_set_verify(conn->ssl, SSL_VERIFY_NONE, NULL); break; case TLS_CLIENT_AUTH_OPTIONAL: SSL_set_verify(conn->ssl, SSL_VERIFY_PEER, NULL); break; From 4234a66e444d954e0042d95f3abd38ead08f88ac Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Wed, 28 May 2025 19:15:07 +0100 Subject: [PATCH 06/35] Correctly cast the extension lengths (#2144) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Correctly use a 32 bit integer for accumulating the length of ping extensions. The current code may accidentally truncate the length of an extension that is greater than 64kb and fail the validation check. We don't currently emit any extensions that are this length, but if we were to do so in the future we might have issues with older nodes (without this fix) will silently drop packets from newer nodes. We should backport this to all versions. 
Signed-off-by: Madelyn Olson (cherry picked from commit 30d7f08a4e62d2686f69ef21615adf27b0c42a75) Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 087bf3758d..52d285a594 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3034,7 +3034,7 @@ int clusterIsValidPacket(clusterLink *link) { if (hdr->mflags[0] & CLUSTERMSG_FLAG0_EXT_DATA) { clusterMsgPingExt *ext = getInitialPingExt(hdr, count); while (extensions--) { - uint16_t extlen = getPingExtLength(ext); + uint32_t extlen = getPingExtLength(ext); if (extlen % 8 != 0) { serverLog(LL_WARNING, "Received a %s packet without proper padding (%d bytes)", clusterGetMessageTypeString(type), (int)extlen); From 05213cf8fd55cc8794952519bc59d36ca65a337b Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Thu, 29 May 2025 00:06:23 +0100 Subject: [PATCH 07/35] Incorporate Redis CVE for CVE-2025-27151 (#2146) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Resolves https://github.com/valkey-io/valkey/issues/2145 Incorporate the CVE patch that was sent to us by Redis Ltd. --------- Signed-off-by: Madelyn Olson Co-authored-by: Ping Xie (cherry picked from commit 73696bf6e2cf754acc3ec24eaf9ca6b879bfc5d7) Signed-off-by: Viktor Söderqvist --- src/valkey-check-aof.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/valkey-check-aof.c b/src/valkey-check-aof.c index bc71d366d5..c43c30b75b 100644 --- a/src/valkey-check-aof.c +++ b/src/valkey-check-aof.c @@ -556,6 +556,12 @@ int redis_check_aof_main(int argc, char **argv) { goto invalid_args; } + /* Check if filepath is longer than PATH_MAX */ + if (strnlen(filepath, PATH_MAX + 1) > PATH_MAX) { + printf("Error: filepath is too long (exceeds PATH_MAX)\n"); + goto invalid_args; + } + /* In the glibc implementation dirname may modify their argument. 
*/ memcpy(temp_filepath, filepath, strlen(filepath) + 1); dirpath = dirname(temp_filepath); From 879c55dd3893ce8668038b10ca1e36df9cfc2b9f Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 10 Jun 2025 09:30:16 +0800 Subject: [PATCH 08/35] Fix replica can't finish failover when config epoch is outdated (#2178) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the primary changes the config epoch and then goes down immediately, the replica may not update the config epoch in time. Although we will broadcast the change in cluster (see #1813), there may be a race in the network or in the code. In this case, the replica will never finish the failover since other primaries will refuse to vote because the replica's slot config epoch is old. We need a way to allow the replica to finish the failover in this case. When the primary refuses to vote because the replica's config epoch is less than the dead primary's config epoch, it can send an UPDATE packet to the replica to inform the replica about the dead primary. The UPDATE message contains information about the dead primary's config epoch and owned slots. The failover will time out, but later the replica can try again with the updated config epoch and it can succeed. Fixes #2169. 
--------- Signed-off-by: Binbin Signed-off-by: Harkrishn Patro Co-authored-by: Viktor Söderqvist Co-authored-by: Harkrishn Patro Co-authored-by: Madelyn Olson (cherry picked from commit 476671be194e4296b9100663560193069fa46453) Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 17 +++- tests/support/cluster_util.tcl | 6 ++ tests/unit/cluster/failover2.tcl | 169 +++++++++++++++++++++++++++++++ 3 files changed, 189 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 52d285a594..49cd058c73 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3484,9 +3484,10 @@ int clusterProcessPacket(clusterLink *link) { if (server.cluster->slots[j] == sender || isSlotUnclaimed(j)) continue; if (server.cluster->slots[j]->configEpoch > sender_claimed_config_epoch) { serverLog(LL_VERBOSE, - "Node %.40s has old slots configuration, sending " - "an UPDATE message about %.40s", - sender->name, server.cluster->slots[j]->name); + "Node %.40s (%s) has old slots configuration, sending " + "an UPDATE message about %.40s (%s)", + sender->name, sender->human_nodename, + server.cluster->slots[j]->name, server.cluster->slots[j]->human_nodename); clusterSendUpdate(sender->link, server.cluster->slots[j]); /* TODO: instead of exiting the loop send every other @@ -4395,6 +4396,16 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { "slot %d epoch (%llu) > reqEpoch (%llu)", node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch, (unsigned long long)requestConfigEpoch); + + /* Send an UPDATE message to the replica. After receiving the UPDATE message, + * the replica will update the slots config so that it can initiate a failover + * again later. Otherwise the replica will never get votes if the primary is down. 
*/ serverLog(LL_VERBOSE, "Node %.40s (%s) has old slots configuration, sending " "an UPDATE message about %.40s (%s)", node->name, node->human_nodename, server.cluster->slots[j]->name, server.cluster->slots[j]->human_nodename); + clusterSendUpdate(node->link, server.cluster->slots[j]); return; } diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index 3f31fd5a3b..b2a2754680 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -209,6 +209,12 @@ proc cluster_allocate_replicas {masters replicas} { # Setup method to be executed to configure the cluster before the # tests run. proc cluster_setup {masters replicas node_count slot_allocator replica_allocator code} { + set config_epoch 1 + for {set i 0} {$i < $node_count} {incr i} { + R $i CLUSTER SET-CONFIG-EPOCH $config_epoch + incr config_epoch + } + # Have all nodes meet if {$::tls} { set tls_cluster [lindex [R 0 CONFIG GET tls-cluster] 1] diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 7bc6a05e95..284df1cc4e 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -64,3 +64,172 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval } } ;# start_cluster + +start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} { + test "Primaries will not time out when they are elected in the same epoch" { + # Since we have the delay time, these nodes may not initiate the + # election at the same time (same epoch). But if they do, we make + # sure there is no failover timeout. + + # Killing three primary nodes. 
+ pause_process [srv 0 pid] + pause_process [srv -1 pid] + pause_process [srv -2 pid] + + # Wait for the failover + wait_for_condition 1000 50 { + [s -7 role] == "master" && + [s -8 role] == "master" && + [s -9 role] == "master" + } else { + fail "No failover detected" + } + + # Make sure there is no false epoch 0. + verify_no_log_message -7 "*Failover election in progress for epoch 0*" 0 + verify_no_log_message -8 "*Failover election in progress for epoch 0*" 0 + verify_no_log_message -9 "*Failover election in progress for epoch 0*" 0 + + # Make sure there is no failover timeout. + verify_no_log_message -7 "*Failover attempt expired*" 0 + verify_no_log_message -8 "*Failover attempt expired*" 0 + verify_no_log_message -9 "*Failover attempt expired*" 0 + + # Resuming these primary nodes, speed up the shutdown. + resume_process [srv 0 pid] + resume_process [srv -1 pid] + resume_process [srv -2 pid] + } +} ;# start_cluster + +run_solo {cluster} { + start_cluster 32 15 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} { + test "Multiple primary nodes are down, rank them based on the failed primary" { + # Killing these primary nodes. + for {set j 0} {$j < 15} {incr j} { + pause_process [srv -$j pid] + } + + # Make sure that a node starts failover. + wait_for_condition 1000 100 { + [s -40 role] == "master" + } else { + fail "No failover detected" + } + + # Wait for the cluster state to become ok. + for {set j 0} {$j < [llength $::servers]} {incr j} { + if {[process_is_paused [srv -$j pid]]} continue + wait_for_condition 1000 100 { + [CI $j cluster_state] eq "ok" + } else { + fail "Cluster node $j cluster_state:[CI $j cluster_state]" + } + } + + # Resuming these primary nodes, speed up the shutdown. 
+ for {set j 0} {$j < 15} {incr j} { + resume_process [srv -$j pid] + } + } + } ;# start_cluster +} ;# run_solo + +# Needs to run in the body of +# start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} +proc test_replica_config_epoch_failover {type} { + test "Replica can update the config epoch when trigger the failover - $type" { + set CLUSTER_PACKET_TYPE_NONE -1 + set CLUSTER_PACKET_TYPE_ALL -2 + + if {$type == "automatic"} { + R 3 CONFIG SET cluster-replica-no-failover no + } elseif {$type == "manual"} { + R 3 CONFIG SET cluster-replica-no-failover yes + } + R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL + + set R0_nodeid [R 0 cluster myid] + + # R 0 is the first node, so we expect its epoch to be the smallest, + # so bumpepoch must succeed and it's config epoch will be changed. + set res [R 0 cluster bumpepoch] + assert_match {BUMPED *} $res + set R0_config_epoch [lindex $res 1] + + # Wait for the config epoch to propagate across the cluster. + wait_for_condition 1000 10 { + $R0_config_epoch == [dict get [cluster_get_node_by_id 1 $R0_nodeid] config_epoch] && + $R0_config_epoch == [dict get [cluster_get_node_by_id 2 $R0_nodeid] config_epoch] + } else { + fail "Other primaries does not update config epoch" + } + # Make sure that replica do not update config epoch. + assert_not_equal $R0_config_epoch [dict get [cluster_get_node_by_id 3 $R0_nodeid] config_epoch] + + # Pause the R 0 and wait for the cluster to be down. + pause_process [srv 0 pid] + R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + wait_for_condition 1000 50 { + [CI 1 cluster_state] == "fail" && + [CI 2 cluster_state] == "fail" && + [CI 3 cluster_state] == "fail" + } else { + fail "Cluster does not fail" + } + + # Make sure both the automatic and the manual failover will fail in the first time. 
+ if {$type == "automatic"} { + wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1000 10 + } elseif {$type == "manual"} { + R 3 cluster failover force + wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 10 + } + + # Make sure the primaries prints the relevant logs. + wait_for_log_messages -1 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10 + wait_for_log_messages -1 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10 + wait_for_log_messages -2 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10 + wait_for_log_messages -2 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10 + + # Make sure the replica has updated the config epoch. + wait_for_condition 1000 10 { + $R0_config_epoch == [dict get [cluster_get_node_by_id 1 $R0_nodeid] config_epoch] + } else { + fail "The replica does not update the config epoch" + } + + if {$type == "manual"} { + # The second manual failure will succeed because the config epoch + # has already propagated. + R 3 cluster failover force + } + + # Wait for the failover to success. 
+ wait_for_condition 1000 50 { + [s -3 role] == "master" && + [CI 1 cluster_state] == "ok" && + [CI 2 cluster_state] == "ok" && + [CI 3 cluster_state] == "ok" + } else { + fail "Failover does not happen" + } + + # Restore the old primary, make sure it can covert + resume_process [srv 0 pid] + wait_for_condition 1000 50 { + [s 0 role] == "slave" && + [CI 0 cluster_state] == "ok" + } else { + fail "The old primary was not converted into replica" + } + } +} + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} { + test_replica_config_epoch_failover "automatic" +} + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} { + test_replica_config_epoch_failover "manual" +} From 7b12b4ab2567209727e7c12bfab18ff3e64ff2cf Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 10 Jun 2025 10:31:46 +0800 Subject: [PATCH 09/35] Fix cluster myself CLUSTER SLOTS/NODES wrong port after updating port/tls-port (#2186) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When modifying port or tls-port through config set, we need to call clusterUpdateMyselfAnnouncedPorts to update myself's port, otherwise CLUSTER SLOTS/NODES will be old information from myself's perspective. In addition, in some places, such as clusterUpdateMyselfAnnouncedPorts and clusterUpdateMyselfIp, beforeSleep save is added so that the new ip info can be updated to nodes.conf. Remove clearCachedClusterSlotsResponse in updateClusterAnnouncedPort since now we add beforeSleep save in clusterUpdateMyselfAnnouncedPorts, and it will call clearCachedClusterSlotsResponse. 
Signed-off-by: Binbin (cherry picked from commit 5cc2b2575300f09c004bf73f0a5f69ed1775c00c) Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 2 ++ src/config.c | 3 +- tests/unit/cluster/announced-endpoints.tcl | 39 ++++++++++++++++++++++ 3 files changed, 43 insertions(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 49cd058c73..49047d4acb 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -970,6 +970,7 @@ void clusterUpdateMyselfFlags(void) { void clusterUpdateMyselfAnnouncedPorts(void) { if (!myself) return; deriveAnnouncedPorts(&myself->tcp_port, &myself->tls_port, &myself->cport); + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } /* We want to take myself->ip in sync with the cluster-announce-ip option. @@ -1000,6 +1001,7 @@ void clusterUpdateMyselfIp(void) { } else { myself->ip[0] = '\0'; /* Force autodetection. */ } + clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG); } } diff --git a/src/config.c b/src/config.c index 2d4e703d95..0f0836cc0f 100644 --- a/src/config.c +++ b/src/config.c @@ -2445,6 +2445,7 @@ static int updatePort(const char **err) { listener->bindaddr_count = server.bindaddr_count; listener->port = server.port; listener->ct = connectionByType(CONN_TYPE_SOCKET); + clusterUpdateMyselfAnnouncedPorts(); if (changeListener(listener) == C_ERR) { *err = "Unable to listen on this port. Check server logs."; return 0; @@ -2632,7 +2633,6 @@ int updateClusterFlags(const char **err) { static int updateClusterAnnouncedPort(const char **err) { UNUSED(err); clusterUpdateMyselfAnnouncedPorts(); - clearCachedClusterSlotsResponse(); return 1; } @@ -2691,6 +2691,7 @@ static int applyTLSPort(const char **err) { listener->bindaddr_count = server.bindaddr_count; listener->port = server.tls_port; listener->ct = connectionByType(CONN_TYPE_TLS); + clusterUpdateMyselfAnnouncedPorts(); if (changeListener(listener) == C_ERR) { *err = "Unable to listen on this port. 
Check server logs."; return 0; diff --git a/tests/unit/cluster/announced-endpoints.tcl b/tests/unit/cluster/announced-endpoints.tcl index b44a2a0e95..47ca8d2e41 100644 --- a/tests/unit/cluster/announced-endpoints.tcl +++ b/tests/unit/cluster/announced-endpoints.tcl @@ -53,4 +53,43 @@ start_cluster 2 2 {tags {external:skip cluster}} { R 0 config set cluster-announce-bus-port 0 assert_match "*@$base_bus_port *" [R 0 CLUSTER NODES] } + + test "Test change port and tls-port on runtime" { + if {$::tls} { + set baseport [lindex [R 0 config get tls-port] 1] + } else { + set baseport [lindex [R 0 config get port] 1] + } + set count [expr [llength $::servers] + 1] + set used_port [find_available_port $baseport $count] + + # We execute CLUSTER SLOTS command to trigger the `debugServerAssertWithInfo` in `clusterCommandSlots` function, ensuring + # that the cached response is invalidated upon updating any of port or tls-port. + R 0 CLUSTER SLOTS + R 1 CLUSTER SLOTS + + # Set port or tls-port to ensure changes are consistent across the cluster. + if {$::tls} { + R 0 config set tls-port $used_port + } else { + R 0 config set port $used_port + } + # Make sure changes in myself node's view are consistent. + assert_match "*:$used_port@*" [R 0 CLUSTER NODES] + assert_match "*$used_port*" [R 0 CLUSTER SLOTS] + # Make sure changes in other node's view are consistent. + wait_for_condition 50 100 { + [string match "*:$used_port@*" [R 1 CLUSTER NODES]] && + [string match "*$used_port*" [R 1 CLUSTER SLOTS]] + } else { + fail "Node port was not propagated via gossip" + } + + # Restore the original configuration item value. 
+ if {$::tls} { + R 0 config set tls-port $baseport + } else { + R 0 config set port $baseport + } + } } From 7aaea8929de71dc858a14b348f94230c00d68c7f Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 11 Jun 2025 17:01:06 +0800 Subject: [PATCH 10/35] Add packet-drop to fix the new flaky failover test (#2196) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new test was added in #2178, obviously there may be pending reads in the connection, so there may be a race in the DROP-CLUSTER-PACKET-FILTER part causing the test to fail. Add CLOSE-CLUSTER-LINK-ON-PACKET-DROP to ensure that the replica does not process the packet. Signed-off-by: Binbin (cherry picked from commit 2019337e7442a1421955716bb7663801a2f29c8d) Signed-off-by: Viktor Söderqvist --- src/blocked.c | 10 +++++----- tests/unit/cluster/failover2.tcl | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index e9e3ad119f..e2ee9d621e 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -230,14 +230,14 @@ void unblockClient(client *c, int queue_for_reprocessing) { /* Check if the specified client can be safely timed out using * unblockClientOnTimeout(). 
*/ int blockedClientMayTimeout(client *c) { - if (c->bstate->btype == BLOCKED_MODULE) { + if (c->bstate.btype == BLOCKED_MODULE) { return moduleBlockedClientMayTimeout(c); } - if (c->bstate->btype == BLOCKED_LIST || - c->bstate->btype == BLOCKED_ZSET || - c->bstate->btype == BLOCKED_STREAM || - c->bstate->btype == BLOCKED_WAIT) { + if (c->bstate.btype == BLOCKED_LIST || + c->bstate.btype == BLOCKED_ZSET || + c->bstate.btype == BLOCKED_STREAM || + c->bstate.btype == BLOCKED_WAIT) { return 1; } return 0; diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 284df1cc4e..3411e7feff 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -148,6 +148,7 @@ proc test_replica_config_epoch_failover {type} { R 3 CONFIG SET cluster-replica-no-failover yes } R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL + R 3 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 1 set R0_nodeid [R 0 cluster myid] @@ -170,6 +171,7 @@ proc test_replica_config_epoch_failover {type} { # Pause the R 0 and wait for the cluster to be down. pause_process [srv 0 pid] R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE + R 3 DEBUG CLOSE-CLUSTER-LINK-ON-PACKET-DROP 0 wait_for_condition 1000 50 { [CI 1 cluster_state] == "fail" && [CI 2 cluster_state] == "fail" && From 1100316c1c204c469cbc68983c1f2d5f64ee6403 Mon Sep 17 00:00:00 2001 From: Madelyn Olson Date: Mon, 4 Nov 2024 12:36:20 -0800 Subject: [PATCH 11/35] Add a filter option to drop all cluster packets (#1252) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A minor debugging change that helped in the investigation of https://github.com/valkey-io/valkey/issues/1251. Basically there are some edge cases where we want to fully isolate a note from receiving packets, but can't suspend the process because we need it to continue sending outbound traffic. So, added a filter for that. 
Signed-off-by: Madelyn Olson Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 5 +++-- src/debug.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 49047d4acb..f3a7a5c132 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3017,7 +3017,7 @@ int clusterIsValidPacket(clusterLink *link) { return 0; } - if (type == server.cluster_drop_packet_filter) { + if (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2) { serverLog(LL_WARNING, "Dropping packet that matches debug drop filter"); return 0; } @@ -3106,7 +3106,8 @@ int clusterProcessPacket(clusterLink *link) { if (!clusterIsValidPacket(link)) { clusterMsg *hdr = (clusterMsg *)link->rcvbuf; uint16_t type = ntohs(hdr->type); - if (server.debug_cluster_close_link_on_packet_drop && type == server.cluster_drop_packet_filter) { + if (server.debug_cluster_close_link_on_packet_drop && + (type == server.cluster_drop_packet_filter || server.cluster_drop_packet_filter == -2)) { freeClusterLink(link); serverLog(LL_WARNING, "Closing link for matching packet type %hu", type); return 0; diff --git a/src/debug.c b/src/debug.c index a01ccf9730..5c8733641a 100644 --- a/src/debug.c +++ b/src/debug.c @@ -431,7 +431,7 @@ void debugCommand(client *c) { " Some fields of the default behavior may be time consuming to fetch,", " and `fast` can be passed to avoid fetching them.", "DROP-CLUSTER-PACKET-FILTER ", - " Drop all packets that match the filtered type. Set to -1 allow all packets.", + " Drop all packets that match the filtered type. 
Set to -1 allow all packets or -2 to drop all packets.", "CLOSE-CLUSTER-LINK-ON-PACKET-DROP <0|1>", " This is valid only when DROP-CLUSTER-PACKET-FILTER is set to a valid packet type.", " When set to 1, the cluster link is closed after dropping a packet based on the filter.", From 4e3d55197ae3e2e3ea89ba002874b5a2e8d74d1b Mon Sep 17 00:00:00 2001 From: Binbin Date: Thu, 19 Dec 2024 16:12:34 +0800 Subject: [PATCH 12/35] Minor log fixes when failover auth denied due to slot epoch (#1341) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old reqEpoch mainly refers to requestCurrentEpoch, see: ``` if (requestCurrentEpoch < server.cluster->currentEpoch) { serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): reqEpoch (%llu) < curEpoch(%llu)", node->name, node->human_nodename, (unsigned long long)requestCurrentEpoch, (unsigned long long)server.cluster->currentEpoch); return; } ``` And in here we refer to requestConfigEpoch, it's a bit misleading, so change it to reqConfigEpoch to make it clear. Signed-off-by: Binbin Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index f3a7a5c132..64e8b811a0 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4396,7 +4396,7 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { * by the replica requesting our vote. Refuse to vote for this replica. 
*/ serverLog(LL_WARNING, "Failover auth denied to %.40s (%s): " - "slot %d epoch (%llu) > reqEpoch (%llu)", + "slot %d epoch (%llu) > reqConfigEpoch (%llu)", node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch, (unsigned long long)requestConfigEpoch); @@ -4681,8 +4681,8 @@ void clusterHandleReplicaFailover(void) { if (server.cluster->failover_auth_sent == 0) { server.cluster->currentEpoch++; server.cluster->failover_auth_epoch = server.cluster->currentEpoch; - serverLog(LL_NOTICE, "Starting a failover election for epoch %llu.", - (unsigned long long)server.cluster->currentEpoch); + serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu", + (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself)); clusterRequestFailoverAuth(); server.cluster->failover_auth_sent = 1; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG); From de6394b2ff82c268d61d792346ec05b736954a8d Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 25 Sep 2024 12:08:48 +0800 Subject: [PATCH 13/35] Trigger the election as soon as possible when doing a forced manual failover (#1067) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In CLUSTER FAILOVER FORCE case, we will set mf_can_start to 1 and wait for a cron to trigger the election. We can also set a CLUSTER_TODO_HANDLE_MANUALFAILOVER flag so that we can start the election as soon as possible instead of waiting for the cron, so that we won't have a 100ms delay (clusterCron). Signed-off-by: Binbin Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 64e8b811a0..374b01bfc5 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -6776,6 +6776,9 @@ int clusterCommandSpecial(client *c) { * it without coordination. 
*/ serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client); server.cluster->mf_can_start = 1; + /* We can start a manual failover as soon as possible, setting a flag + * here so that we don't need to waiting for the cron to kick in. */ + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); } else { serverLog(LL_NOTICE, "Manual failover user request accepted (user request from '%s').", client); clusterSendMFStart(myself->replicaof); From c80c29e47e645a8428f241946bc15c34c667e2f6 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 22 Nov 2024 10:28:59 +0800 Subject: [PATCH 14/35] Make manual failover reset the on-going election to promote failover (#1274) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a manual failover got timed out, like the election don't get the enough votes, since we have a auth_timeout and a auth_retry_time, a new manual failover will not be able to proceed on the replica side. Like if we initiate a new manual failover after a election timed out, we will pause the primary, but on the replica side, due to retry_time, replica does not trigger the new election and the manual failover will eventually time out. In this case, if we initiate manual failover again and there is an ongoing election, we will reset it so that the replica can initiate a new election at the manual failover's request. Signed-off-by: Binbin Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 25 +++++++++++++-- tests/unit/cluster/manual-failover.tcl | 42 ++++++++++++++++++++++++++ 2 files changed, 65 insertions(+), 2 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 374b01bfc5..eae2091363 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -4846,6 +4846,27 @@ void clusterHandleReplicaMigration(int max_replicas) { * data loss due to the asynchronous primary-replica replication. 
* -------------------------------------------------------------------------- */ +void manualFailoverCanStart(void) { + serverAssert(server.cluster->mf_can_start == 0); + + if (server.cluster->failover_auth_time) { + /* There is another manual failover requested by the user. + * If we have an ongoing election, reset it because the user may initiate + * manual failover again when the previous manual failover timed out. + * Otherwise, if the previous election timed out (see auth_timeout) and + * before the next retry (see auth_retry_time), the new manual failover + * will pause the primary and replica can not do anything to advance the + * manual failover, and then the manual failover eventually times out. */ + server.cluster->failover_auth_time = 0; + serverLog(LL_WARNING, + "Failover election in progress for epoch %llu, but received a new manual failover. " + "Resetting the election.", + (unsigned long long)server.cluster->failover_auth_epoch); + } + + server.cluster->mf_can_start = 1; +} + /* Reset the manual failover state. This works for both primaries and replicas * as all the state about manual failover is cleared. * @@ -4886,7 +4907,7 @@ void clusterHandleManualFailover(void) { if (server.cluster->mf_primary_offset == replicationGetReplicaOffset()) { /* Our replication offset matches the primary replication offset * announced after clients were paused. We can start the failover. */ - server.cluster->mf_can_start = 1; + manualFailoverCanStart(); serverLog(LL_NOTICE, "All primary replication stream processed, " "manual failover can start."); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); @@ -6775,7 +6796,7 @@ int clusterCommandSpecial(client *c) { * primary to agree about the offset. We just failover taking over * it without coordination. 
*/ serverLog(LL_NOTICE, "Forced failover user request accepted (user request from '%s').", client); - server.cluster->mf_can_start = 1; + manualFailoverCanStart(); /* We can start a manual failover as soon as possible, setting a flag * here so that we don't need to waiting for the cron to kick in. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_MANUALFAILOVER); diff --git a/tests/unit/cluster/manual-failover.tcl b/tests/unit/cluster/manual-failover.tcl index 2a9dff934b..a281cb58f2 100644 --- a/tests/unit/cluster/manual-failover.tcl +++ b/tests/unit/cluster/manual-failover.tcl @@ -183,3 +183,45 @@ test "Wait for instance #0 to return back alive" { } } ;# start_cluster + +start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} { + test "Manual failover will reset the on-going election" { + set CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST 5 + set CLUSTER_PACKET_TYPE_NONE -1 + + # Let other primaries drop FAILOVER_AUTH_REQUEST so that the election won't + # get the enough votes and the election will time out. + R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST + R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_FAILOVER_AUTH_REQUEST + + # Replica doing the manual failover. + R 3 cluster failover + + # Waiting for primary and replica to confirm manual failover timeout. + wait_for_log_messages 0 {"*Manual failover timed out*"} 0 1000 50 + wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 50 + set loglines1 [count_log_lines 0] + set loglines2 [count_log_lines -3] + + # Undo packet drop, so that replica can win the next election. + R 1 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + R 2 debug drop-cluster-packet-filter $CLUSTER_PACKET_TYPE_NONE + + # Replica doing the manual failover again. + R 3 cluster failover + + # Make sure the election is reset. 
+ wait_for_log_messages -3 {"*Failover election in progress*Resetting the election*"} $loglines2 1000 50 + + # Wait for failover. + wait_for_condition 1000 50 { + [s -3 role] == "master" + } else { + fail "No failover detected" + } + + # Make sure that the second manual failover does not time out. + verify_no_log_message 0 "*Manual failover timed out*" $loglines1 + verify_no_log_message -3 "*Manual failover timed out*" $loglines2 + } +} ;# start_cluster From 657814bb8f6eef4b99e5f6ee7c75d14ebe98428b Mon Sep 17 00:00:00 2001 From: Vitaly Arbuzov Date: Tue, 17 Jun 2025 11:05:07 -0700 Subject: [PATCH 15/35] Fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Viktor Söderqvist --- src/config.c | 3 +++ tests/support/cluster_util.tcl | 5 ----- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/config.c b/src/config.c index 0f0836cc0f..8757c43f87 100644 --- a/src/config.c +++ b/src/config.c @@ -2446,6 +2446,7 @@ static int updatePort(const char **err) { listener->port = server.port; listener->ct = connectionByType(CONN_TYPE_SOCKET); clusterUpdateMyselfAnnouncedPorts(); + clearCachedClusterSlotsResponse(); if (changeListener(listener) == C_ERR) { *err = "Unable to listen on this port. Check server logs."; return 0; @@ -2633,6 +2634,7 @@ int updateClusterFlags(const char **err) { static int updateClusterAnnouncedPort(const char **err) { UNUSED(err); clusterUpdateMyselfAnnouncedPorts(); + clearCachedClusterSlotsResponse(); return 1; } @@ -2692,6 +2694,7 @@ static int applyTLSPort(const char **err) { listener->port = server.tls_port; listener->ct = connectionByType(CONN_TYPE_TLS); clusterUpdateMyselfAnnouncedPorts(); + clearCachedClusterSlotsResponse(); if (changeListener(listener) == C_ERR) { *err = "Unable to listen on this port. 
Check server logs."; return 0; diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl index b2a2754680..871f3e6404 100644 --- a/tests/support/cluster_util.tcl +++ b/tests/support/cluster_util.tcl @@ -209,11 +209,6 @@ proc cluster_allocate_replicas {masters replicas} { # Setup method to be executed to configure the cluster before the # tests run. proc cluster_setup {masters replicas node_count slot_allocator replica_allocator code} { - set config_epoch 1 - for {set i 0} {$i < $node_count} {incr i} { - R $i CLUSTER SET-CONFIG-EPOCH $config_epoch - incr config_epoch - } # Have all nodes meet if {$::tls} { From aab58d74b4077202d0b75ad614ea14c48d8e1f04 Mon Sep 17 00:00:00 2001 From: Vitaly Arbuzov Date: Tue, 17 Jun 2025 13:12:17 -0700 Subject: [PATCH 16/35] formatting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Viktor Söderqvist --- src/blocked.c | 4 +--- src/cluster_legacy.c | 8 ++++---- src/networking.c | 11 +++++------ src/server.c | 2 +- src/server.h | 12 ++++++------ 5 files changed, 17 insertions(+), 20 deletions(-) diff --git a/src/blocked.c b/src/blocked.c index e2ee9d621e..be034e4204 100644 --- a/src/blocked.c +++ b/src/blocked.c @@ -234,9 +234,7 @@ int blockedClientMayTimeout(client *c) { return moduleBlockedClientMayTimeout(c); } - if (c->bstate.btype == BLOCKED_LIST || - c->bstate.btype == BLOCKED_ZSET || - c->bstate.btype == BLOCKED_STREAM || + if (c->bstate.btype == BLOCKED_LIST || c->bstate.btype == BLOCKED_ZSET || c->bstate.btype == BLOCKED_STREAM || c->bstate.btype == BLOCKED_WAIT) { return 1; } diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index eae2091363..da559abd55 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3489,8 +3489,8 @@ int clusterProcessPacket(clusterLink *link) { serverLog(LL_VERBOSE, "Node %.40s (%s) has old slots configuration, sending " "an UPDATE message about %.40s (%s)", - sender->name, sender->human_nodename, - 
server.cluster->slots[j]->name, server.cluster->slots[j]->human_nodename); + sender->name, sender->human_nodename, server.cluster->slots[j]->name, + server.cluster->slots[j]->human_nodename); clusterSendUpdate(sender->link, server.cluster->slots[j]); /* TODO: instead of exiting the loop send every other @@ -4406,8 +4406,8 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) { serverLog(LL_VERBOSE, "Node %.40s (%s) has old slots configuration, sending " "an UPDATE message about %.40s (%s)", - node->name, node->human_nodename, - server.cluster->slots[j]->name, server.cluster->slots[j]->human_nodename); + node->name, node->human_nodename, server.cluster->slots[j]->name, + server.cluster->slots[j]->human_nodename); clusterSendUpdate(node->link, server.cluster->slots[j]); return; } diff --git a/src/networking.c b/src/networking.c index 1fb8bf239e..93e3e655dd 100644 --- a/src/networking.c +++ b/src/networking.c @@ -101,7 +101,8 @@ void linkClient(client *c) { static void clientSetDefaultAuth(client *c) { /* If the default user does not require authentication, the user is * directly authenticated. */ - clientSetUser(c, DefaultUser, (DefaultUser->flags & USER_FLAG_NOPASS) && !(DefaultUser->flags & USER_FLAG_DISABLED)); + clientSetUser(c, DefaultUser, + (DefaultUser->flags & USER_FLAG_NOPASS) && !(DefaultUser->flags & USER_FLAG_DISABLED)); } /* Attach the user u to this client. 
@@ -111,8 +112,7 @@ static void clientSetDefaultAuth(client *c) { void clientSetUser(client *c, user *u, int authenticated) { c->user = u; c->flag.authenticated = authenticated; - if (authenticated) - c->flag.ever_authenticated = authenticated; + if (authenticated) c->flag.ever_authenticated = authenticated; } static int clientEverAuthenticated(client *c) { @@ -3776,7 +3776,7 @@ NULL if (getLongLongFromObjectOrReply(c, c->argv[2], &id, NULL) != C_OK) return; struct client *target = lookupClientByID(id); /* Note that we never try to unblock a client blocked on a module command, - * or a client blocked by CLIENT PAUSE or some other blocking type which + * or a client blocked by CLIENT PAUSE or some other blocking type which * doesn't have a timeout callback (even in the case of UNBLOCK ERROR). * The reason is that we assume that if a command doesn't expect to be timedout, * it also doesn't expect to be unblocked by CLIENT UNBLOCK */ @@ -4395,8 +4395,7 @@ int checkClientOutputBufferLimits(client *c) { /* For unauthenticated clients which were also never authenticated before the output buffer is limited to prevent * them from abusing it by not reading the replies */ - if (used_mem > REPLY_BUFFER_SIZE_UNAUTHENTICATED_CLIENT && authRequired(c) && !clientEverAuthenticated(c)) - return 1; + if (used_mem > REPLY_BUFFER_SIZE_UNAUTHENTICATED_CLIENT && authRequired(c) && !clientEverAuthenticated(c)) return 1; class = getClientType(c); /* For the purpose of output buffer limiting, primaries are handled diff --git a/src/server.c b/src/server.c index c0e792c6f2..d697bdfba2 100644 --- a/src/server.c +++ b/src/server.c @@ -3526,7 +3526,7 @@ void call(client *c, int flags) { * In case of blocking commands, the flag will be un-set only after successfully * re-processing and unblock the client.*/ c->flag.executing_command = 1; - + monotime monotonic_start = 0; if (monotonicGetType() == MONOTONIC_CLOCK_HW) monotonic_start = getMonotonicUs(); diff --git a/src/server.h b/src/server.h 
index f4636f88fe..e0b1ffdf19 100644 --- a/src/server.h +++ b/src/server.h @@ -1224,7 +1224,7 @@ typedef struct ClientFlags { uint64_t reexecuting_command : 1; /* The client is re-executing the command. */ uint64_t replication_done : 1; /* Indicate that replication has been done on the client */ uint64_t authenticated : 1; /* Indicate a client has successfully authenticated */ - uint64_t ever_authenticated : 1; /* Indicate a client was ever successfully authenticated during it's lifetime */ + uint64_t ever_authenticated : 1; /* Indicate a client was ever successfully authenticated during it's lifetime */ uint64_t protected_rdb_channel : 1; /* Dual channel replication sync: Protects the RDB client from premature \ * release during full sync. This flag is used to ensure that the RDB client, which \ @@ -1772,11 +1772,11 @@ struct valkeyServer { int events_per_io_thread; /* Number of events on the event loop to trigger IO threads activation. */ int prefetch_batch_max_size; /* Maximum number of keys to prefetch in a single batch */ long long events_processed_while_blocked; /* processEventsWhileBlocked() */ - int enable_protected_configs; /* Enable the modification of protected configs, see PROTECTED_ACTION_ALLOWED_* */ - int enable_debug_cmd; /* Enable DEBUG commands, see PROTECTED_ACTION_ALLOWED_* */ - int enable_module_cmd; /* Enable MODULE commands, see PROTECTED_ACTION_ALLOWED_* */ - int enable_debug_assert; /* Enable debug asserts */ - int debug_client_enforce_reply_list; /* Force client to always use the reply list */ + int enable_protected_configs; /* Enable the modification of protected configs, see PROTECTED_ACTION_ALLOWED_* */ + int enable_debug_cmd; /* Enable DEBUG commands, see PROTECTED_ACTION_ALLOWED_* */ + int enable_module_cmd; /* Enable MODULE commands, see PROTECTED_ACTION_ALLOWED_* */ + int enable_debug_assert; /* Enable debug asserts */ + int debug_client_enforce_reply_list; /* Force client to always use the reply list */ /* RDB / AOF loading 
information */ volatile sig_atomic_t loading; /* We are loading data from disk if true */ volatile sig_atomic_t async_loading; /* We are loading data without blocking the db being served */ From 5fe351478602413168c30086ccf85f128df62692 Mon Sep 17 00:00:00 2001 From: Vitaly Arbuzov Date: Tue, 17 Jun 2025 13:26:37 -0700 Subject: [PATCH 17/35] fix uninitialized variable warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Viktor Söderqvist --- src/evict.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict.c b/src/evict.c index 4b9f70eaa5..0a83af5f66 100644 --- a/src/evict.c +++ b/src/evict.c @@ -150,7 +150,7 @@ int evictionPoolPopulate(serverDb *db, kvstore *samplekvs, struct evictionPoolEn for (j = 0; j < count; j++) { unsigned long long idle; sds key; - robj *o; + robj *o = NULL; dictEntry *de; de = samples[j]; From 28547ae2dfcc30a4f73682acb46542839ba18747 Mon Sep 17 00:00:00 2001 From: Vitaly Arbuzov Date: Tue, 17 Jun 2025 14:56:29 -0700 Subject: [PATCH 18/35] Add retries on CLUSTERDOWN in tests and fix sanitizer error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Viktor Söderqvist --- src/functions.c | 7 ++++++- tests/support/cluster.tcl | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/functions.c b/src/functions.c index 68fe404a6c..b72534a343 100644 --- a/src/functions.c +++ b/src/functions.c @@ -159,6 +159,11 @@ static void engineLibraryDispose(dict *d, void *obj) { engineLibraryFree(obj); } +/* Wrapper to free a library when used as a list free callback */ +static void engineLibraryListFree(void *obj) { + engineLibraryFree(obj); +} + /* Clear all the functions from the given library ctx */ void functionsLibCtxClear(functionsLibCtx *lib_ctx) { dictEmpty(lib_ctx->functions, NULL); @@ -338,7 +343,7 @@ libraryJoin(functionsLibCtx *functions_lib_ctx_dst, functionsLibCtx *functions_l } else { if 
(!old_libraries_list) { old_libraries_list = listCreate(); - listSetFreeMethod(old_libraries_list, (void (*)(void *))engineLibraryFree); + listSetFreeMethod(old_libraries_list, engineLibraryListFree); } libraryUnlink(functions_lib_ctx_dst, old_li); listAddNodeTail(old_libraries_list, old_li); diff --git a/tests/support/cluster.tcl b/tests/support/cluster.tcl index f9d4792d7b..b4289948e3 100644 --- a/tests/support/cluster.tcl +++ b/tests/support/cluster.tcl @@ -230,7 +230,8 @@ proc ::valkey_cluster::__dispatch__ {id method args} { if {[catch {$link $method {*}$args} e]} { if {$link eq {} || \ [string range $e 0 4] eq {MOVED} || \ - [string range $e 0 2] eq {I/O} \ + [string range $e 0 2] eq {I/O} || \ + [string range $e 0 10] eq {CLUSTERDOWN} \ } { # MOVED redirection. ::valkey_cluster::__method__refresh_nodes_map $id From f6fdd3315dbf8bc55062d20f16e4f81eb75b1bf4 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 11 Nov 2024 22:13:47 +0800 Subject: [PATCH 19/35] Fix empty primary may have dirty slots data due to bad migration (#1285) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we become an empty primary for some reason, we still need to check if we need to delete dirty slots, because we may have dirty slots data left over from a bad migration. Like the target node forcibly executes CLUSTER SETSLOT NODE to take over the slot without performing key migration. Signed-off-by: Binbin Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 13 ++++++++++++- tests/unit/cluster/replica-migration.tcl | 20 ++++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index da559abd55..c9f4b25fd9 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -2483,6 +2483,7 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * need to delete all the keys in the slots we lost ownership. 
*/ uint16_t dirty_slots[CLUSTER_SLOTS]; int dirty_slots_count = 0; + int delete_dirty_slots = 0; /* We should detect if sender is new primary of our shard. * We will know it if all our slots were migrated to sender, and sender @@ -2709,6 +2710,12 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc serverLog(LL_NOTICE, "My last slot was migrated to node %.40s (%s) in shard %.40s. I am now an empty primary.", sender->name, sender->human_nodename, sender->shard_id); + /* We may still have dirty slots when we became a empty primary due to + * a bad migration. + * + * In order to maintain a consistent state between keys and slots + * we need to remove all the keys from the slots we lost. */ + delete_dirty_slots = 1; } } else if (dirty_slots_count) { /* If we are here, we received an update message which removed @@ -2718,6 +2725,10 @@ void clusterUpdateSlotsConfigWith(clusterNode *sender, uint64_t senderConfigEpoc * * In order to maintain a consistent state between keys and slots * we need to remove all the keys from the slots we lost. */ + delete_dirty_slots = 1; + } + + if (delete_dirty_slots) { for (int j = 0; j < dirty_slots_count; j++) { serverLog(LL_NOTICE, "Deleting keys in dirty slot %d on node %.40s (%s) in shard %.40s", dirty_slots[j], myself->name, myself->human_nodename, myself->shard_id); @@ -6098,7 +6109,7 @@ void removeChannelsInSlot(unsigned int slot) { /* Remove all the keys in the specified hash slot. * The number of removed items is returned. */ unsigned int delKeysInSlot(unsigned int hashslot) { - if (!kvstoreDictSize(server.db->keys, hashslot)) return 0; + if (!countKeysInSlot(hashslot)) return 0; /* We may lose a slot during the pause. We need to track this * state so that we don't assert in propagateNow(). 
*/ diff --git a/tests/unit/cluster/replica-migration.tcl b/tests/unit/cluster/replica-migration.tcl index 05d6528684..d04069ef16 100644 --- a/tests/unit/cluster/replica-migration.tcl +++ b/tests/unit/cluster/replica-migration.tcl @@ -400,3 +400,23 @@ start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout start_cluster 4 4 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} { test_cluster_setslot "setslot" } my_slot_allocation cluster_allocate_replicas ;# start_cluster + +start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000 cluster-migration-barrier 999}} { + test "Empty primary will check and delete the dirty slots" { + R 2 config set cluster-allow-replica-migration no + + # Write a key to slot 0. + R 2 incr key_977613 + + # Move slot 0 from primary 2 to primary 0. + R 0 cluster bumpepoch + R 0 cluster setslot 0 node [R 0 cluster myid] + + # Wait for R 2 to report that it is an empty primary (cluster-allow-replica-migration no) + wait_for_log_messages -2 {"*I am now an empty primary*"} 0 1000 50 + + # Make sure primary 0 will delete the dirty slots. + verify_log_message -2 "*Deleting keys in dirty slot 0*" 0 + assert_equal [R 2 dbsize] 0 + } +} my_slot_allocation cluster_allocate_replicas ;# start_cluster From f5a995198b36f070e061242c2e0888fb794b5cf2 Mon Sep 17 00:00:00 2001 From: Binbin Date: Mon, 9 Dec 2024 16:19:02 +0800 Subject: [PATCH 20/35] Fix the election was reset wrongly before failover epoch was obtained (#1339) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After #1009, we will reset the election when we received a claim with an equal or higher epoch since a node can win an election in the past. But we need to consider the time before the node actually obtains the failover_auth_epoch. The failover_auth_epoch default is 0, so before the node actually get the failover epoch, we might wrongly reset the election. 
This is probably harmless, but will produce misleading log output and may delay election by a cron cycle or beforesleep. Now we will only reset the election when a node is actually obtains the failover epoch. Signed-off-by: Binbin Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index c9f4b25fd9..5d4e85d882 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3182,6 +3182,25 @@ int clusterProcessPacket(clusterLink *link) { if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) { sender->configEpoch = sender_claimed_config_epoch; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); + + if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent && + sender->configEpoch >= server.cluster->failover_auth_epoch) { + /* Another node has claimed an epoch greater than or equal to ours. + * If we have an ongoing election, reset it because we cannot win + * with an epoch smaller than or equal to the incoming claim. This + * allows us to start a new election as soon as possible. */ + server.cluster->failover_auth_time = 0; + serverLog(LL_WARNING, + "Failover election in progress for epoch %llu, but received a claim from " + "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election " + "since we cannot win an election in the past.", + (unsigned long long)server.cluster->failover_auth_epoch, + sender->name, sender->human_nodename, + (unsigned long long)sender->configEpoch); + /* Maybe we could start a new election, set a flag here to make sure + * we check as soon as possible, instead of waiting for a cron. */ + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } } /* Update the replication offset info for this node. 
*/ sender->repl_offset = ntohu64(hdr->offset); From b72fea27a572a559739fca20deed57ff9b7ff5b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Sun, 12 Jan 2025 01:02:39 +0100 Subject: [PATCH 21/35] Skip CLI tests with reply schema validation (#1545) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The commands used in valkey-cli tests are not important the reply schema validation. Skip them to avoid the problem if tests hanging. This has failed lately in the daily job: ``` [TIMEOUT]: clients state report follows. sock55fedcc19be0 => (IN PROGRESS) valkey-cli pubsub mode with single standard channel subscription Killing still running Valkey server 33357 ``` These test cases use a special valkey-cli command `:get pubsub` command, which is an internal command to valkey-cli rather than a Valkey server command. This command hangs when compiled with with logreqres enabled. Easy solution is to skip the tests in this setup. The test cases were introduced in #1432. 
Signed-off-by: Viktor Söderqvist --- tests/integration/valkey-cli.tcl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/valkey-cli.tcl b/tests/integration/valkey-cli.tcl index 6344215a25..2d51677ab9 100644 --- a/tests/integration/valkey-cli.tcl +++ b/tests/integration/valkey-cli.tcl @@ -6,7 +6,7 @@ if {$::singledb} { set ::dbnum 9 } -start_server {tags {"cli"}} { +start_server {tags {"cli logreqres:skip"}} { proc open_cli {{opts ""} {infile ""}} { if { $opts == "" } { set opts "-n $::dbnum" From 1931405dff336cca8fa858431e184b986a0a67e0 Mon Sep 17 00:00:00 2001 From: zvi-code <54795925+zvi-code@users.noreply.github.com> Date: Thu, 28 Nov 2024 17:27:00 +0200 Subject: [PATCH 22/35] Disable lazy free in defrag test to fix 32bit daily failure (#1370) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Zvi Schneider Co-authored-by: Zvi Schneider Signed-off-by: Viktor Söderqvist --- tests/unit/memefficiency.tcl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index d5a6a6efe2..67329f03f1 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -720,11 +720,11 @@ run_solo {defrag} { } } - start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} { + start_cluster 1 0 {tags {"defrag external:skip cluster"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} { test_active_defrag "cluster" } - start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save ""}} { + start_server {tags {"defrag external:skip standalone"} overrides {appendonly yes auto-aof-rewrite-percentage 0 save "" lazyfree-lazy-user-del no}} { test_active_defrag "standalone" } } ;# run_solo From a5c9342fb73dcb79c7b263671510c375c5197f7c Mon Sep 17 00:00:00 2001 From: 
Madelyn Olson Date: Tue, 17 Dec 2024 17:48:53 -0800 Subject: [PATCH 23/35] Fix undefined behavior defined by ASAN (#1451) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Asan now supports making sure you are passing in the correct pointer type, which seems useful but we can't support it since we pass in an incorrect pointer in several places. This is most commonly done with generic free functions, where we simply cast it to the correct type. It's not a lot of code to clean up, so it seems appropriate to cleanup instead of disabling the check. --------- Signed-off-by: Madelyn Olson Co-authored-by: Viktor Söderqvist Signed-off-by: Viktor Söderqvist --- src/acl.c | 20 ++++++++++---------- src/adlist.c | 6 ++++++ src/adlist.h | 1 + src/call_reply.c | 2 +- src/db.c | 2 +- src/eval.c | 4 ++-- src/listpack.c | 6 ++++++ src/listpack.h | 1 + src/module.c | 2 +- src/networking.c | 2 +- src/replication.c | 2 +- src/sds.c | 7 +++++++ src/sds.h | 1 + src/t_stream.c | 19 +++++++++++++++---- src/unit/test_listpack.c | 2 +- src/unit/test_ziplist.c | 2 +- 16 files changed, 56 insertions(+), 23 deletions(-) diff --git a/src/acl.c b/src/acl.c index 14087ea2f4..d7cb504393 100644 --- a/src/acl.c +++ b/src/acl.c @@ -297,11 +297,6 @@ int ACLListMatchSds(void *a, void *b) { return sdscmp(a, b) == 0; } -/* Method to free list elements from ACL users password/patterns lists. */ -void ACLListFreeSds(void *item) { - sdsfree(item); -} - /* Method to duplicate list elements from ACL users password/patterns lists. 
*/ void *ACLListDupSds(void *item) { return sdsdup(item); @@ -374,7 +369,7 @@ aclSelector *ACLCreateSelector(int flags) { listSetFreeMethod(selector->patterns, ACLListFreeKeyPattern); listSetDupMethod(selector->patterns, ACLListDupKeyPattern); listSetMatchMethod(selector->channels, ACLListMatchSds); - listSetFreeMethod(selector->channels, ACLListFreeSds); + listSetFreeMethod(selector->channels, sdsfreeVoid); listSetDupMethod(selector->channels, ACLListDupSds); memset(selector->allowed_commands, 0, sizeof(selector->allowed_commands)); @@ -445,7 +440,7 @@ user *ACLCreateUser(const char *name, size_t namelen) { u->passwords = listCreate(); u->acl_string = NULL; listSetMatchMethod(u->passwords, ACLListMatchSds); - listSetFreeMethod(u->passwords, ACLListFreeSds); + listSetFreeMethod(u->passwords, sdsfreeVoid); listSetDupMethod(u->passwords, ACLListDupSds); u->selectors = listCreate(); @@ -489,6 +484,11 @@ void ACLFreeUser(user *u) { zfree(u); } +/* Used for generic free functions. */ +static void ACLFreeUserVoid(void *u) { + ACLFreeUser(u); +} + /* When a user is deleted we need to cycle the active * connections in order to kill all the pending ones that * are authenticated with such user. 
*/ @@ -2449,12 +2449,12 @@ sds ACLLoadFromFile(const char *filename) { c->user = new_user; } - if (user_channels) raxFreeWithCallback(user_channels, (void (*)(void *))listRelease); - raxFreeWithCallback(old_users, (void (*)(void *))ACLFreeUser); + if (user_channels) raxFreeWithCallback(user_channels, listReleaseVoid); + raxFreeWithCallback(old_users, ACLFreeUserVoid); sdsfree(errors); return NULL; } else { - raxFreeWithCallback(Users, (void (*)(void *))ACLFreeUser); + raxFreeWithCallback(Users, ACLFreeUserVoid); Users = old_users; errors = sdscat(errors, "WARNING: ACL errors detected, no change to the previously active ACL rules was performed"); diff --git a/src/adlist.c b/src/adlist.c index 11b152592b..0dc77cc038 100644 --- a/src/adlist.c +++ b/src/adlist.c @@ -77,6 +77,12 @@ void listRelease(list *list) { zfree(list); } +/* Just like listRelease, but takes the list as a (void *). + * Useful as generic free callback. */ +void listReleaseVoid(void *l) { + listRelease((list *)l); +} + /* Add a new node to the list, to head, containing the specified 'value' * pointer as value. 
* diff --git a/src/adlist.h b/src/adlist.h index bfc4280434..c642c1c791 100644 --- a/src/adlist.h +++ b/src/adlist.h @@ -72,6 +72,7 @@ typedef struct list { /* Prototypes */ list *listCreate(void); void listRelease(list *list); +void listReleaseVoid(void *list); void listEmpty(list *list); list *listAddNodeHead(list *list, void *value); list *listAddNodeTail(list *list, void *value); diff --git a/src/call_reply.c b/src/call_reply.c index 00d196081e..dc981b8be8 100644 --- a/src/call_reply.c +++ b/src/call_reply.c @@ -559,7 +559,7 @@ CallReply *callReplyCreateError(sds reply, void *private_data) { sdsfree(reply); } list *deferred_error_list = listCreate(); - listSetFreeMethod(deferred_error_list, (void (*)(void *))sdsfree); + listSetFreeMethod(deferred_error_list, sdsfreeVoid); listAddNodeTail(deferred_error_list, sdsnew(err_buff)); return callReplyCreate(err_buff, deferred_error_list, private_data); } diff --git a/src/db.c b/src/db.c index 68eb140557..12a5abd08c 100644 --- a/src/db.c +++ b/src/db.c @@ -1088,7 +1088,7 @@ void scanGenericCommand(client *c, robj *o, unsigned long long cursor) { * The exception to the above is ZSET, where we do allocate temporary * strings even when scanning a dict. */ if (o && (!ht || o->type == OBJ_ZSET)) { - listSetFreeMethod(keys, (void (*)(void *))sdsfree); + listSetFreeMethod(keys, sdsfreeVoid); } /* For main dictionary scan or data structure using hashtable. */ diff --git a/src/eval.c b/src/eval.c index 73d5e2fedc..5c37d63f83 100644 --- a/src/eval.c +++ b/src/eval.c @@ -205,7 +205,7 @@ void scriptingInit(int setup) { * and we need to free them respectively. 
*/ lctx.lua_scripts = dictCreate(&shaScriptObjectDictType); lctx.lua_scripts_lru_list = listCreate(); - listSetFreeMethod(lctx.lua_scripts_lru_list, (void (*)(void *))sdsfree); + listSetFreeMethod(lctx.lua_scripts_lru_list, sdsfreeVoid); lctx.lua_scripts_mem = 0; luaRegisterServerAPI(lua); @@ -777,7 +777,7 @@ void ldbInit(void) { ldb.conn = NULL; ldb.active = 0; ldb.logs = listCreate(); - listSetFreeMethod(ldb.logs, (void (*)(void *))sdsfree); + listSetFreeMethod(ldb.logs, sdsfreeVoid); ldb.children = listCreate(); ldb.src = NULL; ldb.lines = 0; diff --git a/src/listpack.c b/src/listpack.c index 4c51b285d2..1891310b0e 100644 --- a/src/listpack.c +++ b/src/listpack.c @@ -250,6 +250,12 @@ void lpFree(unsigned char *lp) { lp_free(lp); } +/* Same as lpFree, but useful for when you are passing the listpack + * into a generic free function that expects (void *) */ +void lpFreeVoid(void *lp) { + lp_free((unsigned char *)lp); +} + /* Shrink the memory to fit. */ unsigned char *lpShrinkToFit(unsigned char *lp) { size_t size = lpGetTotalBytes(lp); diff --git a/src/listpack.h b/src/listpack.h index aa7636143f..b143797261 100644 --- a/src/listpack.h +++ b/src/listpack.h @@ -56,6 +56,7 @@ typedef struct { unsigned char *lpNew(size_t capacity); void lpFree(unsigned char *lp); +void lpFreeVoid(void *lp); unsigned char *lpShrinkToFit(unsigned char *lp); unsigned char * lpInsertString(unsigned char *lp, unsigned char *s, uint32_t slen, unsigned char *p, int where, unsigned char **newp); diff --git a/src/module.c b/src/module.c index 93d11b52d4..bdd1b8a868 100644 --- a/src/module.c +++ b/src/module.c @@ -10380,7 +10380,7 @@ ValkeyModuleServerInfoData *VM_GetServerInfo(ValkeyModuleCtx *ctx, const char *s * context instead of passing NULL. 
*/ void VM_FreeServerInfo(ValkeyModuleCtx *ctx, ValkeyModuleServerInfoData *data) { if (ctx != NULL) autoMemoryFreed(ctx, VALKEYMODULE_AM_INFO, data); - raxFreeWithCallback(data->rax, (void (*)(void *))sdsfree); + raxFreeWithCallback(data->rax, sdsfreeVoid); zfree(data); } diff --git a/src/networking.c b/src/networking.c index 93e3e655dd..2cb126c9e2 100644 --- a/src/networking.c +++ b/src/networking.c @@ -565,7 +565,7 @@ void afterErrorReply(client *c, const char *s, size_t len, int flags) { if (c->flag.module) { if (!c->deferred_reply_errors) { c->deferred_reply_errors = listCreate(); - listSetFreeMethod(c->deferred_reply_errors, (void (*)(void *))sdsfree); + listSetFreeMethod(c->deferred_reply_errors, sdsfreeVoid); } listAddNodeTail(c->deferred_reply_errors, sdsnewlen(s, len)); return; diff --git a/src/replication.c b/src/replication.c index 4f87aa7df5..cca1849dc7 100644 --- a/src/replication.c +++ b/src/replication.c @@ -283,7 +283,7 @@ void removeReplicaFromPsyncWait(client *replica_main_client) { void resetReplicationBuffer(void) { server.repl_buffer_mem = 0; server.repl_buffer_blocks = listCreate(); - listSetFreeMethod(server.repl_buffer_blocks, (void (*)(void *))zfree); + listSetFreeMethod(server.repl_buffer_blocks, zfree); } int canFeedReplicaReplBuffer(client *replica) { diff --git a/src/sds.c b/src/sds.c index e14f4bd0bd..32b13b2138 100644 --- a/src/sds.c +++ b/src/sds.c @@ -216,6 +216,13 @@ void sdsfree(sds s) { s_free_with_size(sdsAllocPtr(s), sdsAllocSize(s)); } +/* This variant of sdsfree() gets its argument as void, and is useful + * as free method in data structures that expect a 'void free_object(void*)' + * prototype for the free method. */ +void sdsfreeVoid(void *s) { + sdsfree(s); +} + /* Set the sds string length to the length as obtained with strlen(), so * considering as content only up to the first null term character. 
* diff --git a/src/sds.h b/src/sds.h index e9c4a95f9a..e1b8531955 100644 --- a/src/sds.h +++ b/src/sds.h @@ -183,6 +183,7 @@ sds sdsempty(void); sds sdsdup(const sds s); size_t sdscopytobuffer(unsigned char *buf, size_t buf_len, sds s, uint8_t *hdr_size); void sdsfree(sds s); +void sdsfreeVoid(void *s); sds sdsgrowzero(sds s, size_t len); sds sdscatlen(sds s, const void *t, size_t len); sds sdscat(sds s, const char *t); diff --git a/src/t_stream.c b/src/t_stream.c index 801a5e4323..63fa0a361c 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -54,6 +54,7 @@ #define STREAM_LISTPACK_MAX_SIZE (1 << 30) void streamFreeCG(streamCG *cg); +void streamFreeCGVoid(void *cg); void streamFreeNACK(streamNACK *na); size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, @@ -86,8 +87,8 @@ stream *streamNew(void) { /* Free a stream, including the listpacks stored inside the radix tree. */ void freeStream(stream *s) { - raxFreeWithCallback(s->rax, (void (*)(void *))lpFree); - if (s->cgroups) raxFreeWithCallback(s->cgroups, (void (*)(void *))streamFreeCG); + raxFreeWithCallback(s->rax, lpFreeVoid); + if (s->cgroups) raxFreeWithCallback(s->cgroups, streamFreeCGVoid); zfree(s); } @@ -2455,6 +2456,11 @@ void streamFreeConsumer(streamConsumer *sc) { zfree(sc); } +/* Used for generic free functions. */ +static void streamFreeConsumerVoid(void *sc) { + streamFreeConsumer((streamConsumer *)sc); +} + /* Create a new consumer group in the context of the stream 's', having the * specified name, last server ID and reads counter. If a consumer group with * the same name already exists NULL is returned, otherwise the pointer to the @@ -2474,11 +2480,16 @@ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, lo /* Free a consumer group and all its associated data. 
*/ void streamFreeCG(streamCG *cg) { - raxFreeWithCallback(cg->pel, (void (*)(void *))streamFreeNACK); - raxFreeWithCallback(cg->consumers, (void (*)(void *))streamFreeConsumer); + raxFreeWithCallback(cg->pel, zfree); + raxFreeWithCallback(cg->consumers, streamFreeConsumerVoid); zfree(cg); } +/* Used for generic free functions. */ +void streamFreeCGVoid(void *cg) { + streamFreeCG((streamCG *)cg); +} + /* Lookup the consumer group in the specified stream and returns its * pointer, otherwise if there is no such group, NULL is returned. */ streamCG *streamLookupCG(stream *s, sds groupname) { diff --git a/src/unit/test_listpack.c b/src/unit/test_listpack.c index 4838fc8952..0c71da18db 100644 --- a/src/unit/test_listpack.c +++ b/src/unit/test_listpack.c @@ -1184,7 +1184,7 @@ int test_listpackStressWithRandom(int argc, char **argv, int flags) { for (i = 0; i < iteration; i++) { lp = lpNew(0); ref = listCreate(); - listSetFreeMethod(ref, (void (*)(void *))sdsfree); + listSetFreeMethod(ref, sdsfreeVoid); len = rand() % 256; /* Create lists */ diff --git a/src/unit/test_ziplist.c b/src/unit/test_ziplist.c index d2f7ebe69c..58687d81fc 100644 --- a/src/unit/test_ziplist.c +++ b/src/unit/test_ziplist.c @@ -645,7 +645,7 @@ int test_ziplistStressWithRandomPayloadsOfDifferentEncoding(int argc, char **arg for (i = 0; i < iteration; i++) { zl = ziplistNew(); ref = listCreate(); - listSetFreeMethod(ref, (void (*)(void *))sdsfree); + listSetFreeMethod(ref, sdsfreeVoid); len = rand() % 256; /* Create lists */ From 696482701d4633ce119c2bd0ab6c0e1062456f29 Mon Sep 17 00:00:00 2001 From: Binbin Date: Wed, 2 Oct 2024 04:14:30 +0800 Subject: [PATCH 24/35] Avoid timing issue in diskless-load-swapdb test (#1077) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since we paused the primary node earlier, the replica may enter cluster down due to primary node pfail. Here set allow read to prevent subsequent read errors. 
Signed-off-by: Binbin Signed-off-by: Viktor Söderqvist --- tests/unit/cluster/diskless-load-swapdb.tcl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/unit/cluster/diskless-load-swapdb.tcl b/tests/unit/cluster/diskless-load-swapdb.tcl index 68c2135493..e237fd81de 100644 --- a/tests/unit/cluster/diskless-load-swapdb.tcl +++ b/tests/unit/cluster/diskless-load-swapdb.tcl @@ -78,6 +78,11 @@ test "Main db not affected when fail to diskless load" { fail "Fail to stop the full sync" } + # Since we paused the primary node earlier, the replica may enter + # cluster down due to primary node pfail. Here set allow read to + # prevent subsequent read errors. + $replica config set cluster-allow-reads-when-down yes + # Replica keys and keys to slots map still both are right assert_equal {1} [$replica get $slot0_key] assert_equal $slot0_key [$replica CLUSTER GETKEYSINSLOT 0 1] From c6883b1576852ad4dfdf3668e5f7fc28acfcb7e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Mon, 13 Jan 2025 03:14:09 +0100 Subject: [PATCH 25/35] Test coverage for ECHO for reply schema validation (#1549) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After #1545 disabled some tests for reply schema validation, we now have another issue that ECHO is not covered. ``` WARNING! The following commands were not hit at all: echo ERROR! at least one command was not hit by the tests ``` This patch adds a test case for ECHO in the unit/other test suite. I haven't checked if there are more commands that aren't covered. 
Signed-off-by: Viktor Söderqvist --- .github/workflows/daily.yml | 2 +- tests/unit/other.tcl | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/daily.yml b/.github/workflows/daily.yml index edaa91a389..8af559ecd8 100644 --- a/.github/workflows/daily.yml +++ b/.github/workflows/daily.yml @@ -1150,7 +1150,7 @@ jobs: if: | (github.event_name == 'workflow_dispatch' || (github.event_name == 'schedule' && github.repository == 'valkey-io/valkey') || - (github.event_name == 'pull_request' && github.event.pull_request.base.ref != 'unstable')) && + (github.event_name == 'pull_request' && (contains(github.event.pull_request.labels.*.name, 'run-extra-tests') || github.event.pull_request.base.ref != 'unstable'))) && !contains(github.event.inputs.skipjobs, 'reply-schema') steps: - name: prep diff --git a/tests/unit/other.tcl b/tests/unit/other.tcl index 6e6230fc19..cb7d45b98f 100644 --- a/tests/unit/other.tcl +++ b/tests/unit/other.tcl @@ -30,6 +30,10 @@ start_server {tags {"other"}} { } } + test {Coverage: ECHO} { + assert_equal bang [r ECHO bang] + } + test {SAVE - make sure there are all the types as values} { # Wait for a background saving in progress to terminate waitForBgsave r From 224ccf9f8e202c33e3cca9cc833d582e0147f649 Mon Sep 17 00:00:00 2001 From: Sarthak Aggarwal Date: Thu, 31 Jul 2025 01:11:19 -0700 Subject: [PATCH 26/35] Try to stabilize aof test (#2399) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on @enjoy-binbin's suggestion on #1611, I made the change to find the available port. The test has been passing in the daily tests in my local repo. 
Resolved #1611 Signed-off-by: Sarthak Aggarwal Signed-off-by: Viktor Söderqvist --- tests/integration/aof.tcl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/integration/aof.tcl b/tests/integration/aof.tcl index 33c7c12d4b..a7f1fe21eb 100644 --- a/tests/integration/aof.tcl +++ b/tests/integration/aof.tcl @@ -689,9 +689,10 @@ tags {"aof cluster external:skip"} { append_to_manifest "file appendonly.aof.1.incr.aof seq 1 type i\n" } - start_server_aof [list dir $server_path cluster-enabled yes] { + start_server_aof [list dir $server_path cluster-enabled yes cluster-port [find_available_port $::baseport $::portcount]] { assert_equal [r ping] {PONG} } + clean_aof_persistence $aof_dirpath } } From 8c343f5312c4435a3b4397366fcc6281b05c5764 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Fri, 15 Aug 2025 12:55:03 +0200 Subject: [PATCH 27/35] Disable clang-format in 8.0 to simplify backporting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Viktor Söderqvist --- .github/workflows/clang-format.yml | 52 ------------------------------ 1 file changed, 52 deletions(-) delete mode 100644 .github/workflows/clang-format.yml diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml deleted file mode 100644 index efc63a1f6f..0000000000 --- a/.github/workflows/clang-format.yml +++ /dev/null @@ -1,52 +0,0 @@ -name: Clang Format Check - -on: - pull_request: - paths: - - 'src/**' - -concurrency: - group: clang-${{ github.head_ref || github.ref }} - cancel-in-progress: true - -jobs: - clang-format-check: - runs-on: ubuntu-latest - - steps: - - name: Checkout code - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Set up Clang - run: | - sudo apt-get update -y - sudo apt-get upgrade -y - sudo apt-get install software-properties-common -y - wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | gpg --dearmor | sudo tee 
/usr/share/keyrings/llvm-toolchain.gpg > /dev/null - echo "deb [signed-by=/usr/share/keyrings/llvm-toolchain.gpg] http://apt.llvm.org/$(lsb_release -cs)/ llvm-toolchain-$(lsb_release -cs)-18 main" | sudo tee /etc/apt/sources.list.d/llvm.list - sudo apt-get update -y - sudo apt-get install clang-format-18 -y - - name: Run clang-format - id: clang-format - run: | - # Run clang-format and capture the diff - cd src - shopt -s globstar - clang-format-18 -i **/*.c **/*.h - # Capture the diff output - DIFF=$(git diff) - if [ ! -z "$DIFF" ]; then - # Encode the diff in Base64 to ensure it's handled as a single line - ENCODED_DIFF=$(echo "$DIFF" | base64 -w 0) - echo "diff=$ENCODED_DIFF" >> $GITHUB_OUTPUT - fi - shell: bash - - - name: Check for formatting changes - if: ${{ steps.clang-format.outputs.diff }} - run: | - echo "ERROR: Code is not formatted correctly. Here is the diff:" - # Decode the Base64 diff to display it - echo "${{ steps.clang-format.outputs.diff }}" | base64 --decode - exit 1 - shell: bash From 2cc78d5c7dfeca992bfbe8ca31b853b32ad2d219 Mon Sep 17 00:00:00 2001 From: Ran Shidlansik Date: Mon, 14 Jul 2025 19:19:13 +0300 Subject: [PATCH 28/35] update build-debian-old to use Bullseye instead of EOL buster (#2345) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Ran Shidlansik Signed-off-by: Viktor Söderqvist --- .github/workflows/ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 6a1a9460fb..b21c9f3be8 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -70,7 +70,7 @@ jobs: build-debian-old: runs-on: ubuntu-latest - container: debian:buster + container: debian:bullseye steps: - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: make From fba379c5c59d7acabf4b6755a3bd22f703a47a55 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Viktor=20S=C3=B6derqvist?= Date: Fri, 15 Aug 2025 13:16:23 
+0200 Subject: [PATCH 29/35] Fix defrag timeouts in 8.0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Viktor Söderqvist --- tests/unit/memefficiency.tcl | 63 ++++++++++++++++++------------------ 1 file changed, 32 insertions(+), 31 deletions(-) diff --git a/tests/unit/memefficiency.tcl b/tests/unit/memefficiency.tcl index 67329f03f1..4dc04f6e69 100644 --- a/tests/unit/memefficiency.tcl +++ b/tests/unit/memefficiency.tcl @@ -14,7 +14,7 @@ proc test_memory_efficiency {range} { for {set j 0} {$j < 10000} {incr j} { $rd read ; # Discard replies } - + $rd close set current_mem [s used_memory] set used [expr {$current_mem-$base_mem}] set efficiency [expr {double($written)/$used}] @@ -37,6 +37,15 @@ start_server {tags {"memefficiency external:skip"}} { } run_solo {defrag} { + # Make a deferring client with CLIENT REPLY OFF sync with the server by + # temporarily setting CLIENT REPLY ON and waiting for the reply, then + # switching back to CLIENT REPLY OFF. 
+ proc client_reply_off_wait_for_server {rd} { + $rd client reply on + assert_equal OK [$rd read] + $rd client reply off + } + proc test_active_defrag {type} { if {[string match {*jemalloc*} [s mem_allocator]] && [r debug mallctl arenas.page] <= 8192} { test "Active defrag main dictionary: $type" { @@ -194,14 +203,12 @@ run_solo {defrag} { # Populate memory with interleaving script-key pattern of same size set dummy_script "--[string repeat x 400]\nreturn " set rd [valkey_deferring_client] + $rd client reply off for {set j 0} {$j < $n} {incr j} { set val "$dummy_script[format "%06d" $j]" $rd script load $val $rd set k$j $val - } - for {set j 0} {$j < $n} {incr j} { - $rd read ; # Discard script load replies - $rd read ; # Discard set replies + if {$j % 1000 == 999} {client_reply_off_wait_for_server $rd} } after 120 ;# serverCron only updates the info once in 100ms if {$::verbose} { @@ -213,8 +220,10 @@ run_solo {defrag} { assert_lessthan [s allocator_frag_ratio] 1.05 # Delete all the keys to create fragmentation - for {set j 0} {$j < $n} {incr j} { $rd del k$j } - for {set j 0} {$j < $n} {incr j} { $rd read } ; # Discard del replies + for {set j 0} {$j < $n} {incr j} { + $rd del k$j + if {$j % 1000 == 999} {client_reply_off_wait_for_server $rd} + } $rd close after 120 ;# serverCron only updates the info once in 100ms if {$::verbose} { @@ -287,15 +296,14 @@ run_solo {defrag} { # create big keys with 10k items set rd [valkey_deferring_client] + $rd client reply off for {set j 0} {$j < 10000} {incr j} { $rd hset bighash $j [concat "asdfasdfasdf" $j] $rd lpush biglist [concat "asdfasdfasdf" $j] $rd zadd bigzset $j [concat "asdfasdfasdf" $j] $rd sadd bigset [concat "asdfasdfasdf" $j] $rd xadd bigstream * item 1 value a - } - for {set j 0} {$j < 50000} {incr j} { - $rd read ; # Discard replies + if {$j % 100 == 99} {client_reply_off_wait_for_server $rd} } # create some small items (effective in cluster-enabled) @@ -311,9 +319,7 @@ run_solo {defrag} { # scale the hash to 
1m fields in order to have a measurable the latency for {set j 10000} {$j < 1000000} {incr j} { $rd hset bighash $j [concat "asdfasdfasdf" $j] - } - for {set j 10000} {$j < 1000000} {incr j} { - $rd read ; # Discard replies + if {$j % 1000 == 999} {client_reply_off_wait_for_server $rd} } # creating that big hash, increased used_memory, so the relative frag goes down set expected_frag 1.3 @@ -322,19 +328,16 @@ run_solo {defrag} { # add a mass of string keys for {set j 0} {$j < 500000} {incr j} { $rd setrange $j 150 a - } - for {set j 0} {$j < 500000} {incr j} { - $rd read ; # Discard replies + if {$j % 1000 == 999} {client_reply_off_wait_for_server $rd} } assert_equal [r dbsize] 500015 # create some fragmentation for {set j 0} {$j < 500000} {incr j 2} { $rd del $j + if {$j % 1000 == 998} {client_reply_off_wait_for_server $rd} } - for {set j 0} {$j < 500000} {incr j 2} { - $rd read ; # Discard replies - } + $rd close assert_equal [r dbsize] 250015 # start defrag @@ -440,8 +443,11 @@ run_solo {defrag} { assert_lessthan [s allocator_frag_ratio] 1.05 # Delete all the keys to create fragmentation - for {set j 0} {$j < $n} {incr j} { $rd del k$j } - for {set j 0} {$j < $n} {incr j} { $rd read } ; # Discard del replies + $rd client reply off + for {set j 0} {$j < $n} {incr j} { + $rd del k$j + if {$j % 1000 == 999} {client_reply_off_wait_for_server $rd} + } $rd close after 120 ;# serverCron only updates the info once in 100ms if {$::verbose} { @@ -519,6 +525,7 @@ run_solo {defrag} { # create big keys with 10k items set rd [valkey_deferring_client] + $rd client reply off set expected_frag 1.7 # add a mass of list nodes to two lists (allocations are interlaced) @@ -527,10 +534,7 @@ run_solo {defrag} { for {set j 0} {$j < $elements} {incr j} { $rd lpush biglist1 $val $rd lpush biglist2 $val - } - for {set j 0} {$j < $elements} {incr j} { - $rd read ; # Discard replies - $rd read ; # Discard replies + if {$j % 1000 == 999} {client_reply_off_wait_for_server $rd} } # create some 
fragmentation @@ -638,12 +642,11 @@ run_solo {defrag} { # add a mass of keys with 600 bytes values, fill the bin of 640 bytes which has 32 regs per slab. set rd [valkey_deferring_client] + $rd client reply off set keys 640000 for {set j 0} {$j < $keys} {incr j} { $rd setrange $j 600 x - } - for {set j 0} {$j < $keys} {incr j} { - $rd read ; # Discard replies + if {$j % 1000 == 999} {client_reply_off_wait_for_server $rd} } # create some fragmentation of 50% @@ -652,9 +655,7 @@ run_solo {defrag} { $rd del $j incr sent incr j 1 - } - for {set j 0} {$j < $sent} {incr j} { - $rd read ; # Discard replies + if {$j % 1000 == 998} {client_reply_off_wait_for_server $rd} } # create higher fragmentation in the first slab From 3751964a3ef124804b45ae866ae9d00cc6a5b18e Mon Sep 17 00:00:00 2001 From: Katie Holly Date: Wed, 18 Jun 2025 17:23:46 +0000 Subject: [PATCH 30/35] Ensure empty error tables in scripts don't crash Valkey (#2229) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling the command `EVAL error{} 0`, Valkey crashes with the following stack trace. This patch ensures we never leave the `err_info.msg` field null when we fail to extract a proper error message. 
``` === VALKEY BUG REPORT START: Cut & paste starting from here === 2595901:M 18 Jun 2025 01:20:12.917 # valkey 8.1.2 crashed by signal: 11, si_code: 1 2595901:M 18 Jun 2025 01:20:12.917 # Accessing address: (nil) 2595901:M 18 Jun 2025 01:20:12.917 # Crashed running the instruction at: 0x726f8e57ed1d ------ STACK TRACE ------ EIP: /usr/lib/libc.so.6(+0x16ed1d) [0x726f8e57ed1d] 2595905 bio_aof /usr/lib/libc.so.6(+0x9de22) [0x726f8e4ade22] /usr/lib/libc.so.6(+0x91fda) [0x726f8e4a1fda] /usr/lib/libc.so.6(+0x9264c) [0x726f8e4a264c] /usr/lib/libc.so.6(pthread_cond_wait+0x14e) [0x726f8e4a4d1e] valkey-server *:6379(bioProcessBackgroundJobs+0x1b4) [0x6530abb46db4] /usr/lib/libc.so.6(+0x957eb) [0x726f8e4a57eb] /usr/lib/libc.so.6(+0x11918c) [0x726f8e52918c] 2595904 bio_close_file /usr/lib/libc.so.6(+0x9de22) [0x726f8e4ade22] /usr/lib/libc.so.6(+0x91fda) [0x726f8e4a1fda] /usr/lib/libc.so.6(+0x9264c) [0x726f8e4a264c] /usr/lib/libc.so.6(pthread_cond_wait+0x14e) [0x726f8e4a4d1e] valkey-server *:6379(bioProcessBackgroundJobs+0x1b4) [0x6530abb46db4] /usr/lib/libc.so.6(+0x957eb) [0x726f8e4a57eb] /usr/lib/libc.so.6(+0x11918c) [0x726f8e52918c] 2595901 valkey-server * /usr/lib/libc.so.6(+0x3def0) [0x726f8e44def0] /usr/lib/libc.so.6(+0x16ed1d) [0x726f8e57ed1d] valkey-server *:6379(sdscatfmt+0x894) [0x6530abaa24a4] valkey-server *:6379(luaCallFunction+0x39a) [0x6530abbc66ea] valkey-server *:6379(+0x1a0992) [0x6530abbc6992] valkey-server *:6379(scriptingEngineCallFunction+0x98) [0x6530abbc1298] valkey-server *:6379(+0x11ff55) [0x6530abb45f55] valkey-server *:6379(call+0x174) [0x6530aba94454] valkey-server *:6379(processCommand+0x93d) [0x6530aba958dd] valkey-server *:6379(processCommandAndResetClient+0x21) [0x6530abaa9d11] valkey-server *:6379(processInputBuffer+0xe3) [0x6530abaaee83] valkey-server *:6379(readQueryFromClient+0x65) [0x6530abaaef55] valkey-server *:6379(+0x18e31a) [0x6530abbb431a] valkey-server *:6379(aeProcessEvents+0x24a) [0x6530aba790ca] valkey-server *:6379(aeMain+0x2d) 
[0x6530aba7938d] valkey-server *:6379(main+0x3f6) [0x6530aba6e7b6] /usr/lib/libc.so.6(+0x276b5) [0x726f8e4376b5] /usr/lib/libc.so.6(__libc_start_main+0x89) [0x726f8e437769] valkey-server *:6379(_start+0x25) [0x6530aba70235] 2595906 bio_lazy_free /usr/lib/libc.so.6(+0x9de22) [0x726f8e4ade22] /usr/lib/libc.so.6(+0x91fda) [0x726f8e4a1fda] /usr/lib/libc.so.6(+0x9264c) [0x726f8e4a264c] /usr/lib/libc.so.6(pthread_cond_wait+0x14e) [0x726f8e4a4d1e] valkey-server *:6379(bioProcessBackgroundJobs+0x1b4) [0x6530abb46db4] /usr/lib/libc.so.6(+0x957eb) [0x726f8e4a57eb] /usr/lib/libc.so.6(+0x11918c) [0x726f8e52918c] 4/4 expected stacktraces. ------ STACK TRACE DONE ------ ------ REGISTERS ------ 2595901:M 18 Jun 2025 01:20:12.920 # RAX:0000000000000000 RBX:0000726f8dd35663 RCX:0000000000000000 RDX:0000000000000000 RDI:0000000000000000 RSI:0000000000000010 RBP:00007ffc2b821a80 RSP:00007ffc2b821938 R8 :000000000000000c R9 :00006530abc111b8 R10:0000000000000001 R11:0000000000000003 R12:00006530abc49adc R13:00006530abc111b7 R14:0000000000000001 R15:0000000000000001 RIP:0000726f8e57ed1d EFL:0000000000010283 CSGSFS:002b000000000033 2595901:M 18 Jun 2025 01:20:12.921 * hide-user-data-from-log is on, skip logging stack content to avoid spilling user data. 
------ INFO OUTPUT ------ redis_version:7.2.4 server_name:valkey valkey_version:8.1.2 valkey_release_stage:ga redis_git_sha1:00000000 redis_git_dirty:0 redis_build_id:38d65aa7b4148d2c server_mode:standalone os:Linux 6.14.6-arch1-1 x86_64 arch_bits:64 monotonic_clock:POSIX clock_gettime multiplexing_api:epoll gcc_version:15.1.1 process_id:2595901 process_supervised:no run_id:a0b75f67a217a81142f17553028c010e86c1ee80 tcp_port:6379 server_time_usec:1750209612917634 uptime_in_seconds:16 uptime_in_days:0 hz:10 configured_hz:10 clients_hz:10 lru_clock:5379148 executable:/home/fusl/valkey-server config_file: io_threads_active:0 availability_zone: listener0:name=tcp,bind=*,bind=-::*,port=6379 connected_clients:1 cluster_connections:0 maxclients:10000 client_recent_max_input_buffer:0 client_recent_max_output_buffer:0 blocked_clients:0 tracking_clients:0 pubsub_clients:0 watching_clients:0 clients_in_timeout_table:0 total_watched_keys:0 total_blocking_keys:0 total_blocking_keys_on_nokey:0 paused_reason:none paused_actions:none paused_timeout_milliseconds:0 used_memory:911824 used_memory_human:890.45K used_memory_rss:15323136 used_memory_rss_human:14.61M used_memory_peak:911824 used_memory_peak_human:890.45K used_memory_peak_perc:100.29% used_memory_overhead:892232 used_memory_startup:891824 used_memory_dataset:19592 used_memory_dataset_perc:97.96% allocator_allocated:1845952 allocator_active:1986560 allocator_resident:6672384 allocator_muzzy:0 total_system_memory:67323842560 total_system_memory_human:62.70G used_memory_lua:34816 used_memory_vm_eval:34816 used_memory_lua_human:34.00K used_memory_scripts_eval:184 number_of_cached_scripts:1 number_of_functions:0 number_of_libraries:0 used_memory_vm_functions:33792 used_memory_vm_total:68608 used_memory_vm_total_human:67.00K used_memory_functions:224 used_memory_scripts:408 used_memory_scripts_human:408B maxmemory:0 maxmemory_human:0B maxmemory_policy:noeviction allocator_frag_ratio:1.00 allocator_frag_bytes:0 
allocator_rss_ratio:3.36 allocator_rss_bytes:4685824 rss_overhead_ratio:2.30 rss_overhead_bytes:8650752 mem_fragmentation_ratio:17.18 mem_fragmentation_bytes:14431168 mem_not_counted_for_evict:0 mem_replication_backlog:0 mem_total_replication_buffers:0 mem_clients_slaves:0 mem_clients_normal:0 mem_cluster_links:0 mem_aof_buffer:0 mem_allocator:jemalloc-5.3.0 mem_overhead_db_hashtable_rehashing:0 active_defrag_running:0 lazyfree_pending_objects:0 lazyfreed_objects:0 loading:0 async_loading:0 current_cow_peak:0 current_cow_size:0 current_cow_size_age:0 current_fork_perc:0.00 current_save_keys_processed:0 current_save_keys_total:0 rdb_changes_since_last_save:0 rdb_bgsave_in_progress:0 rdb_last_save_time:1750209596 rdb_last_bgsave_status:ok rdb_last_bgsave_time_sec:-1 rdb_current_bgsave_time_sec:-1 rdb_saves:0 rdb_last_cow_size:0 rdb_last_load_keys_expired:0 rdb_last_load_keys_loaded:0 aof_enabled:0 aof_rewrite_in_progress:0 aof_rewrite_scheduled:0 aof_last_rewrite_time_sec:-1 aof_current_rewrite_time_sec:-1 aof_last_bgrewrite_status:ok aof_rewrites:0 aof_rewrites_consecutive_failures:0 aof_last_write_status:ok aof_last_cow_size:0 module_fork_in_progress:0 module_fork_last_cow_size:0 total_connections_received:1 total_commands_processed:0 instantaneous_ops_per_sec:0 total_net_input_bytes:34 total_net_output_bytes:0 total_net_repl_input_bytes:0 total_net_repl_output_bytes:0 instantaneous_input_kbps:0.00 instantaneous_output_kbps:0.00 instantaneous_input_repl_kbps:0.00 instantaneous_output_repl_kbps:0.00 rejected_connections:0 sync_full:0 sync_partial_ok:0 sync_partial_err:0 expired_keys:0 expired_stale_perc:0.00 expired_time_cap_reached_count:0 expire_cycle_cpu_milliseconds:0 evicted_keys:0 evicted_clients:0 evicted_scripts:0 total_eviction_exceeded_time:0 current_eviction_exceeded_time:0 keyspace_hits:0 keyspace_misses:0 pubsub_channels:0 pubsub_patterns:0 pubsubshard_channels:0 latest_fork_usec:0 total_forks:0 migrate_cached_sockets:0 slave_expires_tracked_keys:0 
active_defrag_hits:0 active_defrag_misses:0 active_defrag_key_hits:0 active_defrag_key_misses:0 total_active_defrag_time:0 current_active_defrag_time:0 tracking_total_keys:0 tracking_total_items:0 tracking_total_prefixes:0 unexpected_error_replies:0 total_error_replies:0 dump_payload_sanitizations:0 total_reads_processed:1 total_writes_processed:0 io_threaded_reads_processed:0 io_threaded_writes_processed:0 io_threaded_freed_objects:0 io_threaded_accept_processed:0 io_threaded_poll_processed:0 io_threaded_total_prefetch_batches:0 io_threaded_total_prefetch_entries:0 client_query_buffer_limit_disconnections:0 client_output_buffer_limit_disconnections:0 reply_buffer_shrinks:0 reply_buffer_expands:0 eventloop_cycles:170 eventloop_duration_sum:17739 eventloop_duration_cmd_sum:0 instantaneous_eventloop_cycles_per_sec:9 instantaneous_eventloop_duration_usec:99 acl_access_denied_auth:0 acl_access_denied_cmd:0 acl_access_denied_key:0 acl_access_denied_channel:0 role:master connected_slaves:0 replicas_waiting_psync:0 master_failover_state:no-failover master_replid:d35a0bb7979f490a60174bb363524431d7eb2428 master_replid2:0000000000000000000000000000000000000000 master_repl_offset:0 second_repl_offset:-1 repl_backlog_active:0 repl_backlog_size:10485760 repl_backlog_first_byte_offset:0 repl_backlog_histlen:0 used_cpu_sys:0.012543 used_cpu_user:0.016853 used_cpu_sys_children:0.000000 used_cpu_user_children:0.000000 used_cpu_sys_main_thread:0.012440 used_cpu_user_main_thread:0.016714 cluster_enabled:0 ------ CLIENT LIST OUTPUT ------ id=2 addr=127.0.0.1:41372 laddr=127.0.0.1:6379 fd=10 name=*redacted* age=0 idle=0 flags=N capa= db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=12 multi-mem=0 rbs=16384 rbp=16384 obl=0 oll=0 omem=0 tot-mem=17060 events=r cmd=eval user=*redacted* redir=-1 resp=2 lib-name= lib-ver= tot-net-in=34 tot-net-out=0 tot-cmds=0 ------ CURRENT CLIENT INFO ------ id=2 addr=127.0.0.1:41372 laddr=127.0.0.1:6379 fd=10 name=*redacted* age=0 
idle=0 flags=N capa= db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=12 multi-mem=0 rbs=16384 rbp=16384 obl=0 oll=0 omem=0 tot-mem=17060 events=r cmd=eval user=*redacted* redir=-1 resp=2 lib-name= lib-ver= tot-net-in=34 tot-net-out=0 tot-cmds=0 argc: 3 argv[0]: "eval" argv[1]: 7 bytes argv[2]: 1 bytes ------ EXECUTING CLIENT INFO ------ id=2 addr=127.0.0.1:41372 laddr=127.0.0.1:6379 fd=10 name=*redacted* age=0 idle=0 flags=N capa= db=0 sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=0 argv-mem=12 multi-mem=0 rbs=16384 rbp=16384 obl=0 oll=0 omem=0 tot-mem=17060 events=r cmd=eval user=*redacted* redir=-1 resp=2 lib-name= lib-ver= tot-net-in=34 tot-net-out=0 tot-cmds=0 argc: 3 argv[0]: "eval" argv[1]: 7 bytes argv[2]: 1 bytes ------ MODULES INFO OUTPUT ------ ------ CONFIG DEBUG OUTPUT ------ repl-diskless-load disabled debug-context "" sanitize-dump-payload no lazyfree-lazy-user-del yes lazyfree-lazy-server-del yes import-mode no lazyfree-lazy-user-flush yes list-compress-depth 0 dual-channel-replication-enabled no repl-diskless-sync yes activedefrag no lazyfree-lazy-expire yes io-threads 1 replica-read-only yes client-query-buffer-limit 1gb slave-read-only yes lazyfree-lazy-eviction yes proto-max-bulk-len 512mb ------ FAST MEMORY TEST ------ 2595901:M 18 Jun 2025 01:20:12.921 # Bio worker thread #0 terminated 2595901:M 18 Jun 2025 01:20:12.921 # Bio worker thread #1 terminated 2595901:M 18 Jun 2025 01:20:12.921 # Bio worker thread #2 terminated *** Preparing to test memory region 6530abce2000 (212992 bytes) *** Preparing to test memory region 726f8af7f000 (2621440 bytes) *** Preparing to test memory region 726f8b200000 (8388608 bytes) *** Preparing to test memory region 726f8ba00000 (4194304 bytes) *** Preparing to test memory region 726f8bffe000 (8388608 bytes) *** Preparing to test memory region 726f8c7ff000 (8388608 bytes) *** Preparing to test memory region 726f8d000000 (8388608 bytes) *** Preparing to test memory region 726f8dc00000 
(4194304 bytes) *** Preparing to test memory region 726f8e290000 (16384 bytes) *** Preparing to test memory region 726f8e3d2000 (20480 bytes) *** Preparing to test memory region 726f8e5f8000 (32768 bytes) *** Preparing to test memory region 726f8eb58000 (12288 bytes) *** Preparing to test memory region 726f8eb5c000 (16384 bytes) *** Preparing to test memory region 726f8ed63000 (4096 bytes) *** Preparing to test memory region 726f8eef2000 (397312 bytes) *** Preparing to test memory region 726f8efc7000 (4096 bytes) .O.O.O.O.O.O.O.O.O.O.O.O.O.O.O.O Fast memory test PASSED, however your memory can still be broken. Please run a memory test for several hours if possible. ------ DUMPING CODE AROUND EIP ------ Symbol: (null) (base: (nil)) Module: /usr/lib/libc.so.6 (base 0x726f8e410000) $ xxd -r -p /tmp/dump.hex /tmp/dump.bin $ objdump --adjust-vma=(nil) -D -b binary -m i386:x86-64 /tmp/dump.bin ------ === VALKEY BUG REPORT END. Make sure to include from START to END. === ``` --------- Signed-off-by: Fusl Signed-off-by: Binbin Co-authored-by: Binbin Signed-off-by: Viktor Söderqvist --- src/script_lua.c | 5 +++++ tests/unit/scripting.tcl | 23 +++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/src/script_lua.c b/src/script_lua.c index 5fbdf743f9..e6df679196 100644 --- a/src/script_lua.c +++ b/src/script_lua.c @@ -1616,6 +1616,11 @@ void luaExtractErrorInformation(lua_State *lua, errorInfo *err_info) { err_info->ignore_err_stats_update = lua_toboolean(lua, -1); } lua_pop(lua, 1); + + if (err_info->msg == NULL) { + /* Ensure we never return a NULL msg. 
*/ + err_info->msg = sdsnew("ERR unknown error"); + } } void luaCallFunction(scriptRunCtx *run_ctx, diff --git a/tests/unit/scripting.tcl b/tests/unit/scripting.tcl index db37129103..3f9b2cf174 100644 --- a/tests/unit/scripting.tcl +++ b/tests/unit/scripting.tcl @@ -2457,4 +2457,27 @@ start_server {tags {"scripting"}} { assert { [r memory usage foo] <= $expected_memory}; } } + + test {EVAL - Scripts support NULL byte} { + assert_equal [r eval "return \"\x00\";" 0] "\x00" + # Using a null byte never seemed to work with functions, so + # we don't have a test for that case. + } + + test {EVAL - explicit error() call handling} { + # error("simple string error") + assert_error {ERR user_script:1: simple string error script: *} { + r eval "error('simple string error')" 0 + } + + # error({"err": "ERR table error"}) + assert_error {ERR table error script: *} { + r eval "error({err='ERR table error'})" 0 + } + + # error({}) + assert_error {ERR unknown error script: *} { + r eval "error({})" 0 + } + } } From 0e7d648526daf147cbe20ebd33ebdfd70f6796e9 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 22 Jul 2025 17:19:02 +0800 Subject: [PATCH 31/35] Fix client tracking memory overhead calculation (#2360) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This should be + instread of *, otherwise it does not make any sense. Otherwise we would have to calculate 20 more bytes for each prefix rax node in 64 bits build. 
Signed-off-by: Binbin Signed-off-by: Viktor Söderqvist --- src/networking.c | 2 +- tests/test_helper.tcl | 24 ++++++++++++++++++++++++ tests/unit/introspection.tcl | 22 ---------------------- tests/unit/tracking.tcl | 28 ++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 23 deletions(-) diff --git a/src/networking.c b/src/networking.c index 2cb126c9e2..2d748c812b 100644 --- a/src/networking.c +++ b/src/networking.c @@ -4335,7 +4335,7 @@ size_t getClientMemoryUsage(client *c, size_t *output_buffer_mem_usage) { /* Add memory overhead of the tracking prefixes, this is an underestimation so we don't need to traverse the entire * rax */ if (c->client_tracking_prefixes) - mem += c->client_tracking_prefixes->numnodes * (sizeof(raxNode) * sizeof(raxNode *)); + mem += c->client_tracking_prefixes->numnodes * (sizeof(raxNode) + sizeof(raxNode *)); return mem; } diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index feee94ef80..1748049081 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -248,6 +248,30 @@ proc CI {index field} { getInfoProperty [R $index cluster info] $field } +# Provide easy access to CLIENT INFO properties from CLIENT INFO string. +proc get_field_in_client_info {info field} { + set info [string trim $info] + foreach item [split $info " "] { + set kv [split $item "="] + set k [lindex $kv 0] + if {[string match $field $k]} { + return [lindex $kv 1] + } + } + return "" +} + +# Provide easy access to CLIENT INFO properties from CLIENT LIST string. +proc get_field_in_client_list {id client_list filed} { + set list [split $client_list "\r\n"] + foreach info $list { + if {[string match "id=$id *" $info] } { + return [get_field_in_client_info $info $filed] + } + } + return "" +} + # Test wrapped into run_solo are sent back from the client to the # test server, so that the test server will send them again to # clients once the clients are idle. 
diff --git a/tests/unit/introspection.tcl b/tests/unit/introspection.tcl index 286b02b7d0..ae992bc51b 100644 --- a/tests/unit/introspection.tcl +++ b/tests/unit/introspection.tcl @@ -19,28 +19,6 @@ start_server {tags {"introspection"}} { r client info } {id=* addr=*:* laddr=*:* fd=* name=* age=* idle=* flags=N db=* sub=0 psub=0 ssub=0 multi=-1 watch=0 qbuf=0 qbuf-free=* argv-mem=* multi-mem=0 rbs=* rbp=* obl=0 oll=0 omem=0 tot-mem=* events=r cmd=client|info user=* redir=-1 resp=* lib-name=* lib-ver=* tot-net-in=* tot-net-out=* tot-cmds=*} - proc get_field_in_client_info {info field} { - set info [string trim $info] - foreach item [split $info " "] { - set kv [split $item "="] - set k [lindex $kv 0] - if {[string match $field $k]} { - return [lindex $kv 1] - } - } - return "" - } - - proc get_field_in_client_list {id client_list filed} { - set list [split $client_list "\r\n"] - foreach info $list { - if {[string match "id=$id *" $info] } { - return [get_field_in_client_info $info $filed] - } - } - return "" - } - proc get_client_tot_in_out_cmds {id} { set info_list [r client list] set in [get_field_in_client_list $id $info_list "tot-net-in"] diff --git a/tests/unit/tracking.tcl b/tests/unit/tracking.tcl index 313293dcb7..73bfd4b1e9 100644 --- a/tests/unit/tracking.tcl +++ b/tests/unit/tracking.tcl @@ -36,6 +36,34 @@ start_server {tags {"tracking network logreqres:skip"}} { } } + test {Client tracking prefixes memory overhead} { + r CLIENT TRACKING off + set tot_mem_before [get_field_in_client_info [r client info] "tot-mem"] + + # We add multiple $i to prefix to avoid prefix conflicts, so in this + # args we will have about 20000 rax nodes. 
+ set args {} + for {set i 0} {$i < 10240} {incr i} { + lappend args PREFIX + lappend args PREFIX-$i-$i-$i-$i-$i-$i-$i + } + r CLIENT TRACKING on BCAST {*}$args + + set arch_bits [s arch_bits] + set tot_mem_after [get_field_in_client_info [r client info] "tot-mem"] + set diff [expr $tot_mem_after - $tot_mem_before] + + # In 64 bits, before we would consume about 20000 * (4 * 8), that is 640000. + # And now we are 20000 * (4 + 8), that is 240000. + if {$arch_bits == 64} { + assert_lessthan $diff 300000 + } elseif {$arch_bits == 32} { + assert_lessthan $diff 200000 + } + + r CLIENT TRACKING off + } + test {Clients are able to enable tracking and redirect it} { r CLIENT TRACKING on REDIRECT $redir_id } {*OK} From e05b7d187fdacc599c5498ff1af35edf0b323183 Mon Sep 17 00:00:00 2001 From: Harkrishn Patro Date: Fri, 20 Jun 2025 15:07:13 -0700 Subject: [PATCH 32/35] Converge divergent shard-id persisted in nodes.conf to primary's shard id (#2174) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes #2171 Handle divergent shard-id across primary and replica from nodes.conf and reconcile all the nodes in the shard to the primary node's shard-id. 
--------- Signed-off-by: Harkrishn Patro Signed-off-by: Viktor Söderqvist --- src/cluster_legacy.c | 24 ++++++++++++++----- tests/assets/divergent-shard-1.conf | 3 +++ tests/assets/divergent-shard-2.conf | 3 +++ tests/assets/divergent-shard-3.conf | 3 +++ tests/assets/divergent-shard-4.conf | 3 +++ tests/test_helper.tcl | 1 + .../divergent-cluster-shardid-conf.tcl | 17 +++++++++++++ 7 files changed, 48 insertions(+), 6 deletions(-) create mode 100644 tests/assets/divergent-shard-1.conf create mode 100644 tests/assets/divergent-shard-2.conf create mode 100644 tests/assets/divergent-shard-3.conf create mode 100644 tests/assets/divergent-shard-4.conf create mode 100644 tests/unit/cluster/divergent-cluster-shardid-conf.tcl diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 5d4e85d882..5789c21c57 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -293,10 +293,16 @@ int auxShardIdSetter(clusterNode *n, void *value, size_t length) { } memcpy(n->shard_id, value, CLUSTER_NAMELEN); /* if n already has replicas, make sure they all agree - * on the shard id */ + * on the shard id. If not, update them. */ for (int i = 0; i < n->num_replicas; i++) { if (memcmp(n->replicas[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) { - return C_ERR; + serverLog(LL_NOTICE, + "Node %.40s has a different shard id (%.40s) than its primary's shard id %.40s (%.40s). 
" + "Updating replica's shard id to match primary's shard id.", + n->replicas[i]->name, n->replicas[i]->shard_id, n->name, n->shard_id); + clusterRemoveNodeFromShard(n->replicas[i]); + memcpy(n->replicas[i]->shard_id, n->shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(n->shard_id, n->replicas[i]); } } clusterAddNodeToShard(value, n); @@ -696,10 +702,16 @@ int clusterLoadConfig(char *filename) { clusterAddNodeToShard(primary->shard_id, n); } else if (clusterGetNodesInMyShard(primary) != NULL && memcmp(primary->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) { - /* If the primary has been added to a shard, make sure this - * node has the same persisted shard id as the primary. */ - sdsfreesplitres(argv, argc); - goto fmterr; + /* If the primary has been added to a shard and this replica has + * a different shard id stored in nodes.conf, update it to match + * the primary instead of aborting the startup. */ + serverLog(LL_NOTICE, + "Node %.40s has a different shard id (%.40s) than its primary %.40s (%.40s). 
" + "Updating replica's shard id to match primary's shard id.", + n->name, n->shard_id, primary->name, primary->shard_id); + clusterRemoveNodeFromShard(n); + memcpy(n->shard_id, primary->shard_id, CLUSTER_NAMELEN); + clusterAddNodeToShard(primary->shard_id, n); } n->replicaof = primary; clusterNodeAddReplica(primary, n); diff --git a/tests/assets/divergent-shard-1.conf b/tests/assets/divergent-shard-1.conf new file mode 100644 index 0000000000..6f347d0470 --- /dev/null +++ b/tests/assets/divergent-shard-1.conf @@ -0,0 +1,3 @@ +43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd myself,master - 0 1749488968682 13 connected 0-16383 +8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected +vars currentEpoch 13 lastVoteEpoch 9 \ No newline at end of file diff --git a/tests/assets/divergent-shard-2.conf b/tests/assets/divergent-shard-2.conf new file mode 100644 index 0000000000..3f7d75c947 --- /dev/null +++ b/tests/assets/divergent-shard-2.conf @@ -0,0 +1,3 @@ +8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected +43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd myself,master - 0 1749488968682 13 connected 0-16383 +vars currentEpoch 13 lastVoteEpoch 9 \ No newline at end of file diff --git a/tests/assets/divergent-shard-3.conf b/tests/assets/divergent-shard-3.conf new file mode 100644 index 0000000000..363837f436 --- /dev/null +++ b/tests/assets/divergent-shard-3.conf @@ -0,0 +1,3 @@ +43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd master - 0 1749488968682 13 connected 0-16383 
+8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd myself,slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected +vars currentEpoch 13 lastVoteEpoch 9 \ No newline at end of file diff --git a/tests/assets/divergent-shard-4.conf b/tests/assets/divergent-shard-4.conf new file mode 100644 index 0000000000..5fcd97bfad --- /dev/null +++ b/tests/assets/divergent-shard-4.conf @@ -0,0 +1,3 @@ +8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd myself,slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected +43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd master - 0 1749488968682 13 connected 0-16383 +vars currentEpoch 13 lastVoteEpoch 9 \ No newline at end of file diff --git a/tests/test_helper.tcl b/tests/test_helper.tcl index 1748049081..58781dda0f 100644 --- a/tests/test_helper.tcl +++ b/tests/test_helper.tcl @@ -288,6 +288,7 @@ proc cleanup {} { if {!$::quiet} {puts -nonewline "Cleanup: may take some time... 
"} flush stdout catch {exec rm -rf {*}[glob tests/tmp/valkey.conf.*]} + catch {exec rm -rf {*}[glob tests/tmp/nodes.conf.*]} catch {exec rm -rf {*}[glob tests/tmp/server*.*]} catch {exec rm -rf {*}[glob tests/tmp/*.acl.*]} if {!$::quiet} {puts "OK"} diff --git a/tests/unit/cluster/divergent-cluster-shardid-conf.tcl b/tests/unit/cluster/divergent-cluster-shardid-conf.tcl new file mode 100644 index 0000000000..6885eac214 --- /dev/null +++ b/tests/unit/cluster/divergent-cluster-shardid-conf.tcl @@ -0,0 +1,17 @@ +tags {external:skip cluster singledb} { + set old_singledb $::singledb + set ::singledb 1 + # Start a cluster with a divergent shard ID configuration + test "divergent cluster shardid conflict" { + for {set i 1} {$i <= 4} {incr i} { + if {$::verbose} { puts "Testing for tests/assets/divergent-shard-$i.conf"; flush stdout;} + exec cp -f tests/assets/divergent-shard-$i.conf tests/tmp/nodes.conf.divergent + start_server {overrides {"cluster-enabled" "yes" "cluster-config-file" "../nodes.conf.divergent"}} { + set shardid [r CLUSTER MYSHARDID] + set count [exec grep -c $shardid tests/tmp/nodes.conf.divergent]; + assert_equal $count 2 "Expect shard ID to be present twice in the configuration file" + } + } + } + set ::singledb $old_singledb +} From 6dc4112f899714fe6b9994976e05848bfba1c348 Mon Sep 17 00:00:00 2001 From: Ted Lyngmo Date: Wed, 20 Aug 2025 22:48:31 +0200 Subject: [PATCH 33/35] Fix assumptions that pthread functions set errno (#2526) pthread functions return the error instead of setting errno. Fixes #2525 Signed-off-by: Ted Lyngmo --- src/bio.c | 10 ++++++---- src/io_threads.c | 5 +++-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/bio.c b/src/bio.c index e55c729f74..ab07581ba9 100644 --- a/src/bio.c +++ b/src/bio.c @@ -142,8 +142,9 @@ void bioInit(void) { * responsible for. 
*/ for (j = 0; j < BIO_WORKER_NUM; j++) { void *arg = (void *)(unsigned long)j; - if (pthread_create(&thread, &attr, bioProcessBackgroundJobs, arg) != 0) { - serverLog(LL_WARNING, "Fatal: Can't initialize Background Jobs. Error message: %s", strerror(errno)); + int err = pthread_create(&thread, &attr, bioProcessBackgroundJobs, arg); + if (err) { + serverLog(LL_WARNING, "Fatal: Can't initialize Background Jobs. Error message: %s", strerror(err)); exit(1); } bio_threads[j] = thread; @@ -222,8 +223,9 @@ void *bioProcessBackgroundJobs(void *arg) { * receive the watchdog signal. */ sigemptyset(&sigset); sigaddset(&sigset, SIGALRM); - if (pthread_sigmask(SIG_BLOCK, &sigset, NULL)) - serverLog(LL_WARNING, "Warning: can't mask SIGALRM in bio.c thread: %s", strerror(errno)); + int err = pthread_sigmask(SIG_BLOCK, &sigset, NULL); + if (err) + serverLog(LL_WARNING, "Warning: can't mask SIGALRM in bio.c thread: %s", strerror(err)); while (1) { listNode *ln; diff --git a/src/io_threads.c b/src/io_threads.c index 5b2230f635..c925b0f67c 100644 --- a/src/io_threads.c +++ b/src/io_threads.c @@ -265,8 +265,9 @@ static void createIOThread(int id) { pthread_mutex_init(&io_threads_mutex[id], NULL); IOJobQueue_init(&io_jobs[id], IO_JOB_QUEUE_SIZE); pthread_mutex_lock(&io_threads_mutex[id]); /* Thread will be stopped. 
*/ - if (pthread_create(&tid, NULL, IOThreadMain, (void *)(long)id) != 0) { - serverLog(LL_WARNING, "Fatal: Can't initialize IO thread, pthread_create failed with: %s", strerror(errno)); + int err = pthread_create(&tid, NULL, IOThreadMain, (void *)(long)id); + if (err) { + serverLog(LL_WARNING, "Fatal: Can't initialize IO thread, pthread_create failed with: %s", strerror(err)); exit(1); } io_threads[id] = tid; From 41668254806c598611a681a3a6b680bb4de93170 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 22 Aug 2025 10:24:54 +0800 Subject: [PATCH 34/35] Fix pre-size hashtables per slot when reading RDB files (#2466) When reading RDB files with information about the number of keys per cluster slot, we need to create the dicts if they don't exist. Currently, when processing RDB slot-info, our expand has no effect because the dict does not exist (we initialize it only when we need it). We also update kvstoreExpand to use the kvstoreDictExpand to make sure there is only one code path. Also see #1199 for more details. Signed-off-by: Binbin --- src/kvstore.c | 25 +++++++++++++++++++------ src/kvstore.h | 2 ++ src/rdb.c | 4 ++-- src/unit/test_files.h | 3 ++- src/unit/test_kvstore.c | 17 +++++++++++++++++ 5 files changed, 42 insertions(+), 9 deletions(-) diff --git a/src/kvstore.c b/src/kvstore.c index d476b87baf..0012b5053b 100644 --- a/src/kvstore.c +++ b/src/kvstore.c @@ -425,10 +425,11 @@ int kvstoreExpand(kvstore *kvs, uint64_t newsize, int try_expand, kvstoreExpandS if (newsize == 0) return 1; for (int i = 0; i < kvs->num_dicts; i++) { if (skip_cb && skip_cb(i)) continue; - /* If the dictionary doesn't exist, create it */ - dict *d = createDictIfNeeded(kvs, i); - int result = try_expand ? 
dictTryExpand(d, newsize) : dictExpand(d, newsize); - if (try_expand && result == DICT_ERR) return 0; + if (try_expand) { + if (kvstoreDictTryExpand(kvs, i, newsize) == DICT_ERR) return 0; + } else { + kvstoreDictExpand(kvs, i, newsize); + } } return 1; @@ -673,6 +674,12 @@ unsigned long kvstoreDictSize(kvstore *kvs, int didx) { return dictSize(d); } +unsigned long kvstoreDictBuckets(kvstore *kvs, int didx) { + dict *d = kvstoreGetDict(kvs, didx); + if (!d) return 0; + return dictBuckets(d); +} + kvstoreDictIterator *kvstoreGetDictIterator(kvstore *kvs, int didx) { kvstoreDictIterator *kvs_di = zmalloc(sizeof(*kvs_di)); kvs_di->kvs = kvs; @@ -729,11 +736,17 @@ unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, int didx, dictEntry **des, uns } int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size) { - dict *d = kvstoreGetDict(kvs, didx); - if (!d) return DICT_ERR; + if (size == 0) return DICT_ERR; + dict *d = createDictIfNeeded(kvs, didx); return dictExpand(d, size); } +int kvstoreDictTryExpand(kvstore *kvs, int didx, unsigned long size) { + if (size == 0) return DICT_ERR; + dict *d = createDictIfNeeded(kvs, didx); + return dictTryExpand(d, size); +} + unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, diff --git a/src/kvstore.h b/src/kvstore.h index 81a0d9a96e..7bdfb20aaf 100644 --- a/src/kvstore.h +++ b/src/kvstore.h @@ -56,6 +56,7 @@ unsigned long kvstoreDictRehashingCount(kvstore *kvs); /* Specific dict access by dict-index */ unsigned long kvstoreDictSize(kvstore *kvs, int didx); +unsigned long kvstoreDictBuckets(kvstore *kvs, int didx); kvstoreDictIterator *kvstoreGetDictIterator(kvstore *kvs, int didx); kvstoreDictIterator *kvstoreGetDictSafeIterator(kvstore *kvs, int didx); void kvstoreReleaseDictIterator(kvstoreDictIterator *kvs_id); @@ -64,6 +65,7 @@ dictEntry *kvstoreDictGetRandomKey(kvstore *kvs, int didx); dictEntry *kvstoreDictGetFairRandomKey(kvstore *kvs, int didx); unsigned int kvstoreDictGetSomeKeys(kvstore *kvs, 
int didx, dictEntry **des, unsigned int count); int kvstoreDictExpand(kvstore *kvs, int didx, unsigned long size); +int kvstoreDictTryExpand(kvstore *kvs, int didx, unsigned long size); unsigned long kvstoreDictScanDefrag(kvstore *kvs, int didx, unsigned long v, diff --git a/src/rdb.c b/src/rdb.c index af68b67e78..e5d0b7fa67 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3157,8 +3157,8 @@ int rdbLoadRioWithLoadingCtx(rio *rdb, int rdbflags, rdbSaveInfo *rsi, rdbLoadin if (server.cluster_enabled) { /* In cluster mode we resize individual slot specific dictionaries based on the number of keys that * slot holds. */ - kvstoreDictExpand(db->keys, slot_id, slot_size); - kvstoreDictExpand(db->expires, slot_id, expires_slot_size); + if (slot_size) kvstoreDictExpand(db->keys, slot_id, slot_size); + if (expires_slot_size) kvstoreDictExpand(db->expires, slot_id, expires_slot_size); should_expand_db = 0; } } else { diff --git a/src/unit/test_files.h b/src/unit/test_files.h index 7a63ba5f2e..4e617276d5 100644 --- a/src/unit/test_files.h +++ b/src/unit/test_files.h @@ -33,6 +33,7 @@ int test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, in int test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags); int test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict(int argc, char **argv, int flags); int test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, int flags); +int test_kvstoreDictExpand(int argc, char **argv, int flags); int test_listpackCreateIntList(int argc, char **argv, int flags); int test_listpackCreateList(int argc, char **argv, int flags); int test_listpackLpPrepend(int argc, char **argv, int flags); @@ -144,7 +145,7 @@ unitTest __test_crc64combine_c[] = {{"test_crc64combine", test_crc64combine}, {N unitTest __test_dict_c[] = {{"test_dictCreate", test_dictCreate}, {"test_dictAdd16Keys", test_dictAdd16Keys}, {"test_dictDisableResize", test_dictDisableResize}, {"test_dictAddOneKeyTriggerResize", 
test_dictAddOneKeyTriggerResize}, {"test_dictDeleteKeys", test_dictDeleteKeys}, {"test_dictDeleteOneKeyTriggerResize", test_dictDeleteOneKeyTriggerResize}, {"test_dictEmptyDirAdd128Keys", test_dictEmptyDirAdd128Keys}, {"test_dictDisableResizeReduceTo3", test_dictDisableResizeReduceTo3}, {"test_dictDeleteOneKeyTriggerResizeAgain", test_dictDeleteOneKeyTriggerResizeAgain}, {"test_dictBenchmark", test_dictBenchmark}, {NULL, NULL}}; unitTest __test_endianconv_c[] = {{"test_endianconv", test_endianconv}, {NULL, NULL}}; unitTest __test_intset_c[] = {{"test_intsetValueEncodings", test_intsetValueEncodings}, {"test_intsetBasicAdding", test_intsetBasicAdding}, {"test_intsetLargeNumberRandomAdd", test_intsetLargeNumberRandomAdd}, {"test_intsetUpgradeFromint16Toint32", test_intsetUpgradeFromint16Toint32}, {"test_intsetUpgradeFromint16Toint64", test_intsetUpgradeFromint16Toint64}, {"test_intsetUpgradeFromint32Toint64", test_intsetUpgradeFromint32Toint64}, {"test_intsetStressLookups", test_intsetStressLookups}, {"test_intsetStressAddDelete", test_intsetStressAddDelete}, {NULL, NULL}}; -unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {NULL, NULL}}; +unitTest __test_kvstore_c[] = {{"test_kvstoreAdd16Keys", test_kvstoreAdd16Keys}, {"test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict", 
test_kvstoreDictIteratorRemoveAllKeysNoDeleteEmptyDict}, {"test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict", test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict}, {"test_kvstoreDictExpand", test_kvstoreDictExpand}, {NULL, NULL}}; unitTest __test_listpack_c[] = {{"test_listpackCreateIntList", test_listpackCreateIntList}, {"test_listpackCreateList", test_listpackCreateList}, {"test_listpackLpPrepend", test_listpackLpPrepend}, {"test_listpackLpPrependInteger", test_listpackLpPrependInteger}, {"test_listpackGetELementAtIndex", test_listpackGetELementAtIndex}, {"test_listpackPop", test_listpackPop}, {"test_listpackGetELementAtIndex2", test_listpackGetELementAtIndex2}, {"test_listpackIterate0toEnd", test_listpackIterate0toEnd}, {"test_listpackIterate1toEnd", test_listpackIterate1toEnd}, {"test_listpackIterate2toEnd", test_listpackIterate2toEnd}, {"test_listpackIterateBackToFront", test_listpackIterateBackToFront}, {"test_listpackIterateBackToFrontWithDelete", test_listpackIterateBackToFrontWithDelete}, {"test_listpackDeleteWhenNumIsMinusOne", test_listpackDeleteWhenNumIsMinusOne}, {"test_listpackDeleteWithNegativeIndex", test_listpackDeleteWithNegativeIndex}, {"test_listpackDeleteInclusiveRange0_0", test_listpackDeleteInclusiveRange0_0}, {"test_listpackDeleteInclusiveRange0_1", test_listpackDeleteInclusiveRange0_1}, {"test_listpackDeleteInclusiveRange1_2", test_listpackDeleteInclusiveRange1_2}, {"test_listpackDeleteWitStartIndexOutOfRange", test_listpackDeleteWitStartIndexOutOfRange}, {"test_listpackDeleteWitNumOverflow", test_listpackDeleteWitNumOverflow}, {"test_listpackBatchDelete", test_listpackBatchDelete}, {"test_listpackDeleteFooWhileIterating", test_listpackDeleteFooWhileIterating}, {"test_listpackReplaceWithSameSize", test_listpackReplaceWithSameSize}, {"test_listpackReplaceWithDifferentSize", test_listpackReplaceWithDifferentSize}, {"test_listpackRegressionGt255Bytes", test_listpackRegressionGt255Bytes}, {"test_listpackCreateLongListAndCheckIndices", 
test_listpackCreateLongListAndCheckIndices}, {"test_listpackCompareStrsWithLpEntries", test_listpackCompareStrsWithLpEntries}, {"test_listpackLpMergeEmptyLps", test_listpackLpMergeEmptyLps}, {"test_listpackLpMergeLp1Larger", test_listpackLpMergeLp1Larger}, {"test_listpackLpMergeLp2Larger", test_listpackLpMergeLp2Larger}, {"test_listpackLpNextRandom", test_listpackLpNextRandom}, {"test_listpackLpNextRandomCC", test_listpackLpNextRandomCC}, {"test_listpackRandomPairWithOneElement", test_listpackRandomPairWithOneElement}, {"test_listpackRandomPairWithManyElements", test_listpackRandomPairWithManyElements}, {"test_listpackRandomPairsWithOneElement", test_listpackRandomPairsWithOneElement}, {"test_listpackRandomPairsWithManyElements", test_listpackRandomPairsWithManyElements}, {"test_listpackRandomPairsUniqueWithOneElement", test_listpackRandomPairsUniqueWithOneElement}, {"test_listpackRandomPairsUniqueWithManyElements", test_listpackRandomPairsUniqueWithManyElements}, {"test_listpackPushVariousEncodings", test_listpackPushVariousEncodings}, {"test_listpackLpFind", test_listpackLpFind}, {"test_listpackLpValidateIntegrity", test_listpackLpValidateIntegrity}, {"test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN", test_listpackNumberOfElementsExceedsLP_HDR_NUMELE_UNKNOWN}, {"test_listpackStressWithRandom", test_listpackStressWithRandom}, {"test_listpackSTressWithVariableSize", test_listpackSTressWithVariableSize}, {"test_listpackBenchmarkInit", test_listpackBenchmarkInit}, {"test_listpackBenchmarkLpAppend", test_listpackBenchmarkLpAppend}, {"test_listpackBenchmarkLpFindString", test_listpackBenchmarkLpFindString}, {"test_listpackBenchmarkLpFindNumber", test_listpackBenchmarkLpFindNumber}, {"test_listpackBenchmarkLpSeek", test_listpackBenchmarkLpSeek}, {"test_listpackBenchmarkLpValidateIntegrity", test_listpackBenchmarkLpValidateIntegrity}, {"test_listpackBenchmarkLpCompareWithString", test_listpackBenchmarkLpCompareWithString}, 
{"test_listpackBenchmarkLpCompareWithNumber", test_listpackBenchmarkLpCompareWithNumber}, {"test_listpackBenchmarkFree", test_listpackBenchmarkFree}, {NULL, NULL}}; unitTest __test_rax_c[] = {{"test_raxRecompressHugeKey", test_raxRecompressHugeKey}, {NULL, NULL}}; unitTest __test_sds_c[] = {{"test_sds", test_sds}, {"test_typesAndAllocSize", test_typesAndAllocSize}, {"test_sdsHeaderSizes", test_sdsHeaderSizes}, {NULL, NULL}}; diff --git a/src/unit/test_kvstore.c b/src/unit/test_kvstore.c index b3eff7d132..5832bf2c7f 100644 --- a/src/unit/test_kvstore.c +++ b/src/unit/test_kvstore.c @@ -203,3 +203,20 @@ int test_kvstoreDictIteratorRemoveAllKeysDeleteEmptyDict(int argc, char **argv, kvstoreRelease(kvs2); return 0; } + +int test_kvstoreDictExpand(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + kvstore *kvs = kvstoreCreate(&KvstoreDictTestType, 0, KVSTORE_ALLOCATE_DICTS_ON_DEMAND | KVSTORE_FREE_EMPTY_DICTS); + + TEST_ASSERT(kvstoreGetDict(kvs, 0) == NULL); + TEST_ASSERT(kvstoreDictExpand(kvs, 0, 10000) == DICT_OK); + TEST_ASSERT(kvstoreGetDict(kvs, 0) != NULL); + TEST_ASSERT(kvstoreBuckets(kvs) > 0); + TEST_ASSERT(kvstoreBuckets(kvs) == kvstoreDictBuckets(kvs, 0)); + + kvstoreRelease(kvs); + return 0; +} From b20251cf5fdd6375e838e72b5784fe11b665ad09 Mon Sep 17 00:00:00 2001 From: Nikhil Manglore Date: Fri, 27 Jun 2025 20:50:25 +0000 Subject: [PATCH 35/35] Add release notes for 8.0.5 and update version file MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Viktor Söderqvist Signed-off-by: Nikhil Manglore Signed-off-by: Viktor Söderqvist --- 00-RELEASENOTES | 45 +++++++++++++++++++++++++++++++++++++++++++++ src/version.h | 4 ++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/00-RELEASENOTES b/00-RELEASENOTES index 17e7d82f1f..7b1937b2ee 100644 --- a/00-RELEASENOTES +++ b/00-RELEASENOTES @@ -9,6 +9,51 @@ CRITICAL: There is a critical bug affecting MOST USERS. 
Upgrade ASAP. SECURITY: There are security fixes in the release. -------------------------------------------------------------------------------- +================================================================================ +Valkey 8.0.5 - Released Thu 22 Aug 2025 +================================================================================ + +Upgrade urgency SECURITY: This release includes security fixes we recommend you +apply as soon as possible. + +Bug fixes +========= +* Fix clients remaining blocked when reprocessing commands after certain + blocking operations (#2109) +* Fix a memory corruption issue in the sharded pub/sub unsubscribe logic (#2137) +* Fix potential memory leak by ensuring module context is freed when `aux_save2` + callback writes no data (#2132) +* Fix `CLIENT UNBLOCK` triggering unexpected errors when used on paused clients + (#2117) +* Fix missing NULL check on `SSL_new()` when creating outgoing TLS connections + (#2140) +* Fix incorrect casting of ping extension lengths to prevent silent packet drops + (#2144) +* Fix replica failover stall due to outdated config epoch (#2178) +* Fix incorrect port/tls-port info in `CLUSTER SLOTS`/`CLUSTER NODES` after + dynamic config change (#2186) +* Ensure empty error tables in Lua scripts don't crash Valkey (#2229) +* Fix client tracking memory overhead calculation (#2360) +* Handle divergent shard-id from nodes.conf and reconcile to the primary node's + shard-id (#2174) +* Fix pre-size hashtables per slot when reading RDB files (#2466) + +Behavior changes +================ +* Trigger election immediately during a forced manual failover (`CLUSTER + FAILOVER FORCE`) to avoid delay (#1067) +* Reset ongoing election state when initiating a new manual failover (#1274) + +Logging and Tooling Improvements +================================ +* Add support to drop all cluster packets (#1252) +* Improve log clarity in failover auth denial message (#1341) + +Security fixes +============== +* 
CVE-2025-27151: Check length of AOF file name in valkey-check-aof and reject + paths longer than `PATH_MAX` (#2146) + ================================================================================ Valkey 8.0.4 - Released Mon 07 July 2025 ================================================================================ diff --git a/src/version.h b/src/version.h index fd2a474ab1..e0180c6ed0 100644 --- a/src/version.h +++ b/src/version.h @@ -4,8 +4,8 @@ * similar. */ #define SERVER_NAME "valkey" #define SERVER_TITLE "Valkey" -#define VALKEY_VERSION "8.0.4" -#define VALKEY_VERSION_NUM 0x00080004 +#define VALKEY_VERSION "8.0.5" +#define VALKEY_VERSION_NUM 0x00080005 /* Redis OSS compatibility version, should never * exceed 7.2.x. */