From 00d4b4cf12410a1b58cad4372d8bcd8ceab2d970 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 12 Jun 2025 20:02:27 +0800
Subject: [PATCH 1/4] Remove the 500ms fixed delay for automatic failover for
 faster failover

An automatic failover has a fixed delay of 500ms, its main purpose is
to wait for the FAIL packet to propagate in the cluster so that other
primaries can respond to the vote request.
```
server.cluster->failover_auth_time =
now +
500 +           /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
random() % 500; /* Random delay between 0 and 500 milliseconds. */
```

A fixed delay of 500ms is a bit long in now days, we are looking into
removing it for a faster failover. If we can ensure it is safe to remove,
then we can remove it.

Currently a replica now can only learn of the death of its primary
node from two places:
1. Collecting pfail messages from other primary nodes, and then check
in markNodeAsFailingIfNeeded to see if we have the quorum.
2. Receiving FAIL messages from other primary nodes (or replica nodes).

Anyway both the failing state is triggered collecting failure reports
from primaries.

In markNodeAsFailingIfNeeded, if `nodeIsReplica(myself) && myself->replicaof == node`
is true, then we can try to trigger a failover in clusterBeforeSleep as soon as
possible without waiting for clusterCron. Moreover, we can remove 500ms, because
we broadcast a FAIL here, which means that myself's FAILOVER_AUTH will definitely
be after the FAIL, which means we don't have to wait 500ms for FAIL to propagate.
```
void markNodeAsFailingIfNeeded(clusterNode *node) {
    ...
    /* Broadcast the failing node name to everybody, forcing all the other
     * reachable nodes to flag the node as FAIL.
     * We do that even if this node is a replica and not a primary: anyway
     * the failing state is triggered collecting failure reports from primaries,
     * so here the replica is only helping propagating this status. */
    clusterSendFail(node->name);
    clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG);
```

And moreover, if we do the same when getting a FAIL, we check if failing is
myself's primary, and we call clusterSendFail to send the FAIL, this look like
we can also remove the 500ms delay. It is somehow like the first case in a very
badly way, like all the replicas got the pfail and then send the fail.
```
    } else if (type == CLUSTERMSG_TYPE_FAIL) {
        clusterNode *failing;

        if (sender) {
            failing = clusterLookupNode(hdr->data.fail.about.nodename, CLUSTER_NAMELEN);
            if (failing && !(failing->flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_MYSELF))) {
                serverLog(LL_NOTICE, "FAIL message received from %.40s (%s) about %.40s (%s)", hdr->sender,
                          sender->human_nodename, hdr->data.fail.about.nodename, failing->human_nodename);
                failing->flags |= CLUSTER_NODE_FAIL;
                failing->fail_time = now;
                failing->flags &= ~CLUSTER_NODE_PFAIL;
                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
            }
```

So in this commit, we try to remove the fixed delay of 500ms. For now, we
keep the random 500ms and use the ranking to avoid the vote confilct.

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 36 ++++++++++++++++++++++++++++++++++--
 src/cluster_legacy.h |  1 +
 2 files changed, 35 insertions(+), 2 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 22402c8046..f2a3500887 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -2211,6 +2211,12 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
      * so here the replica is only helping propagating this status. */
     clusterSendFail(node->name);
     clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG);
+
+    if (nodeIsReplica(myself) && myself->replicaof == node) {
+        /* Immediately check if the node is our primary node, if so, we can try
+         * to initiate a failover as soon as possible. */
+        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FASTER_FAILOVER);
+    }
 }
 
 /* This function is called only if a node is marked as FAIL, but we are able
@@ -3543,6 +3549,12 @@ int clusterProcessPacket(clusterLink *link) {
                     noaddr_node->flags |= CLUSTER_NODE_FAIL;
                     noaddr_node->fail_time = now;
                     clusterSendFail(noaddr_node->name);
+
+                    if (nodeIsReplica(myself) && myself->replicaof == noaddr_node) {
+                        /* Immediately check if the node is our primary node, if so, we can try
+                         * to initiate a failover as soon as possible. */
+                        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FASTER_FAILOVER);
+                    }
                 }
                 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
                 return 0;
@@ -3777,6 +3789,15 @@ int clusterProcessPacket(clusterLink *link) {
                 failing->fail_time = now;
                 failing->flags &= ~CLUSTER_NODE_PFAIL;
                 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
+
+                if (nodeIsReplica(myself) && myself->replicaof == failing) {
+                    /* Immediately check if the node is our primary node, if so, we can try
+                     * to initiate a failover as soon as possible. We will also try to send
+                     * the FAIL packet in case we trigger the failover immediately but unable
+                     * to get the votes. */
+                    clusterSendFail(failing->name);
+                    clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FASTER_FAILOVER);
+                }
             }
         } else {
             serverLog(LL_NOTICE, "Ignoring FAIL message from unknown node %.40s about %.40s", hdr->sender,
@@ -4963,7 +4984,6 @@ void clusterHandleReplicaFailover(void) {
      * elapsed, we can setup a new one. */
     if (auth_age > auth_retry_time) {
         server.cluster->failover_auth_time = now +
-                                             500 +           /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
                                              random() % 500; /* Random delay between 0 and 500 milliseconds. */
         server.cluster->failover_auth_count = 0;
         server.cluster->failover_auth_sent = 0;
@@ -4984,6 +5004,11 @@ void clusterHandleReplicaFailover(void) {
             /* Reset auth_age since it is outdated now and we can bypass the auth_timeout
              * check in the next state and start the election ASAP. */
             auth_age = 0;
+        } else if (server.cluster->failover_auth_time == now) {
+            /* If we happen to initiate a failover immediately, reset auth_age since it is
+             * outdated now and we can bypass the auth_timeout check in the next state and
+             * start the election ASAP. */
+            auth_age = 0;
         }
         serverLog(LL_NOTICE,
                   "Start of election delayed for %lld milliseconds "
@@ -5573,7 +5598,8 @@ void clusterBeforeSleep(void) {
         }
     } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) {
         /* Handle failover, this is needed when it is likely that there is already
-         * the quorum from primaries in order to react fast. */
+         * the quorum from primaries in order to react fast. Or when we determine
+         * that we can proceed with the failover. */
         clusterHandleReplicaFailover();
     }
 
@@ -5592,6 +5618,12 @@ void clusterBeforeSleep(void) {
          * regular ping. */
         clusterBroadcastPong(CLUSTER_BROADCAST_ALL);
     }
+
+    if (flags & CLUSTER_TODO_HANDLE_FASTER_FAILOVER) {
+        /* Handle faster failover, this goes after all the todos to check if we
+         * can initiate a fast failover. */
+        clusterHandleReplicaFailover();
+    }
 }
 
 void clusterDoBeforeSleep(int flags) {
diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
index 09accca26a..a3edc5ed35 100644
--- a/src/cluster_legacy.h
+++ b/src/cluster_legacy.h
@@ -25,6 +25,7 @@
 #define CLUSTER_TODO_FSYNC_CONFIG (1 << 3)
 #define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1 << 4)
 #define CLUSTER_TODO_BROADCAST_ALL (1 << 5)
+#define CLUSTER_TODO_HANDLE_FASTER_FAILOVER (1 << 6)
 
 /* clusterLink encapsulates everything needed to talk with a remote node. */
 typedef struct clusterLink {

From 01a1cf3687e3b4c8b03f0b9eea12ce0efb77dae7 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Wed, 25 Jun 2025 12:32:54 +0800
Subject: [PATCH 2/4] Revert "Remove the 500ms fixed delay for automatic
 failover for faster failover"

This reverts commit 00d4b4cf12410a1b58cad4372d8bcd8ceab2d970.
---
 src/cluster_legacy.c | 36 ++----------------------------------
 src/cluster_legacy.h |  1 -
 2 files changed, 2 insertions(+), 35 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index f2a3500887..22402c8046 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -2211,12 +2211,6 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
      * so here the replica is only helping propagating this status. */
     clusterSendFail(node->name);
     clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG);
-
-    if (nodeIsReplica(myself) && myself->replicaof == node) {
-        /* Immediately check if the node is our primary node, if so, we can try
-         * to initiate a failover as soon as possible. */
-        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FASTER_FAILOVER);
-    }
 }
 
 /* This function is called only if a node is marked as FAIL, but we are able
@@ -3549,12 +3543,6 @@ int clusterProcessPacket(clusterLink *link) {
                     noaddr_node->flags |= CLUSTER_NODE_FAIL;
                     noaddr_node->fail_time = now;
                     clusterSendFail(noaddr_node->name);
-
-                    if (nodeIsReplica(myself) && myself->replicaof == noaddr_node) {
-                        /* Immediately check if the node is our primary node, if so, we can try
-                         * to initiate a failover as soon as possible. */
-                        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FASTER_FAILOVER);
-                    }
                 }
                 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
                 return 0;
@@ -3789,15 +3777,6 @@ int clusterProcessPacket(clusterLink *link) {
                 failing->fail_time = now;
                 failing->flags &= ~CLUSTER_NODE_PFAIL;
                 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
-
-                if (nodeIsReplica(myself) && myself->replicaof == failing) {
-                    /* Immediately check if the node is our primary node, if so, we can try
-                     * to initiate a failover as soon as possible. We will also try to send
-                     * the FAIL packet in case we trigger the failover immediately but unable
-                     * to get the votes. */
-                    clusterSendFail(failing->name);
-                    clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FASTER_FAILOVER);
-                }
             }
         } else {
             serverLog(LL_NOTICE, "Ignoring FAIL message from unknown node %.40s about %.40s", hdr->sender,
@@ -4984,6 +4963,7 @@ void clusterHandleReplicaFailover(void) {
      * elapsed, we can setup a new one. */
     if (auth_age > auth_retry_time) {
         server.cluster->failover_auth_time = now +
+                                             500 +           /* Fixed delay of 500 milliseconds, let FAIL msg propagate. */
                                              random() % 500; /* Random delay between 0 and 500 milliseconds. */
         server.cluster->failover_auth_count = 0;
         server.cluster->failover_auth_sent = 0;
@@ -5004,11 +4984,6 @@ void clusterHandleReplicaFailover(void) {
             /* Reset auth_age since it is outdated now and we can bypass the auth_timeout
              * check in the next state and start the election ASAP. */
             auth_age = 0;
-        } else if (server.cluster->failover_auth_time == now) {
-            /* If we happen to initiate a failover immediately, reset auth_age since it is
-             * outdated now and we can bypass the auth_timeout check in the next state and
-             * start the election ASAP. */
-            auth_age = 0;
         }
         serverLog(LL_NOTICE,
                   "Start of election delayed for %lld milliseconds "
@@ -5598,8 +5573,7 @@ void clusterBeforeSleep(void) {
         }
     } else if (flags & CLUSTER_TODO_HANDLE_FAILOVER) {
         /* Handle failover, this is needed when it is likely that there is already
-         * the quorum from primaries in order to react fast. Or when we determine
-         * that we can proceed with the failover. */
+         * the quorum from primaries in order to react fast. */
         clusterHandleReplicaFailover();
     }
 
@@ -5618,12 +5592,6 @@ void clusterBeforeSleep(void) {
          * regular ping. */
         clusterBroadcastPong(CLUSTER_BROADCAST_ALL);
     }
-
-    if (flags & CLUSTER_TODO_HANDLE_FASTER_FAILOVER) {
-        /* Handle faster failover, this goes after all the todos to check if we
-         * can initiate a fast failover. */
-        clusterHandleReplicaFailover();
-    }
 }
 
 void clusterDoBeforeSleep(int flags) {
diff --git a/src/cluster_legacy.h b/src/cluster_legacy.h
index a3edc5ed35..09accca26a 100644
--- a/src/cluster_legacy.h
+++ b/src/cluster_legacy.h
@@ -25,7 +25,6 @@
 #define CLUSTER_TODO_FSYNC_CONFIG (1 << 3)
 #define CLUSTER_TODO_HANDLE_MANUALFAILOVER (1 << 4)
 #define CLUSTER_TODO_BROADCAST_ALL (1 << 5)
-#define CLUSTER_TODO_HANDLE_FASTER_FAILOVER (1 << 6)
 
 /* clusterLink encapsulates everything needed to talk with a remote node. */
 typedef struct clusterLink {

From 2f076938048a256e46a6db1f37e4cc75e4df6a1c Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Fri, 27 Jun 2025 11:38:50 +0800
Subject: [PATCH 3/4] Trigger the election as soon as possible when myself
 marked primary as FAIL

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 41 +++++++++++++++++++++++++++--------------
 1 file changed, 27 insertions(+), 14 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index bf684655ad..9e9e44c493 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -2176,6 +2176,25 @@ int clusterBlacklistExists(char *nodeid, size_t len) {
  * CLUSTER messages exchange - PING/PONG and gossip
  * -------------------------------------------------------------------------- */
 
+/* Marks a node as FAIL. Apart from clusterLoadConfig, this is the only way we mark a
+ * node as FAIL during runtime. */
+void markNodeAsFailing(clusterNode *node) {
+    /* Mark the node as FAIL. */
+    node->flags |= CLUSTER_NODE_FAIL;
+    node->fail_time = mstime();
+    /* Remove the PFAIL flag. */
+    node->flags &= ~CLUSTER_NODE_PFAIL;
+
+    /* Immediately check if the failing node is our primary node. */
+    if (nodeIsReplica(myself) && myself->replicaof == node) {
+        /* We can start an automatic failover as soon as possible, setting a flag
+         * here so that we don't need to waiting for the cron to kick in. */
+        clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
+    }
+
+    clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
+}
+
 /* This function checks if a given node should be marked as FAIL.
  * It happens if the following conditions are met:
  *
@@ -2212,9 +2231,7 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
     serverLog(LL_NOTICE, "Marking node %.40s (%s) as failing (quorum reached).", node->name, node->human_nodename);
 
     /* Mark the node as failing. */
-    node->flags &= ~CLUSTER_NODE_PFAIL;
-    node->flags |= CLUSTER_NODE_FAIL;
-    node->fail_time = mstime();
+    markNodeAsFailing(node);
 
     /* Broadcast the failing node name to everybody, forcing all the other
      * reachable nodes to flag the node as FAIL.
@@ -2222,7 +2239,6 @@ void markNodeAsFailingIfNeeded(clusterNode *node) {
      * the failing state is triggered collecting failure reports from primaries,
      * so here the replica is only helping propagating this status. */
     clusterSendFail(node->name);
-    clusterDoBeforeSleep(CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_SAVE_CONFIG);
 }
 
 /* This function is called only if a node is marked as FAIL, but we are able
@@ -3551,9 +3567,7 @@ int clusterProcessPacket(clusterLink *link) {
                  * the clients, and the replica will never initiate a failover since the
                  * node is not actually in FAIL state. */
                 if (!nodeFailed(noaddr_node)) {
-                    noaddr_node->flags &= ~CLUSTER_NODE_PFAIL;
-                    noaddr_node->flags |= CLUSTER_NODE_FAIL;
-                    noaddr_node->fail_time = now;
+                    markNodeAsFailing(noaddr_node);
                     clusterSendFail(noaddr_node->name);
                 }
                 clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
@@ -3785,10 +3799,7 @@ int clusterProcessPacket(clusterLink *link) {
             if (failing && !(failing->flags & (CLUSTER_NODE_FAIL | CLUSTER_NODE_MYSELF))) {
                 serverLog(LL_NOTICE, "FAIL message received from %.40s (%s) about %.40s (%s)", hdr->sender,
                           sender->human_nodename, hdr->data.fail.about.nodename, failing->human_nodename);
-                failing->flags |= CLUSTER_NODE_FAIL;
-                failing->fail_time = now;
-                failing->flags &= ~CLUSTER_NODE_PFAIL;
-                clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE);
+                markNodeAsFailing(failing);
             }
         } else {
             serverLog(LL_NOTICE, "Ignoring FAIL message from unknown node %.40s about %.40s", hdr->sender,
@@ -5578,6 +5589,11 @@ void clusterBeforeSleep(void) {
      * called for flags set should be able to clear its flag). */
     server.cluster->todo_before_sleep = 0;
 
+    /* Update the cluster state. We handle this flag first so that if we happen
+     * to also has have failover flag, we can check the state first (and log the
+     * state) before attempting the failover. */
+    if (flags & CLUSTER_TODO_UPDATE_STATE) clusterUpdateState();
+
     if (flags & CLUSTER_TODO_HANDLE_MANUALFAILOVER) {
         /* Handle manual failover as soon as possible so that won't have a 100ms
          * as it was handled only in clusterCron */
@@ -5596,9 +5612,6 @@ void clusterBeforeSleep(void) {
         }
     }
 
-    /* Update the cluster state. */
-    if (flags & CLUSTER_TODO_UPDATE_STATE) clusterUpdateState();
-
     /* Save the config, possibly using fsync. */
     if (flags & CLUSTER_TODO_SAVE_CONFIG) {
         int fsync = flags & CLUSTER_TODO_FSYNC_CONFIG;

From e98cd2d7ecffb6cffe496037e3ce88875a6e3ec9 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Fri, 27 Jun 2025 23:38:05 +0800
Subject: [PATCH 4/4] Update src/cluster_legacy.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 9e9e44c493..81af0b9670 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -5590,7 +5590,7 @@ void clusterBeforeSleep(void) {
     server.cluster->todo_before_sleep = 0;
 
     /* Update the cluster state. We handle this flag first so that if we happen
-     * to also has have failover flag, we can check the state first (and log the
+     * to also have a failover flag, we can check the state first (and log the
      * state) before attempting the failover. */
     if (flags & CLUSTER_TODO_UPDATE_STATE) clusterUpdateState();