From a0541b6b6d93e4bc12dabb75733ecb0722f8e1b0 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Tue, 10 Sep 2024 16:06:01 +0800
Subject: [PATCH 1/7] Optimize cluster election when nodes initiate elections
 at the same epoch

If multiple primary nodes go down at the same time, their replica nodes will
initiate the elections at the same time. There is a certain probability that
the replicas will initate the elections in the same epoch.

And obviously, in our current election mechanism, only one replica node can
eventually get the enough votes, and the other replica node will fail to win
due the the insufficient majority, and then its election will time out and
we will wait for the retry, which result in a long failure time.

If another node has been won the election in the failover epoch, we can assume
that my election has failed and we can retry as soom as possible.

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c             | 11 +++++++++++
 tests/unit/cluster/failover2.tcl | 33 ++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index f3925f5695..6610a3b7a1 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3113,6 +3113,17 @@ int clusterProcessPacket(clusterLink *link) {
         if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) {
             sender->configEpoch = sender_claimed_config_epoch;
             clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
+
+            if (server.cluster->failover_auth_time && sender->configEpoch == server.cluster->failover_auth_epoch) {
+                /* There are another node has claimed it in this epoch, if we have any ongoing
+                 * election, we can reset it since there won't be enough votes and we can start
+                 * a new one ASAP. */
+                server.cluster->failover_auth_time = 0;
+                serverLog(LL_WARNING, "I have a failover election for epoch %lld in progress and "
+                                      "received node %.40s (%s) claiming this epoch, resetting the election.",
+                                      sender->configEpoch, sender->name, sender->human_nodename);
+                clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
+            }
         }
         /* Update the replication offset info for this node. */
         sender->repl_offset = ntohu64(hdr->offset);
diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
index 7bc6a05e95..21c4f4a678 100644
--- a/tests/unit/cluster/failover2.tcl
+++ b/tests/unit/cluster/failover2.tcl
@@ -64,3 +64,36 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval
     }
 
 } ;# start_cluster
+
+
+start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
+    test "Primaries will not time out then they are elected in the same epoch" {
+        # Since we have the delay time, so these node may not initiate the
+        # election at the same time (same epoch). But if they do, we make
+        # sure there is no failover timeout.
+
+        # Killing there primary nodes.
+        pause_process [srv 0 pid]
+        pause_process [srv -1 pid]
+        pause_process [srv -2 pid]
+
+        # Wait for the failover
+        wait_for_condition 1000 50 {
+            [s -7 role] == "master" &&
+            [s -8 role] == "master" &&
+            [s -9 role] == "master"
+        } else {
+            fail "No failover detected"
+        }
+
+        # Make sure there is no failover timeout.
+        verify_no_log_message -7 "*Failover attempt expired*" 0
+        verify_no_log_message -8 "*Failover attempt expired*" 0
+        verify_no_log_message -9 "*Failover attempt expired*" 0
+
+        # Resuming these primary nodes, speed up the shutdown.
+        resume_process [srv 0 pid]
+        resume_process [srv -1 pid]
+        resume_process [srv -2 pid]
+    }
+} ;# start_cluster

From cea92678cf7eb3bf5846e74743cb16d210ed222f Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Tue, 10 Sep 2024 16:37:51 +0800
Subject: [PATCH 2/7] fix build and fix format

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 6610a3b7a1..8711ca5ce8 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3119,9 +3119,10 @@ int clusterProcessPacket(clusterLink *link) {
                  * election, we can reset it since there won't be enough votes and we can start
                  * a new one ASAP. */
                 server.cluster->failover_auth_time = 0;
-                serverLog(LL_WARNING, "I have a failover election for epoch %lld in progress and "
-                                      "received node %.40s (%s) claiming this epoch, resetting the election.",
-                                      sender->configEpoch, sender->name, sender->human_nodename);
+                serverLog(LL_WARNING,
+                          "I have a failover election for epoch %llu in progress and "
+                          "received node %.40s (%s) claiming this epoch, resetting the election.",
+                          (unsigned long long)sender->configEpoch, sender->name, sender->human_nodename);
                 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
             }
         }

From 41034bdd56f5a12126c0c1bab6a191aaaf23b1f7 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Fri, 13 Sep 2024 15:08:40 +0800
Subject: [PATCH 3/7] Apply suggestions from code review

Co-authored-by: Ping Xie <pingxie@outlook.com>
Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 8711ca5ce8..7e98046014 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3114,15 +3114,15 @@ int clusterProcessPacket(clusterLink *link) {
             sender->configEpoch = sender_claimed_config_epoch;
             clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
 
-            if (server.cluster->failover_auth_time && sender->configEpoch == server.cluster->failover_auth_epoch) {
-                /* There are another node has claimed it in this epoch, if we have any ongoing
-                 * election, we can reset it since there won't be enough votes and we can start
-                 * a new one ASAP. */
+            if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) {
+                /* Another node has claimed an epoch greater than or equal to ours. If we have an ongoing election, 
+                 * reset it because we cannot win with an epoch smaller than or equal to the incoming claim. 
+                 * This allows us to start a new election as soon as possible. */
                 server.cluster->failover_auth_time = 0;
                 serverLog(LL_WARNING,
-                          "I have a failover election for epoch %llu in progress and "
-                          "received node %.40s (%s) claiming this epoch, resetting the election.",
-                          (unsigned long long)sender->configEpoch, sender->name, sender->human_nodename);
+                          "Failover election in progress for epoch %llu, but received a claim from "
+                          "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election since we cannot win.",
+                          (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch);
                 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
             }
         }

From 0270e716d7f2975011b34c21d4a0254ec4095f56 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Fri, 13 Sep 2024 15:13:22 +0800
Subject: [PATCH 4/7] fix format

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 7e98046014..36b45d4caa 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3115,14 +3115,16 @@ int clusterProcessPacket(clusterLink *link) {
             clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
 
             if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) {
-                /* Another node has claimed an epoch greater than or equal to ours. If we have an ongoing election, 
-                 * reset it because we cannot win with an epoch smaller than or equal to the incoming claim. 
-                 * This allows us to start a new election as soon as possible. */
+                /* Another node has claimed an epoch greater than or equal to ours.
+                 * If we have an ongoing election,  reset it because we cannot win
+                 * with an epoch smaller than or equal to the incoming claim. This
+                 * allows us to start a new election as soon as possible. */
                 server.cluster->failover_auth_time = 0;
                 serverLog(LL_WARNING,
-                          "Failover election in progress for epoch %llu, but received a claim from "
-                          "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election since we cannot win.",
-                          (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch);
+                          "Failover election in progress for epoch %llu, but received a claim from node %.40s (%s) "
+                          "with an equal or higher epoch %llu. Resetting the election since we cannot win.",
+                          (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename,
+                          (unsigned long long)sender->configEpoch);
                 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
             }
         }

From ce0e75e1bc5afaa84e308859842f4c1dbe377f71 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Tue, 24 Sep 2024 11:09:43 +0800
Subject: [PATCH 5/7] update comment

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index a5acd39236..0aa031b090 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3138,7 +3138,7 @@ int clusterProcessPacket(clusterLink *link) {
 
             if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) {
                 /* Another node has claimed an epoch greater than or equal to ours.
-                 * If we have an ongoing election,  reset it because we cannot win
+                 * If we have an ongoing election, reset it because we cannot win
                  * with an epoch smaller than or equal to the incoming claim. This
                  * allows us to start a new election as soon as possible. */
                 server.cluster->failover_auth_time = 0;
@@ -3147,6 +3147,8 @@ int clusterProcessPacket(clusterLink *link) {
                           "with an equal or higher epoch %llu. Resetting the election since we cannot win.",
                           (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename,
                           (unsigned long long)sender->configEpoch);
+                /* Maybe we could start a new election, set a flag here to make sure
+                 * we check as soon as possible, instead of waiting for a cron. */
                 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
             }
         }

From 7ce57f3b34bd9bb58747693c805c5594a18ddd7b Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Fri, 8 Nov 2024 14:06:17 +0800
Subject: [PATCH 6/7] Update the comment

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 0a469d0ce1..1c510ee712 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3142,11 +3142,11 @@ int clusterProcessPacket(clusterLink *link) {
                  * with an epoch smaller than or equal to the incoming claim. This
                  * allows us to start a new election as soon as possible. */
                 server.cluster->failover_auth_time = 0;
-                serverLog(LL_WARNING,
-                          "Failover election in progress for epoch %llu, but received a claim from node %.40s (%s) "
-                          "with an equal or higher epoch %llu. Resetting the election since we cannot win.",
-                          (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename,
-                          (unsigned long long)sender->configEpoch);
+                serverLog(LL_WARNING, "Failover election in progress for epoch %llu, but received a claim from "
+                                      "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election "
+                                      "since we cannot win an election in the past.",
+                                      (unsigned long long)server.cluster->failover_auth_epoch,
+                                      sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch);
                 /* Maybe we could start a new election, set a flag here to make sure
                  * we check as soon as possible, instead of waiting for a cron. */
                 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);

From 6291ed690d528635b61bbf64541a0145ac89371c Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Sat, 9 Nov 2024 21:21:58 +0800
Subject: [PATCH 7/7] Update src/cluster_legacy.c
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 src/cluster_legacy.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 1c510ee712..ae55b062ff 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3142,11 +3142,13 @@ int clusterProcessPacket(clusterLink *link) {
                  * with an epoch smaller than or equal to the incoming claim. This
                  * allows us to start a new election as soon as possible. */
                 server.cluster->failover_auth_time = 0;
-                serverLog(LL_WARNING, "Failover election in progress for epoch %llu, but received a claim from "
-                                      "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election "
-                                      "since we cannot win an election in the past.",
-                                      (unsigned long long)server.cluster->failover_auth_epoch,
-                                      sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch);
+                serverLog(LL_WARNING,
+                          "Failover election in progress for epoch %llu, but received a claim from "
+                          "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election "
+                          "since we cannot win an election in the past.",
+                          (unsigned long long)server.cluster->failover_auth_epoch,
+                          sender->name, sender->human_nodename,
+                          (unsigned long long)sender->configEpoch);
                 /* Maybe we could start a new election, set a flag here to make sure
                  * we check as soon as possible, instead of waiting for a cron. */
                 clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);