From a0cb0bc2ea54597aa1a8721aead9efa5c1a2f3c4 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Thu, 5 Jun 2025 20:29:13 +0800
Subject: [PATCH 1/3] Fix replica can to finish failover when config epoch is
 outdated
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the primary changes the config epoch and then down immediately,
the replica may not update the config epoch in time. Although we will
broadcast the change in cluster (see #1813), there may be a race in
the network or in the code. In this case, the replica will never finish
the failover since other primaries will refuse to vote because the
replica's slot config epoch is old.

We need a way to allow the replica can finish the failover in this case.

When the primary refuses to vote because the replica's config epoch is
less than the dead primary's config epoch, it can send an UPDATE packet
to the replica to inform the replica about the dead primary. The UPDATE
message contains information about the dead primary's config epoch and
owned slots. The failover will time out, but later the replica can try
again with the updated config epoch and it can succeed.

Fixes #2169.

Signed-off-by: Binbin <binloveplay1314@qq.com>
Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
---
 src/cluster_legacy.c             | 17 +++++-
 tests/support/cluster_util.tcl   |  6 ++
 tests/unit/cluster/failover2.tcl | 97 ++++++++++++++++++++++++++++++++
 3 files changed, 117 insertions(+), 3 deletions(-)

diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 87a58965cb..1b940ef26b 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -3734,9 +3734,10 @@ int clusterProcessPacket(clusterLink *link) {
                     if (server.cluster->slots[j] == sender || isSlotUnclaimed(j)) continue;
                     if (server.cluster->slots[j]->configEpoch > sender_claimed_config_epoch) {
                         serverLog(LL_VERBOSE,
-                                  "Node %.40s has old slots configuration, sending "
-                                  "an UPDATE message about %.40s",
-                                  sender->name, server.cluster->slots[j]->name);
+                                  "Node %.40s (%s) has old slots configuration, sending "
+                                  "an UPDATE message about %.40s (%s)",
+                                  sender->name, sender->human_nodename,
+                                  server.cluster->slots[j]->name, server.cluster->slots[j]->human_nodename);
                         clusterSendUpdate(sender->link, server.cluster->slots[j]);
 
                         /* TODO: instead of exiting the loop send every other
@@ -4682,6 +4683,16 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
                   "slot %d epoch (%llu) > reqConfigEpoch (%llu)",
                   node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch,
                   (unsigned long long)requestConfigEpoch);
+
+        /* Send an UPDATE message to the replica. After receiving the UPDATE message,
+         * the replica will update the slots config so that it can initiate a failover
+         * again later. Otherwise the replica will never get votes if the primary is down. */
+        serverLog(LL_VERBOSE,
+                  "Node %.40s (%s) has old slots configuration, sending "
+                  "an UPDATE message about %.40s (%s)",
+                  node->name, node->human_nodename,
+                  server.cluster->slots[j]->name, server.cluster->slots[j]->human_nodename);
+        clusterSendUpdate(node->link, server.cluster->slots[j]);
         return;
     }
 
diff --git a/tests/support/cluster_util.tcl b/tests/support/cluster_util.tcl
index 03c35f91fc..960259e4fd 100644
--- a/tests/support/cluster_util.tcl
+++ b/tests/support/cluster_util.tcl
@@ -210,6 +210,12 @@ proc cluster_allocate_replicas {masters replicas} {
 # Setup method to be executed to configure the cluster before the
 # tests run.
 proc cluster_setup {masters replicas node_count slot_allocator replica_allocator code} {
+    set config_epoch 1
+    for {set i 0} {$i < $node_count} {incr i} {
+        R $i CLUSTER SET-CONFIG-EPOCH $config_epoch
+        incr config_epoch
+    }
+
     # Have all nodes meet
     if {$::tls} {
         set tls_cluster [lindex [R 0 CONFIG GET tls-cluster] 1]
diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
index 2272a150ee..5bbfa4952a 100644
--- a/tests/unit/cluster/failover2.tcl
+++ b/tests/unit/cluster/failover2.tcl
@@ -133,3 +133,100 @@ run_solo {cluster} {
         }
     } ;# start_cluster
 } ;# run_solo
+
+proc test_replica_config_epoch_failover {type} {
+    test "Replica can update the config epoch when trigger the failover - $type" {
+        set CLUSTER_PACKET_TYPE_NONE -1
+        set CLUSTER_PACKET_TYPE_ALL -2
+
+        if {$type == "automatic"} {
+            R 3 CONFIG SET cluster-replica-no-failover no
+        } elseif {$type == "manual"} {
+            R 3 CONFIG SET cluster-replica-no-failover yes
+        }
+        R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL
+
+        set R0_nodeid [R 0 cluster myid]
+
+        # R 0 is the first node, so we expect its epoch to be the smallest,
+        # so bumpepoch must succeed and it's config epoch will be changed.
+        set res [R 0 cluster bumpepoch]
+        assert_match {BUMPED *} $res
+        set R0_config_epoch [lindex $res 1]
+
+        # Wait for the config epoch to propagate across the cluster.
+        wait_for_condition 1000 10 {
+            $R0_config_epoch == [dict get [cluster_get_node_by_id 1 $R0_nodeid] config_epoch] &&
+            $R0_config_epoch == [dict get [cluster_get_node_by_id 2 $R0_nodeid] config_epoch]
+        } else {
+            fail "Other primaries does not update config epoch"
+        }
+        # Make sure that replica do not update config epoch.
+        assert_not_equal $R0_config_epoch [dict get [cluster_get_node_by_id 3 $R0_nodeid] config_epoch]
+
+        # Pause the R 0 and wait for the cluster to be down.
+        pause_process [srv 0 pid]
+        R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_state] == "fail" &&
+            [CI 2 cluster_state] == "fail" &&
+            [CI 3 cluster_state] == "fail"
+        } else {
+            fail "Cluster does not fail"
+        }
+
+        # Make sure both the automatic and the manual failover will fail in the first time.
+        if {$type == "automatic"} {
+            wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1000 10
+        } elseif {$type == "manual"} {
+            R 3 cluster failover force
+            wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 10
+        }
+
+        # Make sure the primaries prints the relevant logs.
+        wait_for_log_messages -1 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10
+        wait_for_log_messages -1 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10
+        wait_for_log_messages -2 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10
+        wait_for_log_messages -2 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10
+
+        # Make sure the replica has updated the config epoch.
+        wait_for_condition 1000 10 {
+            $R0_config_epoch == [dict get [cluster_get_node_by_id 1 $R0_nodeid] config_epoch]
+        } else {
+            fail "The replica does not update the config epoch"
+        }
+
+        if {$type == "manual"} {
+            # The second manual failure will succeed because the config epoch
+            # has already propagated.
+            R 3 cluster failover force
+        }
+
+        # Wait for the failover to success.
+        wait_for_condition 1000 50 {
+            [s -3 role] == "master" &&
+            [CI 1 cluster_state] == "ok" &&
+            [CI 2 cluster_state] == "ok" &&
+            [CI 3 cluster_state] == "ok"
+        } else {
+            fail "Failover does not happen"
+        }
+
+        # Restore the old primary, make sure it can covert
+        resume_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s 0 role] == "slave" &&
+            [CI 0 cluster_state] == "ok"
+        } else {
+            fail "The old primary was not converted into replica"
+        }
+    }
+} ;# start_cluster
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} {
+    test_replica_config_epoch_failover "automatic"
+}
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} {
+    test_replica_config_epoch_failover "manual"
+}

From c7a5f7462beefa74d9b92d95acb68bb0d1305665 Mon Sep 17 00:00:00 2001
From: Binbin <binloveplay1314@qq.com>
Date: Sat, 7 Jun 2025 14:26:29 +0800
Subject: [PATCH 2/3] start_cluster comment around the proc

Signed-off-by: Binbin <binloveplay1314@qq.com>
---
 tests/unit/cluster/failover2.tcl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
index 5bbfa4952a..0d0d8c705e 100644
--- a/tests/unit/cluster/failover2.tcl
+++ b/tests/unit/cluster/failover2.tcl
@@ -134,6 +134,7 @@ run_solo {cluster} {
     } ;# start_cluster
 } ;# run_solo
 
+# start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}}
 proc test_replica_config_epoch_failover {type} {
     test "Replica can update the config epoch when trigger the failover - $type" {
         set CLUSTER_PACKET_TYPE_NONE -1

From d5bedc24597ca1d7057eae1417c7a85789cbe8cd Mon Sep 17 00:00:00 2001
From: Harkrishn Patro <bunty.hari@gmail.com>
Date: Mon, 9 Jun 2025 14:34:26 -0700
Subject: [PATCH 3/3] Apply suggestions from code review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Co-authored-by: Viktor Söderqvist <viktor.soderqvist@est.tech>
Co-authored-by: Madelyn Olson <madelyneolson@gmail.com>
Signed-off-by: Harkrishn Patro <bunty.hari@gmail.com>
---
 tests/unit/cluster/failover2.tcl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl
index 0d0d8c705e..84db475929 100644
--- a/tests/unit/cluster/failover2.tcl
+++ b/tests/unit/cluster/failover2.tcl
@@ -134,6 +134,7 @@ run_solo {cluster} {
     } ;# start_cluster
 } ;# run_solo
 
+# Needs to run in the body of
 # start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}}
 proc test_replica_config_epoch_failover {type} {
     test "Replica can update the config epoch when trigger the failover - $type" {
@@ -222,7 +223,7 @@ proc test_replica_config_epoch_failover {type} {
             fail "The old primary was not converted into replica"
         }
     }
-} ;# start_cluster
+}
 
 start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} {
     test_replica_config_epoch_failover "automatic"