Fix flaky test in failover2

Seungmin Lee · Seungmin Lee · commit a98062a9164e · 2025-02-05T04:57:59.000-08:00
Signed-off-by: Seungmin Lee <sungming@amazon.com>
diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
@@ -3260,17 +3260,18 @@ int clusterProcessPacket(clusterLink *link) {
         sender_claimed_config_epoch = ntohu64(hdr->configEpoch);
         if (sender_claimed_current_epoch > server.cluster->currentEpoch)
             server.cluster->currentEpoch = sender_claimed_current_epoch;
-        /* Update the sender configEpoch if it is a primary publishing a newer one. */
+
         if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) {
+            /* Update the sender configEpoch if it is a primary publishing a newer one. */
             sender->configEpoch = sender_claimed_config_epoch;
             clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
 
+            /* Another node has claimed an epoch greater than or equal to ours.
+             * If we have an ongoing election, reset it because we cannot win
+             * with an epoch smaller than or equal to the incoming claim. This
+             * allows us to start a new election as soon as possible. */
             if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent &&
                 sender->configEpoch >= server.cluster->failover_auth_epoch) {
-                /* Another node has claimed an epoch greater than or equal to ours.
-                 * If we have an ongoing election, reset it because we cannot win
-                 * with an epoch smaller than or equal to the incoming claim. This
-                 * allows us to start a new election as soon as possible. */
                 server.cluster->failover_auth_time = 0;
                 serverLog(LL_WARNING,
                           "Failover election in progress for epoch %llu, but received a claim from "
@@ -4672,12 +4673,15 @@ int clusterGetFailedPrimaryRank(void) {
         clusterNode *node = dictGetVal(de);
 
         /* Skip nodes that do not need to participate in the rank. */
-        if (!nodeFailed(node) || !clusterNodeIsVotingPrimary(node) || node->num_replicas == 0) continue;
+        if (!(nodeFailed(node) || nodeTimedOut(node)) || !clusterNodeIsVotingPrimary(node) || node->num_replicas == 0) continue;
 
         /* If cluster-replica-validity-factor is enabled, skip the invalid nodes. */
-        if (server.cluster_replica_validity_factor) {
-            if ((now - node->fail_time) > (server.cluster_node_timeout * server.cluster_replica_validity_factor))
+        if (nodeFailed(node) && server.cluster_replica_validity_factor) {
+            if ((now - node->fail_time) > (server.cluster_node_timeout * server.cluster_replica_validity_factor)) {
+                serverLog(LL_DEBUG, "Skip failed primary rank since validity factor is enabled. node failed time: %llu",
+                          (unsigned long long)node->fail_time);
                 continue;
+            }
         }
 
         if (memcmp(node->shard_id, myself->shard_id, CLUSTER_NAMELEN) < 0) rank++;
@@ -4783,6 +4787,13 @@ void clusterFailoverReplaceYourPrimary(void) {
 
     /* 5) If there was a manual failover in progress, clear the state. */
     resetManualFailover();
+
+    /* 6) Since we have changed to a new primary node, the previously set
+     * failover_auth_time and failover_auth_sent should no longer be used */
+    if (server.cluster->failover_auth_time) {
+        server.cluster->failover_auth_time = 0;
+        server.cluster->failover_auth_sent = 0;
+    }
 }
 
 /* This function is called if we are a replica node and our primary serving
@@ -4863,6 +4874,7 @@ void clusterHandleReplicaFailover(void) {
                                              random() % 500; /* Random delay between 0 and 500 milliseconds. */
         server.cluster->failover_auth_count = 0;
         server.cluster->failover_auth_sent = 0;
+        server.cluster->failover_failed_primary_rank = 0;
         server.cluster->failover_auth_rank = clusterGetReplicaRank();
         /* We add another delay that is proportional to the replica rank.
          * Specifically 1 second * rank. This way replicas that have a probably
@@ -4872,7 +4884,7 @@ void clusterHandleReplicaFailover(void) {
          * Specifically 0.5 second * rank. This way those failed primaries will be
          * elected in rank to avoid the vote conflicts. */
         server.cluster->failover_failed_primary_rank = clusterGetFailedPrimaryRank();
-        server.cluster->failover_auth_time += server.cluster->failover_failed_primary_rank * 500;
+        server.cluster->failover_auth_time += server.cluster->failover_failed_primary_rank * (1000 + random() % 100);
         /* However if this is a manual failover, no delay is needed. */
         if (server.cluster->mf_end) {
             server.cluster->failover_auth_time = now;
@@ -4916,8 +4928,8 @@ void clusterHandleReplicaFailover(void) {
         }
 
         int new_failed_primary_rank = clusterGetFailedPrimaryRank();
-        if (new_failed_primary_rank != server.cluster->failover_failed_primary_rank) {
-            long long added_delay = (new_failed_primary_rank - server.cluster->failover_failed_primary_rank) * 500;
+        if (new_failed_primary_rank > server.cluster->failover_failed_primary_rank) {
+            long long added_delay = (new_failed_primary_rank - server.cluster->failover_failed_primary_rank) * (1000 + random() % 100);
             server.cluster->failover_auth_time += added_delay;
             server.cluster->failover_failed_primary_rank = new_failed_primary_rank;
             serverLog(LL_NOTICE, "Failed primary rank updated to #%d, added %lld milliseconds of delay.",
@@ -4941,8 +4953,8 @@ void clusterHandleReplicaFailover(void) {
     if (server.cluster->failover_auth_sent == 0) {
         server.cluster->currentEpoch++;
         server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
-        serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu",
-                  (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
+        serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu, failover primary rank is %llu",
+                  (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself), (unsigned long long)server.cluster->failover_failed_primary_rank);
         clusterRequestFailoverAuth();
         server.cluster->failover_auth_sent = 1;
         clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);