@@ -3260,17 +3260,18 @@ int clusterProcessPacket(clusterLink *link) {
32603260 sender_claimed_config_epoch = ntohu64 (hdr -> configEpoch );
32613261 if (sender_claimed_current_epoch > server .cluster -> currentEpoch )
32623262 server .cluster -> currentEpoch = sender_claimed_current_epoch ;
3263- /* Update the sender configEpoch if it is a primary publishing a newer one. */
3263+
32643264 if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender -> configEpoch ) {
3265+ /* Update the sender configEpoch if it is a primary publishing a newer one. */
32653266 sender -> configEpoch = sender_claimed_config_epoch ;
32663267 clusterDoBeforeSleep (CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG );
32673268
3269+ /* Another node has claimed an epoch greater than or equal to ours.
3270+ * If we have an ongoing election, reset it because we cannot win
3271+ * with an epoch smaller than or equal to the incoming claim. This
3272+ * allows us to start a new election as soon as possible. */
32683273 if (server .cluster -> failover_auth_time && server .cluster -> failover_auth_sent &&
32693274 sender -> configEpoch >= server .cluster -> failover_auth_epoch ) {
3270- /* Another node has claimed an epoch greater than or equal to ours.
3271- * If we have an ongoing election, reset it because we cannot win
3272- * with an epoch smaller than or equal to the incoming claim. This
3273- * allows us to start a new election as soon as possible. */
32743275 server .cluster -> failover_auth_time = 0 ;
32753276 serverLog (LL_WARNING ,
32763277 "Failover election in progress for epoch %llu, but received a claim from "
@@ -4672,12 +4673,15 @@ int clusterGetFailedPrimaryRank(void) {
46724673 clusterNode * node = dictGetVal (de );
46734674
46744675 /* Skip nodes that do not need to participate in the rank. */
4675- if (!nodeFailed (node ) || !clusterNodeIsVotingPrimary (node ) || node -> num_replicas == 0 ) continue ;
4676+ if (!( nodeFailed (node ) || nodeTimedOut ( node ) ) || !clusterNodeIsVotingPrimary (node ) || node -> num_replicas == 0 ) continue ;
46764677
46774678 /* If cluster-replica-validity-factor is enabled, skip the invalid nodes. */
4678- if (server .cluster_replica_validity_factor ) {
4679- if ((now - node -> fail_time ) > (server .cluster_node_timeout * server .cluster_replica_validity_factor ))
4679+ if (nodeFailed (node ) && server .cluster_replica_validity_factor ) {
4680+ if ((now - node -> fail_time ) > (server .cluster_node_timeout * server .cluster_replica_validity_factor )) {
4681+ serverLog (LL_DEBUG , "Skip failed primary rank since validity factor is enabled. node failed time: %llu" ,
4682+ (unsigned long long )node -> fail_time );
46804683 continue ;
4684+ }
46814685 }
46824686
46834687 if (memcmp (node -> shard_id , myself -> shard_id , CLUSTER_NAMELEN ) < 0 ) rank ++ ;
@@ -4783,6 +4787,13 @@ void clusterFailoverReplaceYourPrimary(void) {
47834787
47844788 /* 5) If there was a manual failover in progress, clear the state. */
47854789 resetManualFailover ();
4790+
4791+ /* 6) Since we have changed to a new primary node, the previously set
4792+ * failover_auth_time and failover_auth_sent should no longer be used */
4793+ if (server .cluster -> failover_auth_time ) {
4794+ server .cluster -> failover_auth_time = 0 ;
4795+ server .cluster -> failover_auth_sent = 0 ;
4796+ }
47864797}
47874798
47884799/* This function is called if we are a replica node and our primary serving
@@ -4863,6 +4874,7 @@ void clusterHandleReplicaFailover(void) {
48634874 random () % 500 ; /* Random delay between 0 and 500 milliseconds. */
48644875 server .cluster -> failover_auth_count = 0 ;
48654876 server .cluster -> failover_auth_sent = 0 ;
4877+ server .cluster -> failover_failed_primary_rank = 0 ;
48664878 server .cluster -> failover_auth_rank = clusterGetReplicaRank ();
48674879 /* We add another delay that is proportional to the replica rank.
48684880 * Specifically 1 second * rank. This way replicas that have a probably
@@ -4872,7 +4884,7 @@ void clusterHandleReplicaFailover(void) {
48724884 * Specifically 0.5 second * rank. This way those failed primaries will be
48734885 * elected in rank to avoid the vote conflicts. */
48744886 server .cluster -> failover_failed_primary_rank = clusterGetFailedPrimaryRank ();
4875- server .cluster -> failover_auth_time += server .cluster -> failover_failed_primary_rank * 500 ;
4887+ server .cluster -> failover_auth_time += server .cluster -> failover_failed_primary_rank * ( 1000 + random () % 100 ) ;
48764888 /* However if this is a manual failover, no delay is needed. */
48774889 if (server .cluster -> mf_end ) {
48784890 server .cluster -> failover_auth_time = now ;
@@ -4916,8 +4928,8 @@ void clusterHandleReplicaFailover(void) {
49164928 }
49174929
49184930 int new_failed_primary_rank = clusterGetFailedPrimaryRank ();
4919- if (new_failed_primary_rank != server .cluster -> failover_failed_primary_rank ) {
4920- long long added_delay = (new_failed_primary_rank - server .cluster -> failover_failed_primary_rank ) * 500 ;
4931+ if (new_failed_primary_rank > server .cluster -> failover_failed_primary_rank ) {
4932+ long long added_delay = (new_failed_primary_rank - server .cluster -> failover_failed_primary_rank ) * ( 1000 + random () % 100 ) ;
49214933 server .cluster -> failover_auth_time += added_delay ;
49224934 server .cluster -> failover_failed_primary_rank = new_failed_primary_rank ;
49234935 serverLog (LL_NOTICE , "Failed primary rank updated to #%d, added %lld milliseconds of delay." ,
@@ -4941,8 +4953,8 @@ void clusterHandleReplicaFailover(void) {
49414953 if (server .cluster -> failover_auth_sent == 0 ) {
49424954 server .cluster -> currentEpoch ++ ;
49434955 server .cluster -> failover_auth_epoch = server .cluster -> currentEpoch ;
4944- serverLog (LL_NOTICE , "Starting a failover election for epoch %llu, node config epoch is %llu" ,
4945- (unsigned long long )server .cluster -> currentEpoch , (unsigned long long )nodeEpoch (myself ));
4956+ serverLog (LL_NOTICE , "Starting a failover election for epoch %llu, node config epoch is %llu, failover primary rank is %llu " ,
4957+ (unsigned long long )server .cluster -> currentEpoch , (unsigned long long )nodeEpoch (myself ), ( unsigned long long ) server . cluster -> failover_failed_primary_rank );
49464958 clusterRequestFailoverAuth ();
49474959 server .cluster -> failover_auth_sent = 1 ;
49484960 clusterDoBeforeSleep (CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG );
0 commit comments