Skip to content

Commit a98062a

Browse files
author
Seungmin Lee
committed
Fix flaky test in failover2
Signed-off-by: Seungmin Lee <[email protected]>
1 parent 8d8ce19 commit a98062a

File tree

1 file changed

+25
-13
lines changed

1 file changed

+25
-13
lines changed

src/cluster_legacy.c

Lines changed: 25 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3260,17 +3260,18 @@ int clusterProcessPacket(clusterLink *link) {
32603260
sender_claimed_config_epoch = ntohu64(hdr->configEpoch);
32613261
if (sender_claimed_current_epoch > server.cluster->currentEpoch)
32623262
server.cluster->currentEpoch = sender_claimed_current_epoch;
3263-
/* Update the sender configEpoch if it is a primary publishing a newer one. */
3263+
32643264
if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) {
3265+
/* Update the sender configEpoch if it is a primary publishing a newer one. */
32653266
sender->configEpoch = sender_claimed_config_epoch;
32663267
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);
32673268

3269+
/* Another node has claimed an epoch greater than or equal to ours.
3270+
* If we have an ongoing election, reset it because we cannot win
3271+
* with an epoch smaller than or equal to the incoming claim. This
3272+
* allows us to start a new election as soon as possible. */
32683273
if (server.cluster->failover_auth_time && server.cluster->failover_auth_sent &&
32693274
sender->configEpoch >= server.cluster->failover_auth_epoch) {
3270-
/* Another node has claimed an epoch greater than or equal to ours.
3271-
* If we have an ongoing election, reset it because we cannot win
3272-
* with an epoch smaller than or equal to the incoming claim. This
3273-
* allows us to start a new election as soon as possible. */
32743275
server.cluster->failover_auth_time = 0;
32753276
serverLog(LL_WARNING,
32763277
"Failover election in progress for epoch %llu, but received a claim from "
@@ -4672,12 +4673,15 @@ int clusterGetFailedPrimaryRank(void) {
46724673
clusterNode *node = dictGetVal(de);
46734674

46744675
/* Skip nodes that do not need to participate in the rank. */
4675-
if (!nodeFailed(node) || !clusterNodeIsVotingPrimary(node) || node->num_replicas == 0) continue;
4676+
if (!(nodeFailed(node) || nodeTimedOut(node)) || !clusterNodeIsVotingPrimary(node) || node->num_replicas == 0) continue;
46764677

46774678
/* If cluster-replica-validity-factor is enabled, skip the invalid nodes. */
4678-
if (server.cluster_replica_validity_factor) {
4679-
if ((now - node->fail_time) > (server.cluster_node_timeout * server.cluster_replica_validity_factor))
4679+
if (nodeFailed(node) && server.cluster_replica_validity_factor) {
4680+
if ((now - node->fail_time) > (server.cluster_node_timeout * server.cluster_replica_validity_factor)) {
4681+
serverLog(LL_DEBUG, "Skip failed primary rank since validity factor is enabled. node failed time: %llu",
4682+
(unsigned long long)node->fail_time);
46804683
continue;
4684+
}
46814685
}
46824686

46834687
if (memcmp(node->shard_id, myself->shard_id, CLUSTER_NAMELEN) < 0) rank++;
@@ -4783,6 +4787,13 @@ void clusterFailoverReplaceYourPrimary(void) {
47834787

47844788
/* 5) If there was a manual failover in progress, clear the state. */
47854789
resetManualFailover();
4790+
4791+
/* 6) Since we have changed to a new primary node, the previously set
4792+
* failover_auth_time and failover_auth_sent should no longer be used */
4793+
if (server.cluster->failover_auth_time) {
4794+
server.cluster->failover_auth_time = 0;
4795+
server.cluster->failover_auth_sent = 0;
4796+
}
47864797
}
47874798

47884799
/* This function is called if we are a replica node and our primary serving
@@ -4863,6 +4874,7 @@ void clusterHandleReplicaFailover(void) {
48634874
random() % 500; /* Random delay between 0 and 500 milliseconds. */
48644875
server.cluster->failover_auth_count = 0;
48654876
server.cluster->failover_auth_sent = 0;
4877+
server.cluster->failover_failed_primary_rank = 0;
48664878
server.cluster->failover_auth_rank = clusterGetReplicaRank();
48674879
/* We add another delay that is proportional to the replica rank.
48684880
* Specifically 1 second * rank. This way replicas that have a probably
@@ -4872,7 +4884,7 @@ void clusterHandleReplicaFailover(void) {
48724884
* Specifically 0.5 second * rank. This way those failed primaries will be
48734885
* elected in rank to avoid the vote conflicts. */
48744886
server.cluster->failover_failed_primary_rank = clusterGetFailedPrimaryRank();
4875-
server.cluster->failover_auth_time += server.cluster->failover_failed_primary_rank * 500;
4887+
server.cluster->failover_auth_time += server.cluster->failover_failed_primary_rank * (1000 + random() % 100);
48764888
/* However if this is a manual failover, no delay is needed. */
48774889
if (server.cluster->mf_end) {
48784890
server.cluster->failover_auth_time = now;
@@ -4916,8 +4928,8 @@ void clusterHandleReplicaFailover(void) {
49164928
}
49174929

49184930
int new_failed_primary_rank = clusterGetFailedPrimaryRank();
4919-
if (new_failed_primary_rank != server.cluster->failover_failed_primary_rank) {
4920-
long long added_delay = (new_failed_primary_rank - server.cluster->failover_failed_primary_rank) * 500;
4931+
if (new_failed_primary_rank > server.cluster->failover_failed_primary_rank) {
4932+
long long added_delay = (new_failed_primary_rank - server.cluster->failover_failed_primary_rank) * (1000 + random() % 100);
49214933
server.cluster->failover_auth_time += added_delay;
49224934
server.cluster->failover_failed_primary_rank = new_failed_primary_rank;
49234935
serverLog(LL_NOTICE, "Failed primary rank updated to #%d, added %lld milliseconds of delay.",
@@ -4941,8 +4953,8 @@ void clusterHandleReplicaFailover(void) {
49414953
if (server.cluster->failover_auth_sent == 0) {
49424954
server.cluster->currentEpoch++;
49434955
server.cluster->failover_auth_epoch = server.cluster->currentEpoch;
4944-
serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu",
4945-
(unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
4956+
serverLog(LL_NOTICE, "Starting a failover election for epoch %llu, node config epoch is %llu, failover primary rank is %llu",
4957+
(unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself), (unsigned long long)server.cluster->failover_failed_primary_rank);
49464958
clusterRequestFailoverAuth();
49474959
server.cluster->failover_auth_sent = 1;
49484960
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_UPDATE_STATE | CLUSTER_TODO_FSYNC_CONFIG);

0 commit comments

Comments
 (0)