From a0541b6b6d93e4bc12dabb75733ecb0722f8e1b0 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 10 Sep 2024 16:06:01 +0800 Subject: [PATCH 1/7] Optimize cluster election when nodes initiate elections at the same epoch If multiple primary nodes go down at the same time, their replica nodes will initiate the elections at the same time. There is a certain probability that the replicas will initate the elections in the same epoch. And obviously, in our current election mechanism, only one replica node can eventually get the enough votes, and the other replica node will fail to win due the the insufficient majority, and then its election will time out and we will wait for the retry, which result in a long failure time. If another node has been won the election in the failover epoch, we can assume that my election has failed and we can retry as soom as possible. Signed-off-by: Binbin --- src/cluster_legacy.c | 11 +++++++++++ tests/unit/cluster/failover2.tcl | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index f3925f5695..6610a3b7a1 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3113,6 +3113,17 @@ int clusterProcessPacket(clusterLink *link) { if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) { sender->configEpoch = sender_claimed_config_epoch; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); + + if (server.cluster->failover_auth_time && sender->configEpoch == server.cluster->failover_auth_epoch) { + /* There are another node has claimed it in this epoch, if we have any ongoing + * election, we can reset it since there won't be enough votes and we can start + * a new one ASAP. */ + server.cluster->failover_auth_time = 0; + serverLog(LL_WARNING, "I have a failover election for epoch %lld in progress and " + "received node %.40s (%s) claiming this epoch, resetting the election.", + sender->configEpoch, sender->name, sender->human_nodename); + clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); + } } /* Update the replication offset info for this node. */ sender->repl_offset = ntohu64(hdr->offset); diff --git a/tests/unit/cluster/failover2.tcl b/tests/unit/cluster/failover2.tcl index 7bc6a05e95..21c4f4a678 100644 --- a/tests/unit/cluster/failover2.tcl +++ b/tests/unit/cluster/failover2.tcl @@ -64,3 +64,36 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval } } ;# start_cluster + + +start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} { + test "Primaries will not time out then they are elected in the same epoch" { + # Since we have the delay time, so these node may not initiate the + # election at the same time (same epoch). But if they do, we make + # sure there is no failover timeout. + + # Killing there primary nodes. + pause_process [srv 0 pid] + pause_process [srv -1 pid] + pause_process [srv -2 pid] + + # Wait for the failover + wait_for_condition 1000 50 { + [s -7 role] == "master" && + [s -8 role] == "master" && + [s -9 role] == "master" + } else { + fail "No failover detected" + } + + # Make sure there is no failover timeout. + verify_no_log_message -7 "*Failover attempt expired*" 0 + verify_no_log_message -8 "*Failover attempt expired*" 0 + verify_no_log_message -9 "*Failover attempt expired*" 0 + + # Resuming these primary nodes, speed up the shutdown. + resume_process [srv 0 pid] + resume_process [srv -1 pid] + resume_process [srv -2 pid] + } +} ;# start_cluster From cea92678cf7eb3bf5846e74743cb16d210ed222f Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 10 Sep 2024 16:37:51 +0800 Subject: [PATCH 2/7] fix build and fix format Signed-off-by: Binbin --- src/cluster_legacy.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 6610a3b7a1..8711ca5ce8 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3119,9 +3119,10 @@ int clusterProcessPacket(clusterLink *link) { * election, we can reset it since there won't be enough votes and we can start * a new one ASAP. */ server.cluster->failover_auth_time = 0; - serverLog(LL_WARNING, "I have a failover election for epoch %lld in progress and " - "received node %.40s (%s) claiming this epoch, resetting the election.", - sender->configEpoch, sender->name, sender->human_nodename); + serverLog(LL_WARNING, + "I have a failover election for epoch %llu in progress and " + "received node %.40s (%s) claiming this epoch, resetting the election.", + (unsigned long long)sender->configEpoch, sender->name, sender->human_nodename); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); } } From 41034bdd56f5a12126c0c1bab6a191aaaf23b1f7 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 13 Sep 2024 15:08:40 +0800 Subject: [PATCH 3/7] Apply suggestions from code review Co-authored-by: Ping Xie Signed-off-by: Binbin --- src/cluster_legacy.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 8711ca5ce8..7e98046014 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3114,15 +3114,15 @@ int clusterProcessPacket(clusterLink *link) { sender->configEpoch = sender_claimed_config_epoch; clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); - if (server.cluster->failover_auth_time && sender->configEpoch == server.cluster->failover_auth_epoch) { - /* There are another node has claimed it in this epoch, if we have any ongoing - * election, we can reset it since there won't be enough votes and we can start - * a new one ASAP. */ + if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) { + /* Another node has claimed an epoch greater than or equal to ours. If we have an ongoing election, + * reset it because we cannot win with an epoch smaller than or equal to the incoming claim. + * This allows us to start a new election as soon as possible. */ server.cluster->failover_auth_time = 0; serverLog(LL_WARNING, - "I have a failover election for epoch %llu in progress and " - "received node %.40s (%s) claiming this epoch, resetting the election.", - (unsigned long long)sender->configEpoch, sender->name, sender->human_nodename); + "Failover election in progress for epoch %llu, but received a claim from " + "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election since we cannot win.", + (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); } } From 0270e716d7f2975011b34c21d4a0254ec4095f56 Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 13 Sep 2024 15:13:22 +0800 Subject: [PATCH 4/7] fix format Signed-off-by: Binbin --- src/cluster_legacy.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 7e98046014..36b45d4caa 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3115,14 +3115,16 @@ int clusterProcessPacket(clusterLink *link) { clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG); if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) { - /* Another node has claimed an epoch greater than or equal to ours. If we have an ongoing election, - * reset it because we cannot win with an epoch smaller than or equal to the incoming claim. - * This allows us to start a new election as soon as possible. */ + /* Another node has claimed an epoch greater than or equal to ours. + * If we have an ongoing election, reset it because we cannot win + * with an epoch smaller than or equal to the incoming claim. This + * allows us to start a new election as soon as possible. */ server.cluster->failover_auth_time = 0; serverLog(LL_WARNING, - "Failover election in progress for epoch %llu, but received a claim from " - "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election since we cannot win.", - (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch); + "Failover election in progress for epoch %llu, but received a claim from node %.40s (%s) " + "with an equal or higher epoch %llu. Resetting the election since we cannot win.", + (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename, + (unsigned long long)sender->configEpoch); clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); } } From ce0e75e1bc5afaa84e308859842f4c1dbe377f71 Mon Sep 17 00:00:00 2001 From: Binbin Date: Tue, 24 Sep 2024 11:09:43 +0800 Subject: [PATCH 5/7] update comment Signed-off-by: Binbin --- src/cluster_legacy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index a5acd39236..0aa031b090 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3138,7 +3138,7 @@ int clusterProcessPacket(clusterLink *link) { if (server.cluster->failover_auth_time && sender->configEpoch >= server.cluster->failover_auth_epoch) { /* Another node has claimed an epoch greater than or equal to ours. - * If we have an ongoing election, reset it because we cannot win + * If we have an ongoing election, reset it because we cannot win * with an epoch smaller than or equal to the incoming claim. This * allows us to start a new election as soon as possible. */ server.cluster->failover_auth_time = 0; @@ -3147,6 +3147,8 @@ int clusterProcessPacket(clusterLink *link) { "with an equal or higher epoch %llu. Resetting the election since we cannot win.", (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch); + /* Maybe we could start a new election, set a flag here to make sure + * we check as soon as possible, instead of waiting for a cron. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); } } From 7ce57f3b34bd9bb58747693c805c5594a18ddd7b Mon Sep 17 00:00:00 2001 From: Binbin Date: Fri, 8 Nov 2024 14:06:17 +0800 Subject: [PATCH 6/7] Update the comment Signed-off-by: Binbin --- src/cluster_legacy.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 0a469d0ce1..1c510ee712 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3142,11 +3142,11 @@ int clusterProcessPacket(clusterLink *link) { * with an epoch smaller than or equal to the incoming claim. This * allows us to start a new election as soon as possible. */ server.cluster->failover_auth_time = 0; - serverLog(LL_WARNING, - "Failover election in progress for epoch %llu, but received a claim from node %.40s (%s) " - "with an equal or higher epoch %llu. Resetting the election since we cannot win.", - (unsigned long long)server.cluster->failover_auth_epoch, sender->name, sender->human_nodename, - (unsigned long long)sender->configEpoch); + serverLog(LL_WARNING, "Failover election in progress for epoch %llu, but received a claim from " + "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election " + "since we cannot win an election in the past.", + (unsigned long long)server.cluster->failover_auth_epoch, + sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch); /* Maybe we could start a new election, set a flag here to make sure * we check as soon as possible, instead of waiting for a cron. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER); From 6291ed690d528635b61bbf64541a0145ac89371c Mon Sep 17 00:00:00 2001 From: Binbin Date: Sat, 9 Nov 2024 21:21:58 +0800 Subject: [PATCH 7/7] Update src/cluster_legacy.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Viktor Söderqvist Signed-off-by: Binbin --- src/cluster_legacy.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c index 1c510ee712..ae55b062ff 100644 --- a/src/cluster_legacy.c +++ b/src/cluster_legacy.c @@ -3142,11 +3142,13 @@ int clusterProcessPacket(clusterLink *link) { * with an epoch smaller than or equal to the incoming claim. This * allows us to start a new election as soon as possible. */ server.cluster->failover_auth_time = 0; - serverLog(LL_WARNING, "Failover election in progress for epoch %llu, but received a claim from " - "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election " - "since we cannot win an election in the past.", - (unsigned long long)server.cluster->failover_auth_epoch, - sender->name, sender->human_nodename, (unsigned long long)sender->configEpoch); + serverLog(LL_WARNING, + "Failover election in progress for epoch %llu, but received a claim from " + "node %.40s (%s) with an equal or higher epoch %llu. Resetting the election " + "since we cannot win an election in the past.", + (unsigned long long)server.cluster->failover_auth_epoch, + sender->name, sender->human_nodename, + (unsigned long long)sender->configEpoch); /* Maybe we could start a new election, set a flag here to make sure * we check as soon as possible, instead of waiting for a cron. */ clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);