Skip to content
12 changes: 12 additions & 0 deletions src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -3113,6 +3113,18 @@ int clusterProcessPacket(clusterLink *link) {
if (sender_claims_to_be_primary && sender_claimed_config_epoch > sender->configEpoch) {
sender->configEpoch = sender_claimed_config_epoch;
clusterDoBeforeSleep(CLUSTER_TODO_SAVE_CONFIG | CLUSTER_TODO_FSYNC_CONFIG);

if (server.cluster->failover_auth_time && sender->configEpoch == server.cluster->failover_auth_epoch) {
/* Another node has already claimed this epoch. If we have an election in
* progress for it, we can reset that election: it won't collect enough
* votes anyway, and resetting lets us start a new one ASAP. */
server.cluster->failover_auth_time = 0;
serverLog(LL_WARNING,
"I have a failover election for epoch %llu in progress and "
"received node %.40s (%s) claiming this epoch, resetting the election.",
(unsigned long long)sender->configEpoch, sender->name, sender->human_nodename);
clusterDoBeforeSleep(CLUSTER_TODO_HANDLE_FAILOVER);
}
}
/* Update the replication offset info for this node. */
sender->repl_offset = ntohu64(hdr->offset);
Expand Down
33 changes: 33 additions & 0 deletions tests/unit/cluster/failover2.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,36 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval
}

} ;# start_cluster


start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
    test "Primaries will not time out then they are elected in the same epoch" {
        # Because of the election delay, the nodes may not start their
        # elections at the same time (i.e. in the same epoch). If they do,
        # we verify that none of them ends up with a failover timeout.

        # Server indexes of the primaries we take down, and of the nodes
        # that are expected to be promoted to master afterwards.
        set paused_nodes {0 -1 -2}
        set promoted_nodes {-7 -8 -9}

        # Take down three primary nodes by pausing their processes.
        foreach node_idx $paused_nodes {
            pause_process [srv $node_idx pid]
        }

        # Wait until all three expected nodes report the master role.
        wait_for_condition 1000 50 {
            [s -7 role] eq "master" &&
            [s -8 role] eq "master" &&
            [s -9 role] eq "master"
        } else {
            fail "No failover detected"
        }

        # None of the promoted nodes may have logged an expired failover attempt.
        foreach node_idx $promoted_nodes {
            verify_no_log_message $node_idx "*Failover attempt expired*" 0
        }

        # Resume the paused primaries to speed up the shutdown.
        foreach node_idx $paused_nodes {
            resume_process [srv $node_idx pid]
        }
    }
} ;# start_cluster