
Commit 116c402

enjoy-binbin, zuiderkwast, hpatro, and madolson authored and committed
Fix replica can't finish failover when config epoch is outdated (valkey-io#2178)
When the primary changes its config epoch and then goes down immediately, the replica may not update the config epoch in time. Although we broadcast the change in the cluster (see valkey-io#1813), there may be a race in the network or in the code. In that case, the replica will never finish the failover, since the other primaries will refuse to vote because the replica's slot config epoch is old. We need a way to allow the replica to finish the failover in this case.

When a primary refuses to vote because the replica's config epoch is less than the dead primary's config epoch, it can send an UPDATE packet to inform the replica about the dead primary. The UPDATE message contains the dead primary's config epoch and owned slots. The failover will time out, but the replica can later try again with the updated config epoch and succeed.

Fixes valkey-io#2169.

---------

Signed-off-by: Binbin <[email protected]>
Signed-off-by: Harkrishn Patro <[email protected]>
Co-authored-by: Viktor Söderqvist <[email protected]>
Co-authored-by: Harkrishn Patro <[email protected]>
Co-authored-by: Madelyn Olson <[email protected]>

(cherry picked from commit 476671b)
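For context, the receiving side of this mechanism already exists: clusterProcessPacket handles UPDATE packets by adopting the newer config epoch and slot ownership, which is what lets the stale replica retry with a fresh epoch. A simplified sketch of that handler, paraphrased rather than quoted from the source (helper and field names are approximate):

    /* Sketch of the UPDATE handling the fix relies on (paraphrased, not verbatim). */
    else if (type == CLUSTERMSG_TYPE_UPDATE) {
        uint64_t reported_epoch = ntohu64(hdr->data.update.nodecfg.configEpoch);
        clusterNode *n = clusterLookupNode(hdr->data.update.nodecfg.nodename, CLUSTER_NAMELEN);
        if (n == NULL) return 1;                        /* Unknown node: ignore. */
        if (n->configEpoch >= reported_epoch) return 1; /* Nothing new: ignore. */
        /* Adopt the reported config epoch and slot bitmap, so the next
         * failover attempt requests votes with an up-to-date epoch. */
        clusterUpdateSlotsConfigWith(n, reported_epoch, hdr->data.update.nodecfg.slots);
    }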
1 parent baad892 commit 116c402

File tree

3 files changed: +189 −3 lines

src/cluster_legacy.c

Lines changed: 14 additions & 3 deletions
@@ -3483,9 +3483,10 @@ int clusterProcessPacket(clusterLink *link) {
                 if (server.cluster->slots[j] == sender || isSlotUnclaimed(j)) continue;
                 if (server.cluster->slots[j]->configEpoch > sender_claimed_config_epoch) {
                     serverLog(LL_VERBOSE,
-                              "Node %.40s has old slots configuration, sending "
-                              "an UPDATE message about %.40s",
-                              sender->name, server.cluster->slots[j]->name);
+                              "Node %.40s (%s) has old slots configuration, sending "
+                              "an UPDATE message about %.40s (%s)",
+                              sender->name, sender->human_nodename,
+                              server.cluster->slots[j]->name, server.cluster->slots[j]->human_nodename);
                     clusterSendUpdate(sender->link, server.cluster->slots[j]);

                     /* TODO: instead of exiting the loop send every other
@@ -4394,6 +4395,16 @@ void clusterSendFailoverAuthIfNeeded(clusterNode *node, clusterMsg *request) {
                       "slot %d epoch (%llu) > reqEpoch (%llu)",
                       node->name, node->human_nodename, j, (unsigned long long)server.cluster->slots[j]->configEpoch,
                       (unsigned long long)requestConfigEpoch);
+
+            /* Send an UPDATE message to the replica. After receiving the UPDATE message,
+             * the replica will update the slots config so that it can initiate a failover
+             * again later. Otherwise the replica will never get votes if the primary is down. */
+            serverLog(LL_VERBOSE,
+                      "Node %.40s (%s) has old slots configuration, sending "
+                      "an UPDATE message about %.40s (%s)",
+                      node->name, node->human_nodename,
+                      server.cluster->slots[j]->name, server.cluster->slots[j]->human_nodename);
+            clusterSendUpdate(node->link, server.cluster->slots[j]);
             return;
         }
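The denial still stands for the current election, so the first failover attempt expires; what the added clusterSendUpdate() call buys is that the stale replica learns the dead primary's config epoch and slots, and its retry can then collect votes. A toy, self-contained simulation of that two-round behavior (hypothetical standalone code, not part of the patch):

    /* Toy model of the voting rule this hunk changes: a voter denies a request
     * whose claimed epoch is older than the voter's view of the slot, and (with
     * this fix) also pushes its newer view back, standing in for the UPDATE packet. */
    #include <stdint.h>
    #include <stdio.h>

    typedef struct { uint64_t slot_config_epoch; } Voter; /* Voter's view of the slot. */
    typedef struct { uint64_t claimed_epoch; } Requester; /* Replica's (possibly stale) view. */

    /* Returns 1 if the vote is granted; on denial, updates the requester's view. */
    static int handle_vote_request(Voter *v, Requester *r) {
        if (v->slot_config_epoch > r->claimed_epoch) {
            r->claimed_epoch = v->slot_config_epoch; /* The "UPDATE" side effect. */
            return 0;                                /* Vote denied this round. */
        }
        return 1;
    }

    int main(void) {
        Voter voter = {7};
        Requester replica = {6}; /* Missed the epoch bump before the primary died. */
        printf("first attempt:  %s\n", handle_vote_request(&voter, &replica) ? "granted" : "denied");
        /* The first failover attempt expires, but the retry carries the learned epoch. */
        printf("second attempt: %s\n", handle_vote_request(&voter, &replica) ? "granted" : "denied");
        return 0;
    }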

tests/support/cluster_util.tcl

Lines changed: 6 additions & 0 deletions
@@ -209,6 +209,12 @@ proc cluster_allocate_replicas {masters replicas} {
 # Setup method to be executed to configure the cluster before the
 # tests run.
 proc cluster_setup {masters replicas node_count slot_allocator replica_allocator code} {
+    set config_epoch 1
+    for {set i 0} {$i < $node_count} {incr i} {
+        R $i CLUSTER SET-CONFIG-EPOCH $config_epoch
+        incr config_epoch
+    }
+
     # Have all nodes meet
     if {$::tls} {
         set tls_cluster [lindex [R 0 CONFIG GET tls-cluster] 1]
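A note on this harness change (my reading, not text from the patch): it pre-assigns each node a distinct config epoch 1..node_count before the nodes meet, so the election tests start from known, collision-free epochs. CLUSTER SET-CONFIG-EPOCH is only accepted on a fresh node, while its config epoch is still zero and its nodes table is empty, which is why the loop runs at the top of cluster_setup, before the MEET handshake.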

tests/unit/cluster/failover2.tcl

Lines changed: 169 additions & 0 deletions
@@ -64,3 +64,172 @@ start_cluster 3 4 {tags {external:skip cluster} overrides {cluster-ping-interval
     }
 
 } ;# start_cluster
+
+start_cluster 7 3 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 5000}} {
+    test "Primaries will not time out when they are elected in the same epoch" {
+        # Since we have the delay time, these nodes may not initiate the
+        # election at the same time (same epoch). But if they do, we make
+        # sure there is no failover timeout.
+
+        # Killing three primary nodes.
+        pause_process [srv 0 pid]
+        pause_process [srv -1 pid]
+        pause_process [srv -2 pid]
+
+        # Wait for the failover
+        wait_for_condition 1000 50 {
+            [s -7 role] == "master" &&
+            [s -8 role] == "master" &&
+            [s -9 role] == "master"
+        } else {
+            fail "No failover detected"
+        }
+
+        # Make sure there is no false epoch 0.
+        verify_no_log_message -7 "*Failover election in progress for epoch 0*" 0
+        verify_no_log_message -8 "*Failover election in progress for epoch 0*" 0
+        verify_no_log_message -9 "*Failover election in progress for epoch 0*" 0
+
+        # Make sure there is no failover timeout.
+        verify_no_log_message -7 "*Failover attempt expired*" 0
+        verify_no_log_message -8 "*Failover attempt expired*" 0
+        verify_no_log_message -9 "*Failover attempt expired*" 0
+
+        # Resuming these primary nodes to speed up the shutdown.
+        resume_process [srv 0 pid]
+        resume_process [srv -1 pid]
+        resume_process [srv -2 pid]
+    }
+} ;# start_cluster
+
+run_solo {cluster} {
+    start_cluster 32 15 {tags {external:skip cluster} overrides {cluster-ping-interval 1000 cluster-node-timeout 15000}} {
+        test "Multiple primary nodes are down, rank them based on the failed primary" {
+            # Killing these primary nodes.
+            for {set j 0} {$j < 15} {incr j} {
+                pause_process [srv -$j pid]
+            }
+
+            # Make sure that a node starts failover.
+            wait_for_condition 1000 100 {
+                [s -40 role] == "master"
+            } else {
+                fail "No failover detected"
+            }
+
+            # Wait for the cluster state to become ok.
+            for {set j 0} {$j < [llength $::servers]} {incr j} {
+                if {[process_is_paused [srv -$j pid]]} continue
+                wait_for_condition 1000 100 {
+                    [CI $j cluster_state] eq "ok"
+                } else {
+                    fail "Cluster node $j cluster_state:[CI $j cluster_state]"
+                }
+            }
+
+            # Resuming these primary nodes to speed up the shutdown.
+            for {set j 0} {$j < 15} {incr j} {
+                resume_process [srv -$j pid]
+            }
+        }
+    } ;# start_cluster
+} ;# run_solo
+
+# Needs to run in the body of
+# start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}}
+proc test_replica_config_epoch_failover {type} {
+    test "Replica can update the config epoch when triggering the failover - $type" {
+        set CLUSTER_PACKET_TYPE_NONE -1
+        set CLUSTER_PACKET_TYPE_ALL -2
+
+        if {$type == "automatic"} {
+            R 3 CONFIG SET cluster-replica-no-failover no
+        } elseif {$type == "manual"} {
+            R 3 CONFIG SET cluster-replica-no-failover yes
+        }
+        R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_ALL
+
+        set R0_nodeid [R 0 cluster myid]
+
+        # R 0 is the first node, so we expect its epoch to be the smallest,
+        # so bumpepoch must succeed and its config epoch will be changed.
+        set res [R 0 cluster bumpepoch]
+        assert_match {BUMPED *} $res
+        set R0_config_epoch [lindex $res 1]
+
+        # Wait for the config epoch to propagate across the cluster.
+        wait_for_condition 1000 10 {
+            $R0_config_epoch == [dict get [cluster_get_node_by_id 1 $R0_nodeid] config_epoch] &&
+            $R0_config_epoch == [dict get [cluster_get_node_by_id 2 $R0_nodeid] config_epoch]
+        } else {
+            fail "Other primaries did not update the config epoch"
+        }
+        # Make sure the replica does not update the config epoch.
+        assert_not_equal $R0_config_epoch [dict get [cluster_get_node_by_id 3 $R0_nodeid] config_epoch]
+
+        # Pause R 0 and wait for the cluster to go down.
+        pause_process [srv 0 pid]
+        R 3 DEBUG DROP-CLUSTER-PACKET-FILTER $CLUSTER_PACKET_TYPE_NONE
+        wait_for_condition 1000 50 {
+            [CI 1 cluster_state] == "fail" &&
+            [CI 2 cluster_state] == "fail" &&
+            [CI 3 cluster_state] == "fail"
+        } else {
+            fail "Cluster does not fail"
+        }
+
+        # Make sure both the automatic and the manual failover will fail the first time.
+        if {$type == "automatic"} {
+            wait_for_log_messages -3 {"*Failover attempt expired*"} 0 1000 10
+        } elseif {$type == "manual"} {
+            R 3 cluster failover force
+            wait_for_log_messages -3 {"*Manual failover timed out*"} 0 1000 10
+        }
+
+        # Make sure the primaries print the relevant logs.
+        wait_for_log_messages -1 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10
+        wait_for_log_messages -1 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10
+        wait_for_log_messages -2 {"*Failover auth denied to* epoch * > reqConfigEpoch*"} 0 1000 10
+        wait_for_log_messages -2 {"*has old slots configuration, sending an UPDATE message about*"} 0 1000 10
+
+        # Make sure the replica has updated the config epoch.
+        wait_for_condition 1000 10 {
+            $R0_config_epoch == [dict get [cluster_get_node_by_id 3 $R0_nodeid] config_epoch]
+        } else {
+            fail "The replica did not update the config epoch"
+        }
+
+        if {$type == "manual"} {
+            # The second manual failover will succeed because the config epoch
+            # has already propagated.
+            R 3 cluster failover force
+        }
+
+        # Wait for the failover to succeed.
+        wait_for_condition 1000 50 {
+            [s -3 role] == "master" &&
+            [CI 1 cluster_state] == "ok" &&
+            [CI 2 cluster_state] == "ok" &&
+            [CI 3 cluster_state] == "ok"
+        } else {
+            fail "Failover did not happen"
+        }
+
+        # Restore the old primary and make sure it converts into a replica.
+        resume_process [srv 0 pid]
+        wait_for_condition 1000 50 {
+            [s 0 role] == "slave" &&
+            [CI 0 cluster_state] == "ok"
+        } else {
+            fail "The old primary was not converted into a replica"
+        }
+    }
+}
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} {
+    test_replica_config_epoch_failover "automatic"
+}
+
+start_cluster 3 1 {tags {external:skip cluster} overrides {cluster-replica-validity-factor 0}} {
+    test_replica_config_epoch_failover "manual"
+}
