Skip to content

Commit 66cc412

Browse files
hpatrozuiderkwast
authored andcommitted
Converge divergent shard-id persisted in nodes.conf to primary's shard id (#2174)
Fixes #2171 Handle divergent shard-id across primary and replica from nodes.conf and reconcile all the nodes in the shard to the primary node's shard-id. --------- Signed-off-by: Harkrishn Patro <[email protected]>
1 parent f7388cb commit 66cc412

File tree

7 files changed

+48
-6
lines changed

7 files changed

+48
-6
lines changed

src/cluster_legacy.c

Lines changed: 18 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -316,10 +316,16 @@ int auxShardIdSetter(clusterNode *n, void *value, size_t length) {
316316
}
317317
memcpy(n->shard_id, value, CLUSTER_NAMELEN);
318318
/* if n already has replicas, make sure they all agree
319-
* on the shard id */
319+
* on the shard id. If not, update them. */
320320
for (int i = 0; i < n->num_replicas; i++) {
321321
if (memcmp(n->replicas[i]->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) {
322-
return C_ERR;
322+
serverLog(LL_NOTICE,
323+
"Node %.40s has a different shard id (%.40s) than its primary's shard id %.40s (%.40s). "
324+
"Updating replica's shard id to match primary's shard id.",
325+
n->replicas[i]->name, n->replicas[i]->shard_id, n->name, n->shard_id);
326+
clusterRemoveNodeFromShard(n->replicas[i]);
327+
memcpy(n->replicas[i]->shard_id, n->shard_id, CLUSTER_NAMELEN);
328+
clusterAddNodeToShard(n->shard_id, n->replicas[i]);
323329
}
324330
}
325331
clusterAddNodeToShard(value, n);
@@ -720,10 +726,16 @@ int clusterLoadConfig(char *filename) {
720726
clusterAddNodeToShard(primary->shard_id, n);
721727
} else if (clusterGetNodesInMyShard(primary) != NULL &&
722728
memcmp(primary->shard_id, n->shard_id, CLUSTER_NAMELEN) != 0) {
723-
/* If the primary has been added to a shard, make sure this
724-
* node has the same persisted shard id as the primary. */
725-
sdsfreesplitres(argv, argc);
726-
goto fmterr;
729+
/* If the primary has been added to a shard and this replica has
730+
* a different shard id stored in nodes.conf, update it to match
731+
* the primary instead of aborting the startup. */
732+
serverLog(LL_NOTICE,
733+
"Node %.40s has a different shard id (%.40s) than its primary %.40s (%.40s). "
734+
"Updating replica's shard id to match primary's shard id.",
735+
n->name, n->shard_id, primary->name, primary->shard_id);
736+
clusterRemoveNodeFromShard(n);
737+
memcpy(n->shard_id, primary->shard_id, CLUSTER_NAMELEN);
738+
clusterAddNodeToShard(primary->shard_id, n);
727739
}
728740
n->replicaof = primary;
729741
clusterNodeAddReplica(primary, n);
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd myself,master - 0 1749488968682 13 connected 0-16383
2+
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
3+
vars currentEpoch 13 lastVoteEpoch 9
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
2+
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd myself,master - 0 1749488968682 13 connected 0-16383
3+
vars currentEpoch 13 lastVoteEpoch 9
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd master - 0 1749488968682 13 connected 0-16383
2+
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd myself,slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
3+
vars currentEpoch 13 lastVoteEpoch 9
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
8d89f4d4e7c57a2819277732f86213241c3ec0d3 127.0.0.1:0@6380,,tls-port=0,shard-id=a91532eb722943e035f34292cf586f3f750d65bd myself,slave 43ee1cacd6948ee96bb367eb8795e62e8d153f05 0 1749488968682 13 connected
2+
43ee1cacd6948ee96bb367eb8795e62e8d153f05 127.0.0.1:0@6379,,tls-port=0,shard-id=f91532eb722943e035f34292cf586f3f750d65bd master - 0 1749488968682 13 connected 0-16383
3+
vars currentEpoch 13 lastVoteEpoch 9

tests/test_helper.tcl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,7 @@ proc cleanup {} {
304304
if {!$::quiet} {puts -nonewline "Cleanup: may take some time... "}
305305
flush stdout
306306
catch {exec rm -rf {*}[glob tests/tmp/valkey.conf.*]}
307+
catch {exec rm -rf {*}[glob tests/tmp/nodes.conf.*]}
307308
catch {exec rm -rf {*}[glob tests/tmp/server*.*]}
308309
catch {exec rm -rf {*}[glob tests/tmp/*.acl.*]}
309310
if {!$::quiet} {puts "OK"}
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
tags {external:skip cluster singledb} {
2+
set old_singledb $::singledb
3+
set ::singledb 1
4+
# Start a cluster with a divergent shard ID configuration
5+
test "divergent cluster shardid conflict" {
6+
for {set i 1} {$i <= 4} {incr i} {
7+
if {$::verbose} { puts "Testing for tests/assets/divergent-shard-$i.conf"; flush stdout;}
8+
exec cp -f tests/assets/divergent-shard-$i.conf tests/tmp/nodes.conf.divergent
9+
start_server {overrides {"cluster-enabled" "yes" "cluster-config-file" "../nodes.conf.divergent"}} {
10+
set shardid [r CLUSTER MYSHARDID]
11+
set count [exec grep -c $shardid tests/tmp/nodes.conf.divergent];
12+
assert_equal $count 2 "Expect shard ID to be present twice in the configuration file"
13+
}
14+
}
15+
}
16+
set ::singledb $old_singledb
17+
}

0 commit comments

Comments
 (0)