Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
25 changes: 24 additions & 1 deletion src/cluster_legacy.c
Original file line number Diff line number Diff line change
Expand Up @@ -6384,18 +6384,41 @@ sds genClusterInfoString(void) {
}
}

dictIterator *di = dictGetIterator(server.cluster->nodes);
dictEntry *de;
unsigned nodes_pfail = 0, nodes_fail = 0;
while ((de = dictNext(di)) != NULL) {
clusterNode *node = dictGetVal(de);
if (nodeTimedOut(node)) {
nodes_pfail++;
}
if (nodeFailed(node)) {
nodes_fail++;
}
}
dictReleaseIterator(di);

if (nodes_pfail + nodes_fail > dictSize(server.cluster->nodes) && !server.crashed) {
serverLog(LL_WARNING, "Aggregated count of nodes marked as PFAIL and FAIL exceeds the total count of nodes."
"PFAIL nodes: %u, FAIL nodes: %u, total nodes: %lu",
nodes_pfail, nodes_fail, dictSize(server.cluster->nodes));
serverAssert(0);
}

info = sdscatprintf(info,
"cluster_state:%s\r\n"
"cluster_slots_assigned:%d\r\n"
"cluster_slots_ok:%d\r\n"
"cluster_slots_pfail:%d\r\n"
"cluster_slots_fail:%d\r\n"
"cluster_nodes_pfail:%u\r\n"
"cluster_nodes_fail:%u\r\n"
"cluster_known_nodes:%lu\r\n"
"cluster_size:%d\r\n"
"cluster_current_epoch:%llu\r\n"
"cluster_my_epoch:%llu\r\n",
statestr[server.cluster->state], slots_assigned, slots_ok, slots_pfail, slots_fail,
dictSize(server.cluster->nodes), server.cluster->size,
nodes_pfail, nodes_fail, dictSize(server.cluster->nodes), server.cluster->size,
(unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));

/* Show stats about messages sent and received. */
Expand Down
32 changes: 32 additions & 0 deletions tests/unit/cluster/info.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,35 @@ start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout
wait_for_cluster_state ok
}
}

# Checks the cluster_nodes_pfail / cluster_nodes_fail counters by pausing
# nodes and polling those fields on the nodes that are still running.
# The cluster is started with "3 0" (presumably 3 primaries and 0 replicas --
# confirm against start_cluster) and cluster-node-timeout overridden to 1000.
start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
# Pause (not kill) two primaries to observe partial failure on the remaining
# one; one of them is resumed again further down.
pause_process [srv 0 pid]
pause_process [srv -1 pid]

test "count - node partial failure" {
# Poll node 2 until it reports exactly two PFAIL nodes and no FAIL nodes.
# NOTE(review): FAIL is presumably unreachable here because node 2 is the
# only running node and cannot gather a majority -- confirm.
wait_for_condition 500 10 {
[CI 2 cluster_nodes_pfail] eq 2 &&
[CI 2 cluster_nodes_fail] eq 0
} else {
# Dump node 2's full CLUSTER INFO reply to help diagnose the failure.
puts [R 2 CLUSTER INFO]
fail "Node 0/1 never timed out"
}
}

# Enable one more primary to reach quorum about node 0 failure
# (assumes "srv -1" is the node addressed as index 1 by CI below -- confirm).
resume_process [srv -1 pid]

test "count - node complete failure" {
# After reaching quorum about failure,
# node 0 should be marked as FAIL across all nodes in the cluster.
# Both running nodes (1 and 2) must report one FAIL node and zero PFAIL
# nodes, i.e. node 0 has moved from PFAIL to FAIL and the resumed node is
# no longer counted as PFAIL.
wait_for_condition 100 100 {
[CI 1 cluster_nodes_fail] eq 1 &&
[CI 2 cluster_nodes_fail] eq 1 &&
[CI 1 cluster_nodes_pfail] eq 0 &&
[CI 2 cluster_nodes_pfail] eq 0
} else {
fail "Node 0 never completely failed"
}
}
}
Loading