diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
index 040b578cba..70279a4c86 100644
--- a/src/cluster_legacy.c
+++ b/src/cluster_legacy.c
@@ -6384,18 +6384,42 @@ sds genClusterInfoString(void) {
         }
     }
 
+    dictIterator *di = dictGetIterator(server.cluster->nodes);
+    dictEntry *de;
+    unsigned nodes_pfail = 0, nodes_fail = 0, voting_nodes_pfail = 0, voting_nodes_fail = 0;
+    while ((de = dictNext(di)) != NULL) {
+        clusterNode *node = dictGetVal(de);
+        if (nodeTimedOut(node)) {
+            nodes_pfail++;
+            if (clusterNodeIsVotingPrimary(node)) {
+                voting_nodes_pfail++;
+            }
+        }
+        if (nodeFailed(node)) {
+            nodes_fail++;
+            if (clusterNodeIsVotingPrimary(node)) {
+                voting_nodes_fail++;
+            }
+        }
+    }
+    dictReleaseIterator(di);
+
     info = sdscatprintf(info,
                         "cluster_state:%s\r\n"
                         "cluster_slots_assigned:%d\r\n"
                         "cluster_slots_ok:%d\r\n"
                         "cluster_slots_pfail:%d\r\n"
                         "cluster_slots_fail:%d\r\n"
+                        "cluster_nodes_pfail:%u\r\n"
+                        "cluster_nodes_fail:%u\r\n"
+                        "cluster_voting_nodes_pfail:%u\r\n"
+                        "cluster_voting_nodes_fail:%u\r\n"
                         "cluster_known_nodes:%lu\r\n"
                         "cluster_size:%d\r\n"
                         "cluster_current_epoch:%llu\r\n"
                         "cluster_my_epoch:%llu\r\n",
                         statestr[server.cluster->state], slots_assigned, slots_ok, slots_pfail, slots_fail,
-                        dictSize(server.cluster->nodes), server.cluster->size,
+                        nodes_pfail, nodes_fail, voting_nodes_pfail, voting_nodes_fail, dictSize(server.cluster->nodes), server.cluster->size,
                         (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
 
     /* Show stats about messages sent and received. */
diff --git a/tests/unit/cluster/info.tcl b/tests/unit/cluster/info.tcl
index f882378172..6c8f77b25b 100644
--- a/tests/unit/cluster/info.tcl
+++ b/tests/unit/cluster/info.tcl
@@ -64,3 +64,42 @@ start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout
         wait_for_cluster_state ok
     }
 }
+
+start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
+    # Pause two primaries so the remaining one observes them as partially failed.
+    pause_process [srv 0 pid]
+    pause_process [srv -1 pid]
+
+    test "count - node partial failure" {
+        wait_for_condition 500 10 {
+            [CI 2 cluster_nodes_pfail] eq 2 &&
+            [CI 2 cluster_nodes_fail] eq 0 &&
+            [CI 2 cluster_voting_nodes_pfail] eq 2 &&
+            [CI 2 cluster_voting_nodes_fail] eq 0
+        } else {
+            puts [R 2 CLUSTER INFO]
+            fail "Node 0/1 never timed out"
+        }
+    }
+
+    # Resume one primary so a quorum can agree that node 0 has failed.
+    resume_process [srv -1 pid]
+
+    test "count - node complete failure" {
+        # After reaching quorum about the failure,
+        # node 0 should be marked as FAIL across all nodes in the cluster.
+        wait_for_condition 100 100 {
+            [CI 1 cluster_nodes_fail] eq 1 &&
+            [CI 2 cluster_nodes_fail] eq 1 &&
+            [CI 1 cluster_nodes_pfail] eq 0 &&
+            [CI 2 cluster_nodes_pfail] eq 0 &&
+            [CI 1 cluster_voting_nodes_fail] eq 1 &&
+            [CI 2 cluster_voting_nodes_fail] eq 1
+        } else {
+            fail "Node 0 never completely failed"
+        }
+    }
+
+    # Resume node 0 so cluster teardown does not hang on a SIGSTOPped process.
+    resume_process [srv 0 pid]
+}