Skip to content

Commit b75c514

Browse files
hpatro authored and Nitai Caro committed
Add node pfail and fail count to cluster info metrics (valkey-io#1910)
New fields in CLUSTER INFO:

* `cluster_nodes_pfail`
* `cluster_nodes_fail`
* `cluster_voting_nodes_pfail`
* `cluster_voting_nodes_fail`

I'm running a few tests and trying to capture the number of partially failed and completely failed nodes. Slot-level partially failed / completely failed stats already exist, but it is more difficult to assess the node failure count from them.

New output:

```
> CLUSTER INFO
cluster_state:fail
cluster_slots_assigned:0
cluster_slots_ok:0
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_nodes_pfail:1
cluster_nodes_fail:0
cluster_voting_nodes_pfail:1
cluster_voting_nodes_fail:0
cluster_known_nodes:3
cluster_size:0
cluster_current_epoch:1
cluster_my_epoch:1
cluster_stats_messages_ping_sent:2104
cluster_stats_messages_pong_sent:1906
cluster_stats_messages_meet_sent:1
cluster_stats_messages_sent:4011
cluster_stats_messages_ping_received:1906
cluster_stats_messages_pong_received:1964
cluster_stats_messages_received:3870
total_cluster_links_buffer_limit_exceeded:0
```

---------

Signed-off-by: Harkrishn Patro <[email protected]>
Signed-off-by: Nitai Caro <[email protected]>
1 parent 0a8eef1 commit b75c514

File tree

2 files changed

+62
-1
lines changed

2 files changed

+62
-1
lines changed

src/cluster_legacy.c

Lines changed: 25 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6444,18 +6444,42 @@ sds genClusterInfoString(void) {
64446444
}
64456445
}
64466446

6447+
dictIterator *di = dictGetIterator(server.cluster->nodes);
6448+
dictEntry *de;
6449+
unsigned nodes_pfail = 0, nodes_fail = 0, voting_nodes_pfail = 0, voting_nodes_fail = 0;
6450+
while ((de = dictNext(di)) != NULL) {
6451+
clusterNode *node = dictGetVal(de);
6452+
if (nodeTimedOut(node)) {
6453+
nodes_pfail++;
6454+
if (clusterNodeIsVotingPrimary(node)) {
6455+
voting_nodes_pfail++;
6456+
}
6457+
}
6458+
if (nodeFailed(node)) {
6459+
nodes_fail++;
6460+
if (clusterNodeIsVotingPrimary(node)) {
6461+
voting_nodes_fail++;
6462+
}
6463+
}
6464+
}
6465+
dictReleaseIterator(di);
6466+
64476467
info = sdscatprintf(info,
64486468
"cluster_state:%s\r\n"
64496469
"cluster_slots_assigned:%d\r\n"
64506470
"cluster_slots_ok:%d\r\n"
64516471
"cluster_slots_pfail:%d\r\n"
64526472
"cluster_slots_fail:%d\r\n"
6473+
"cluster_nodes_pfail:%u\r\n"
6474+
"cluster_nodes_fail:%u\r\n"
6475+
"cluster_voting_nodes_pfail:%u\r\n"
6476+
"cluster_voting_nodes_fail:%u\r\n"
64536477
"cluster_known_nodes:%lu\r\n"
64546478
"cluster_size:%d\r\n"
64556479
"cluster_current_epoch:%llu\r\n"
64566480
"cluster_my_epoch:%llu\r\n",
64576481
statestr[server.cluster->state], slots_assigned, slots_ok, slots_pfail, slots_fail,
6458-
dictSize(server.cluster->nodes), server.cluster->size,
6482+
nodes_pfail, nodes_fail, voting_nodes_pfail, voting_nodes_fail, dictSize(server.cluster->nodes), server.cluster->size,
64596483
(unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
64606484

64616485
/* Show stats about messages sent and received. */

tests/unit/cluster/info.tcl

Lines changed: 37 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -64,3 +64,40 @@ start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout
6464
wait_for_cluster_state ok
6565
}
6666
}
67+
68+
start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
69+
# Pause two primaries so that the remaining one reports them as partially failed (PFAIL).
70+
pause_process [srv 0 pid]
71+
pause_process [srv -1 pid]
72+
73+
test "count - node partial failure" {
74+
wait_for_condition 500 10 {
75+
[CI 2 cluster_nodes_pfail] eq 2 &&
76+
[CI 2 cluster_nodes_fail] eq 0 &&
77+
[CI 2 cluster_voting_nodes_pfail] eq 2 &&
78+
[CI 2 cluster_voting_nodes_fail] eq 0
79+
} else {
80+
puts [R 2 CLUSTER INFO]
81+
fail "Node 0/1 never timed out"
82+
}
83+
}
84+
85+
# Resume one of the paused primaries so that a quorum can be reached about node 0's failure.
86+
resume_process [srv -1 pid]
87+
88+
test "count - node complete failure" {
89+
# After reaching quorum about failure,
90+
# node 0 should be marked as FAIL across all nodes in the cluster
91+
wait_for_condition 100 100 {
92+
[CI 1 cluster_nodes_fail] eq 1 &&
93+
[CI 2 cluster_nodes_fail] eq 1 &&
94+
[CI 1 cluster_nodes_pfail] eq 0 &&
95+
[CI 2 cluster_nodes_pfail] eq 0 &&
96+
[CI 1 cluster_voting_nodes_fail] eq 1 &&
97+
[CI 2 cluster_voting_nodes_fail] eq 1
98+
99+
} else {
100+
fail "Node 0 never completely failed"
101+
}
102+
}
103+
}

0 commit comments

Comments
 (0)