Add node pfail and fail count to cluster info metrics (valkey-io#1910)

hpatro · Nitai Caro · commit b75c514e9272 · 2025-04-22T07:25:38.000Z
New fields in CLUSTER INFO:

* `cluster_nodes_pfail`
* `cluster_nodes_fail`
* `cluster_voting_nodes_pfail`
* `cluster_voting_nodes_fail`

I'm running a few tests and trying to capture the partially failed and
completely failed node counts. Per-slot partially failed / completely failed
stats exist, but it is difficult to assess the node failure count from them.

New output:

```
> CLUSTER INFO
cluster_state:fail
cluster_slots_assigned:0
cluster_slots_ok:0
cluster_slots_pfail:0
cluster_slots_fail:0
cluster_nodes_pfail:1
cluster_nodes_fail:0
cluster_voting_nodes_pfail:1
cluster_voting_nodes_fail:0
cluster_known_nodes:3
cluster_size:0
cluster_current_epoch:1
cluster_my_epoch:1
cluster_stats_messages_ping_sent:2104
cluster_stats_messages_pong_sent:1906
cluster_stats_messages_meet_sent:1
cluster_stats_messages_sent:4011
cluster_stats_messages_ping_received:1906
cluster_stats_messages_pong_received:1964
cluster_stats_messages_received:3870
total_cluster_links_buffer_limit_exceeded:0
```

---------

Signed-off-by: Harkrishn Patro <harkrisp@amazon.com>
Signed-off-by: Nitai Caro <caronita@amazon.com>
diff --git a/src/cluster_legacy.c b/src/cluster_legacy.c
@@ -6444,18 +6444,42 @@ sds genClusterInfoString(void) {
         }
     }
 
+    dictIterator *di = dictGetIterator(server.cluster->nodes);
+    dictEntry *de;
+    unsigned nodes_pfail = 0, nodes_fail = 0, voting_nodes_pfail = 0, voting_nodes_fail = 0;
+    while ((de = dictNext(di)) != NULL) {
+        clusterNode *node = dictGetVal(de);
+        if (nodeTimedOut(node)) {
+            nodes_pfail++;
+            if (clusterNodeIsVotingPrimary(node)) {
+                voting_nodes_pfail++;
+            }
+        }
+        if (nodeFailed(node)) {
+            nodes_fail++;
+            if (clusterNodeIsVotingPrimary(node)) {
+                voting_nodes_fail++;
+            }
+        }
+    }
+    dictReleaseIterator(di);
+
     info = sdscatprintf(info,
                         "cluster_state:%s\r\n"
                         "cluster_slots_assigned:%d\r\n"
                         "cluster_slots_ok:%d\r\n"
                         "cluster_slots_pfail:%d\r\n"
                         "cluster_slots_fail:%d\r\n"
+                        "cluster_nodes_pfail:%u\r\n"
+                        "cluster_nodes_fail:%u\r\n"
+                        "cluster_voting_nodes_pfail:%u\r\n"
+                        "cluster_voting_nodes_fail:%u\r\n"
                         "cluster_known_nodes:%lu\r\n"
                         "cluster_size:%d\r\n"
                         "cluster_current_epoch:%llu\r\n"
                         "cluster_my_epoch:%llu\r\n",
                         statestr[server.cluster->state], slots_assigned, slots_ok, slots_pfail, slots_fail,
-                        dictSize(server.cluster->nodes), server.cluster->size,
+                        nodes_pfail, nodes_fail, voting_nodes_pfail, voting_nodes_fail, dictSize(server.cluster->nodes), server.cluster->size,
                         (unsigned long long)server.cluster->currentEpoch, (unsigned long long)nodeEpoch(myself));
 
     /* Show stats about messages sent and received. */
diff --git a/tests/unit/cluster/info.tcl b/tests/unit/cluster/info.tcl
@@ -64,3 +64,40 @@ start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout
         wait_for_cluster_state ok
     }
 }
+
+start_cluster 3 0 {tags {external:skip cluster} overrides {cluster-node-timeout 1000}} {
+    # Pause two primaries so the remaining one reports them as partially failed.
+    pause_process [srv 0 pid]
+    pause_process [srv -1 pid]
+
+    test "count - node partial failure" {
+        wait_for_condition 500 10 {
+            [CI 2 cluster_nodes_pfail] eq 2 &&
+            [CI 2 cluster_nodes_fail] eq 0 &&
+            [CI 2 cluster_voting_nodes_pfail] eq 2 &&
+            [CI 2 cluster_voting_nodes_fail] eq 0
+        } else {
+            puts [R 2 CLUSTER INFO]
+            fail "Node 0/1 never timed out"
+        }
+    }
+
+    # Resume one primary so that quorum can be reached about node 0's failure
+    resume_process [srv -1 pid]
+
+    test "count - node complete failure" {
+        # After reaching quorum about failure,
+        # node 0 should be marked as FAIL across all nodes in the cluster
+        wait_for_condition 100 100 {
+            [CI 1 cluster_nodes_fail] eq 1 &&
+            [CI 2 cluster_nodes_fail] eq 1 &&
+            [CI 1 cluster_nodes_pfail] eq 0 &&
+            [CI 2 cluster_nodes_pfail] eq 0 &&
+            [CI 1 cluster_voting_nodes_fail] eq 1 &&
+            [CI 2 cluster_voting_nodes_fail] eq 1
+
+        } else {
+            fail "Node 0 never completely failed"
+        }
+    }
+}