Skip to content

Commit f4edcf7

Browse files
authored
feat: add custom health check for CloudNativePG (#22802)
Signed-off-by: Jonathan Gonzalez V. <[email protected]>
1 parent 6d25734 commit f4edcf7

6 files changed

Lines changed: 744 additions & 0 deletions

File tree

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
-- Health status table that the script fills in and returns.
local hs = {}

-- Maps the value of a Cluster's `status.phase` to the health status reported
-- for it. A phase that has no entry here yields nil on lookup, which the
-- evaluation code below reports as "Unknown".
-- Each phase must appear exactly once: Lua silently lets a later duplicate key
-- overwrite an earlier one, which hides copy/paste mistakes.
local cnpgStatus = {
  -- Steady state.
  ["Cluster in healthy state"] = "Healthy",

  -- The cluster is converging towards the desired state.
  ["Setting up primary"] = "Progressing",
  ["Creating a new replica"] = "Progressing",
  ["Upgrading cluster"] = "Progressing",
  ["Waiting for the instances to become active"] = "Progressing",
  ["Promoting to primary cluster"] = "Progressing",

  -- Phases reported as degraded while they last.
  ["Switchover in progress"] = "Degraded",
  ["Failing over"] = "Degraded",
  ["Upgrading Postgres major version"] = "Degraded",
  ["Cluster upgrade delayed"] = "Degraded",
  ["Waiting for user action"] = "Degraded",
  ["Primary instance is being restarted in-place"] = "Degraded",
  ["Primary instance is being restarted without a switchover"] = "Degraded",
  ["Cluster cannot execute instance online upgrade due to missing architecture binary"] = "Degraded",
  ["Online upgrade in progress"] = "Degraded",
  ["Applying configuration"] = "Degraded",

  -- Phases reported as suspended.
  ["Unable to create required cluster objects"] = "Suspended",
  ["Cluster cannot proceed to reconciliation due to an unknown plugin being required"] = "Suspended",
  ["Cluster has incomplete or invalid image catalog"] = "Suspended",
  ["Cluster is unrecoverable and needs manual intervention"] = "Suspended",
}
26+
27+
-- Returns the first entry of `obj.status.conditions` whose `type` is
-- "cnpg.io/hibernation", or nil when there is no such entry.
-- Also returns nil (instead of raising) when `obj`, `obj.status` or
-- `obj.status.conditions` is missing, so the helper is safe to call on a
-- resource whose status has not been populated yet.
-- Declared `local` so the helper does not leak into the global environment.
local function hibernating(obj)
  if obj == nil or obj.status == nil or obj.status.conditions == nil then
    return nil
  end
  -- conditions is a list, so walk it in order with ipairs.
  for _, condition in ipairs(obj.status.conditions) do
    if condition.type == "cnpg.io/hibernation" then
      return condition
    end
  end
  return nil
end
35+
36+
-- Derive the health status of the Cluster held in `obj`.
--
-- Order of evaluation:
--   1. If status.conditions exists and contains a "cnpg.io/hibernation"
--      condition, that condition decides the result: "Suspended" when its
--      status is "True", "Degraded" otherwise, with the condition's message.
--   2. Otherwise, if status.conditions exists, status.phase is looked up in
--      cnpgStatus; phases without an entry are reported as "Unknown". The
--      message is status.phaseReason.
--   3. If status or status.conditions is missing, report "Progressing".
if obj.status ~= nil and obj.status.conditions ~= nil then
  local hibernation = hibernating(obj)
  if hibernation ~= nil then
    if hibernation.status == "True" then
      hs.status = "Suspended"
    else
      hs.status = "Degraded"
    end
    hs.message = hibernation.message
    return hs
  end

  -- `local` keeps the lookup result out of the global environment.
  local statusState = cnpgStatus[obj.status.phase]
  if statusState ~= nil then
    hs.status = statusState
  else
    hs.status = "Unknown"
  end
  hs.message = obj.status.phaseReason
  return hs
end

hs.status = "Progressing"
-- obj.status may be nil on this path (the guard above is skipped in that
-- case), so only read phaseReason when the status table actually exists;
-- indexing a nil status would abort the script with a runtime error.
if obj.status ~= nil then
  hs.message = obj.status.phaseReason
end
return hs
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Expected results for the Cluster health check.
# Each entry pairs an expected healthStatus (status and message) with the
# inputPath of a manifest under testdata/.
# NOTE(review): presumably consumed by the repository's health-check test
# runner, which evaluates the Lua script against each inputPath — confirm.
tests:
2+
- healthStatus:
3+
status: Progressing
4+
message: "Creating primary instance cluster-example-1"
5+
inputPath: testdata/cluster_progressing.yaml
6+
- healthStatus:
7+
status: Healthy
8+
message: ""
9+
inputPath: testdata/cluster_healthy.yaml
10+
- healthStatus:
11+
status: Suspended
12+
message: "Cluster has been hibernated"
13+
inputPath: testdata/cluster_suspended.yaml
14+
- healthStatus:
15+
status: Degraded
16+
message: "Initiating a failover from cluster-example-2"
17+
inputPath: testdata/cluster_degraded.yaml
Lines changed: 175 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,175 @@
1+
apiVersion: postgresql.cnpg.io/v1
2+
kind: Cluster
3+
metadata:
4+
annotations:
5+
cnpg.io/hibernation: "off"
6+
kubectl.kubernetes.io/last-applied-configuration: |
7+
{"apiVersion":"postgresql.cnpg.io/v1","kind":"Cluster","metadata":{"annotations":{},"name":"cluster-example","namespace":"default"},"spec":{"imageName":"ghcr.io/cloudnative-pg/postgresql:13","instances":3,"storage":{"size":"1Gi"}}}
8+
creationTimestamp: "2025-04-25T19:59:15Z"
9+
generation: 1
10+
name: cluster-example
11+
namespace: default
12+
resourceVersion: "18125"
13+
uid: ec7a9e21-a2fe-46cb-a7a3-d5c72e3fa9d4
14+
spec:
15+
affinity:
16+
podAntiAffinityType: preferred
17+
bootstrap:
18+
initdb:
19+
database: app
20+
encoding: UTF8
21+
localeCType: C
22+
localeCollate: C
23+
owner: app
24+
enablePDB: true
25+
enableSuperuserAccess: false
26+
failoverDelay: 0
27+
imageName: ghcr.io/cloudnative-pg/postgresql:13
28+
instances: 3
29+
logLevel: info
30+
maxSyncReplicas: 0
31+
minSyncReplicas: 0
32+
monitoring:
33+
customQueriesConfigMap:
34+
- key: queries
35+
name: cnpg-default-monitoring
36+
disableDefaultQueries: false
37+
enablePodMonitor: false
38+
postgresGID: 26
39+
postgresUID: 26
40+
postgresql:
41+
parameters:
42+
archive_mode: "on"
43+
archive_timeout: 5min
44+
dynamic_shared_memory_type: posix
45+
full_page_writes: "on"
46+
log_destination: csvlog
47+
log_directory: /controller/log
48+
log_filename: postgres
49+
log_rotation_age: "0"
50+
log_rotation_size: "0"
51+
log_truncate_on_rotation: "false"
52+
logging_collector: "on"
53+
max_parallel_workers: "32"
54+
max_replication_slots: "32"
55+
max_worker_processes: "32"
56+
shared_memory_type: mmap
57+
shared_preload_libraries: ""
58+
ssl_max_protocol_version: TLSv1.3
59+
ssl_min_protocol_version: TLSv1.3
60+
wal_keep_size: 512MB
61+
wal_level: logical
62+
wal_log_hints: "on"
63+
wal_receiver_timeout: 5s
64+
wal_sender_timeout: 5s
65+
syncReplicaElectionConstraint:
66+
enabled: false
67+
primaryUpdateMethod: restart
68+
primaryUpdateStrategy: unsupervised
69+
replicationSlots:
70+
highAvailability:
71+
enabled: true
72+
slotPrefix: _cnpg_
73+
synchronizeReplicas:
74+
enabled: true
75+
updateInterval: 30
76+
resources: {}
77+
smartShutdownTimeout: 180
78+
startDelay: 3600
79+
stopDelay: 1800
80+
storage:
81+
resizeInUseVolumes: true
82+
size: 1Gi
83+
switchoverDelay: 3600
84+
status:
85+
availableArchitectures:
86+
- goArch: amd64
87+
hash: d54839c128b2b38034c6f73006b2a979d916c9715cda5d59a1241018cc44904e
88+
certificates:
89+
clientCASecret: cluster-example-ca
90+
expirations:
91+
cluster-example-ca: 2025-07-24 19:54:15 +0000 UTC
92+
cluster-example-replication: 2025-07-24 19:54:16 +0000 UTC
93+
cluster-example-server: 2025-07-24 19:54:15 +0000 UTC
94+
replicationTLSSecret: cluster-example-replication
95+
serverAltDNSNames:
96+
- cluster-example-rw
97+
- cluster-example-rw.default
98+
- cluster-example-rw.default.svc
99+
- cluster-example-rw.default.svc.cluster.local
100+
- cluster-example-r
101+
- cluster-example-r.default
102+
- cluster-example-r.default.svc
103+
- cluster-example-r.default.svc.cluster.local
104+
- cluster-example-ro
105+
- cluster-example-ro.default
106+
- cluster-example-ro.default.svc
107+
- cluster-example-ro.default.svc.cluster.local
108+
serverCASecret: cluster-example-ca
109+
serverTLSSecret: cluster-example-server
110+
cloudNativePGCommitHash: 005e82a17
111+
cloudNativePGOperatorHash: d54839c128b2b38034c6f73006b2a979d916c9715cda5d59a1241018cc44904e
112+
conditions:
113+
- lastTransitionTime: "2025-04-25T20:34:45Z"
114+
message: Cluster Is Not Ready
115+
reason: ClusterIsNotReady
116+
status: "False"
117+
type: Ready
118+
- lastTransitionTime: "2025-04-25T20:34:20Z"
119+
message: Continuous archiving is working
120+
reason: ContinuousArchivingSuccess
121+
status: "True"
122+
type: ContinuousArchiving
123+
configMapResourceVersion:
124+
metrics:
125+
cnpg-default-monitoring: "12266"
126+
currentPrimary: cluster-example-2
127+
currentPrimaryTimestamp: "2025-04-25T20:34:19.149106Z"
128+
danglingPVC:
129+
- cluster-example-2
130+
healthyPVC:
131+
- cluster-example-1
132+
- cluster-example-3
133+
image: ghcr.io/cloudnative-pg/postgresql:13
134+
instanceNames:
135+
- cluster-example-1
136+
- cluster-example-2
137+
- cluster-example-3
138+
instances: 3
139+
instancesReportedState:
140+
cluster-example-1:
141+
isPrimary: false
142+
timeLineID: 1
143+
cluster-example-3:
144+
isPrimary: false
145+
timeLineID: 1
146+
instancesStatus:
147+
healthy:
148+
- cluster-example-1
149+
- cluster-example-3
150+
latestGeneratedNode: 3
151+
managedRolesStatus: {}
152+
phase: Failing over
153+
phaseReason: Initiating a failover from cluster-example-2
154+
poolerIntegrations:
155+
pgBouncerIntegration: {}
156+
pvcCount: 3
157+
readService: cluster-example-r
158+
readyInstances: 2
159+
secretsResourceVersion:
160+
applicationSecretVersion: "12239"
161+
clientCaSecretVersion: "12236"
162+
replicationSecretVersion: "12238"
163+
serverCaSecretVersion: "12236"
164+
serverSecretVersion: "12237"
165+
switchReplicaClusterStatus: {}
166+
targetPrimary: pending
167+
targetPrimaryTimestamp: "2025-04-25T20:34:45.928098Z"
168+
timelineID: 2
169+
topology:
170+
instances:
171+
cluster-example-1: {}
172+
cluster-example-3: {}
173+
nodesUsed: 2
174+
successfullyExtracted: true
175+
writeService: cluster-example-rw

0 commit comments

Comments
 (0)