
Commit c086b73 (1 parent: d7364b4)

feat: Add health check for CephCluster CRD
File tree: 10 files changed, +325 −0 lines
Lines changed: 67 additions & 0 deletions

-- CRD documentation: https://rook.github.io/docs/rook/latest-release/CRDs/Cluster/ceph-cluster-crd/
local hs = {
  status = "Progressing",
  message = ""
}

-- Append a fragment to hs.message, joining fragments with " - ".
-- Defined before first use so the nil-status branch below can call it.
local function append_to_message(message)
  if message ~= "" then
    if hs.message ~= "" then
      hs.message = hs.message .. " - " .. message
    else
      hs.message = message
    end
  end
end

if obj.status == nil then
  append_to_message("Waiting for status to be reported")
  return hs
end

-- Check the main Ceph health status first - https://github.com/ceph/ceph/blob/v20.3.0/src/include/health.h#L12
if obj.status.ceph ~= nil and obj.status.ceph.health ~= nil then
  local ceph_health = obj.status.ceph.health
  local details_message = ""

  -- Build the details message from status.ceph.details if available.
  -- Keys are sorted first: pairs() iteration order is unspecified in Lua,
  -- and a stable order keeps the message (and the tests) deterministic.
  if obj.status.ceph.details ~= nil then
    local detail_keys = {}
    for detail_type in pairs(obj.status.ceph.details) do
      table.insert(detail_keys, detail_type)
    end
    table.sort(detail_keys)
    local detail_parts = {}
    for _, detail_type in ipairs(detail_keys) do
      local detail_info = obj.status.ceph.details[detail_type]
      if detail_info.message ~= nil then
        table.insert(detail_parts, detail_info.message)
      end
    end
    if #detail_parts > 0 then
      details_message = table.concat(detail_parts, "; ")
    end
  end

  if ceph_health == "HEALTH_ERR" or ceph_health == "HEALTH_WARN" then
    hs.status = "Degraded"
  elseif ceph_health == "HEALTH_OK" then
    hs.status = "Healthy"
  end
  append_to_message("Ceph health is " .. ceph_health)
  append_to_message(details_message)
end

-- Check state - https://github.com/rook/rook/blob/v1.17.7/pkg/apis/ceph.rook.io/v1/types.go#L621
-- Only let the cluster state refine a result that is otherwise Healthy;
-- a Degraded verdict from the Ceph health check above is never overridden.
if obj.status.state ~= nil then
  if hs.status == "Healthy" then
    append_to_message("Ceph cluster state is " .. obj.status.state)
    if obj.status.state == "Created" then
      hs.status = "Healthy"
    elseif obj.status.state == "Error" then
      hs.status = "Degraded"
    else
      hs.status = "Progressing"
    end
  end
end

if obj.status.message ~= nil then
  append_to_message(obj.status.message)
end

return hs
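For reference, a script like this can also be registered per Argo CD instance instead of shipping as a built-in check, using the documented `resource.customizations.health.<group>_<kind>` key. A minimal sketch via the argocd-cm ConfigMap, assuming the conventional argocd namespace; the Lua body is abbreviated here, and in practice the full health.lua above would go under the key:

apiVersion: v1
kind: ConfigMap
metadata:
  name: argocd-cm
  namespace: argocd
  labels:
    app.kubernetes.io/part-of: argocd
data:
  resource.customizations.health.ceph.rook.io_CephCluster: |
    -- Abbreviated for illustration; the full health.lua above goes here.
    local hs = { status = "Progressing", message = "" }
    if obj.status ~= nil and obj.status.ceph ~= nil and obj.status.ceph.health == "HEALTH_OK" then
      hs.status = "Healthy"
    end
    return hs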
Lines changed: 17 additions & 0 deletions

tests:
- healthStatus:
    status: Healthy
    message: 'Ceph health is HEALTH_OK - Ceph cluster state is Created - Cluster created successfully'
  inputPath: testdata/healthy.yaml
- healthStatus:
    status: Degraded
    message: 'Ceph health is HEALTH_WARN - 4 osds down; 2 pools degraded - Cluster has warnings'
  inputPath: testdata/degraded_warn.yaml
- healthStatus:
    status: Degraded
    message: 'Ceph health is HEALTH_ERR - 8 osds down - Cluster has critical errors'
  inputPath: testdata/degraded_error.yaml
- healthStatus:
    status: Progressing
    message: 'Cluster is being created'
  inputPath: testdata/state_creating.yaml
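One path the suite does not exercise is a CephCluster with no status at all (e.g. freshly applied, before Rook reports anything). A hypothetical extra case, assuming a fixture named testdata/no_status.yaml containing only apiVersion, kind, metadata, and spec, would pin down that branch:

# Hypothetical addition, not part of this commit: covers the obj.status == nil
# branch, which returns Progressing with a waiting message.
- healthStatus:
    status: Progressing
    message: 'Waiting for status to be reported'
  inputPath: testdata/no_status.yaml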
Lines changed: 54 additions & 0 deletions

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: test-ceph-cluster
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v19.2.0
  mon:
    count: 3
  mgr:
    count: 2
  storage:
    useAllNodes: true
    useAllDevices: false
status:
  ceph:
    health: HEALTH_WARN
    lastChecked: "2025-08-11T16:03:08Z"
    fsid: c121226d-cac9-492f-8b0b-c05693243380
    details:
      OSD_DOWN:
        message: 4 osds down
        severity: HEALTH_WARN
      PG_AVAILABILITY:
        message: 'Reduced data availability: 129 pgs inactive'
        severity: HEALTH_WARN
      PG_DEGRADED:
        message: 'Degraded data redundancy: 368/1110 objects degraded (33.153%), 26 pgs degraded, 185 pgs undersized'
        severity: HEALTH_WARN
    capacity:
      bytesAvailable: 35183103942656
      bytesTotal: 35184372088832
      bytesUsed: 1268146176
      lastUpdated: "2025-08-11T16:03:08Z"
  conditions:
  - lastHeartbeatTime: "2025-08-11T16:03:08Z"
    lastTransitionTime: "2025-08-11T16:03:08Z"
    message: Cluster has warnings
    reason: ClusterWarning
    status: "False"
    type: Ready
  - lastHeartbeatTime: "2025-08-11T16:03:08Z"
    lastTransitionTime: "2025-08-11T16:03:08Z"
    message: Some OSDs are down
    reason: OSDsDown
    status: "False"
    type: Progressing
  message: Cluster has warnings
  phase: Ready
  state: Created
  version:
    image: quay.io/ceph/ceph:v19.2.0
    version: 19.2.0-0
Lines changed: 17 additions & 0 deletions

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: test-ceph-cluster
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v19.2.0
status:
  ceph:
    health: HEALTH_ERR
    details:
      OSD_DOWN:
        message: 8 osds down
        severity: HEALTH_ERR
  state: Error
  message: Cluster has critical errors
Lines changed: 20 additions & 0 deletions

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: test-ceph-cluster
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v19.2.0
status:
  ceph:
    health: HEALTH_WARN
    details:
      OSD_DOWN:
        message: 4 osds down
        severity: HEALTH_WARN
      POOL_DEGRADED:
        message: 2 pools degraded
        severity: HEALTH_WARN
  state: Created
  message: Cluster has warnings
Lines changed: 51 additions & 0 deletions

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: test-ceph-cluster
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v19.2.0
  mon:
    count: 3
  mgr:
    count: 2
  storage:
    useAllNodes: true
    useAllDevices: false
status:
  ceph:
    health: HEALTH_ERR
    lastChecked: "2025-08-11T16:03:08Z"
    fsid: c121226d-cac9-492f-8b0b-c05693243380
    details:
      OSD_DOWN:
        message: 8 osds down
        severity: HEALTH_ERR
      PG_AVAILABILITY:
        message: 'Critical data availability: 256 pgs inactive'
        severity: HEALTH_ERR
      PG_DEGRADED:
        message: 'Severe data redundancy: 512/1110 objects degraded (46.126%), 52 pgs degraded, 256 pgs undersized'
        severity: HEALTH_ERR
      RECENT_CRASH:
        message: 12 daemons have recently crashed
        severity: HEALTH_ERR
    capacity:
      bytesAvailable: 35183103942656
      bytesTotal: 35184372088832
      bytesUsed: 1268146176
      lastUpdated: "2025-08-11T16:03:08Z"
  conditions:
  - lastHeartbeatTime: "2025-08-11T16:03:08Z"
    lastTransitionTime: "2025-08-11T16:03:08Z"
    message: Cluster has critical errors
    reason: ClusterError
    status: "False"
    type: Ready
  message: Cluster has critical errors
  phase: Failed
  state: Created
  version:
    image: quay.io/ceph/ceph:v19.2.0
    version: 19.2.0-0
Lines changed: 44 additions & 0 deletions

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: test-ceph-cluster
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v19.2.0
  mon:
    count: 3
  mgr:
    count: 2
  storage:
    useAllNodes: true
    useAllDevices: false
status:
  ceph:
    health: HEALTH_OK
    lastChecked: "2025-08-11T16:03:08Z"
    fsid: c121226d-cac9-492f-8b0b-c05693243380
    capacity:
      bytesAvailable: 35183103942656
      bytesTotal: 35184372088832
      bytesUsed: 1268146176
      lastUpdated: "2025-08-11T16:03:08Z"
  conditions:
  - lastHeartbeatTime: "2025-08-11T16:03:08Z"
    lastTransitionTime: "2025-08-11T16:03:08Z"
    message: Cluster created successfully
    reason: ClusterCreated
    status: "True"
    type: Ready
  - lastHeartbeatTime: "2025-08-11T16:03:08Z"
    lastTransitionTime: "2025-08-11T16:03:08Z"
    message: All OSDs are running
    reason: OSDsRunning
    status: "True"
    type: Progressing
  message: Cluster created successfully
  phase: Ready
  state: Created
  version:
    image: quay.io/ceph/ceph:v19.2.0
    version: 19.2.0-0
Lines changed: 1 addition & 0 deletions

(empty file — a single blank line added)
Lines changed: 43 additions & 0 deletions

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: test-ceph-cluster
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v19.2.0
  mon:
    count: 3
  mgr:
    count: 2
  storage:
    useAllNodes: true
    useAllDevices: false
status:
  ceph:
    health: HEALTH_WARN
    lastChecked: "2025-08-11T16:03:08Z"
    fsid: c121226d-cac9-492f-8b0b-c05693243380
    details:
      OSD_DOWN:
        message: 4 osds down
        severity: HEALTH_WARN
  conditions:
  - lastHeartbeatTime: "2025-08-11T16:03:08Z"
    lastTransitionTime: "2025-08-11T16:03:08Z"
    message: Processing OSD 9 on node "ip-100-65-83-47.us-west-2.compute.internal"
    reason: ClusterProgressing
    status: "True"
    type: Progressing
  - lastHeartbeatTime: "2025-08-11T16:03:08Z"
    lastTransitionTime: "2025-08-11T16:03:08Z"
    message: Cluster is not ready yet
    reason: ClusterNotReady
    status: "False"
    type: Ready
  message: Cluster is progressing
  phase: Progressing
  state: Creating
  version:
    image: quay.io/ceph/ceph:v19.2.0
    version: 19.2.0-0
Lines changed: 11 additions & 0 deletions

apiVersion: ceph.rook.io/v1
kind: CephCluster
metadata:
  name: test-ceph-cluster
  namespace: rook-ceph
spec:
  cephVersion:
    image: quay.io/ceph/ceph:v19.2.0
status:
  state: Creating
  message: Cluster is being created
