
Commit 4cd49d0

restore: restore CRD after accidental deletion

With these changes we'll be able to restore any CR that is stuck in deletion, due to some dependencies, after accidental deletion. Example: kubectl rook-ceph -n <ns> restore-deleted <cr> <cr_name>

Signed-off-by: subhamkrai <[email protected]>
1 parent bacdc39 commit 4cd49d0

12 files changed

Lines changed: 457 additions & 51 deletions


.github/workflows/go-test.yaml

Lines changed: 36 additions & 4 deletions
```diff
@@ -15,6 +15,8 @@ concurrency:
 jobs:
   default-namespace:
     runs-on: ubuntu-20.04
+    env:
+      ROOK_PLUGIN_SKIP_PROMPTS: true
     steps:
       - name: checkout
         uses: actions/checkout@v4

@@ -53,8 +55,6 @@ jobs:
           kubectl rook-ceph --context=$(kubectl config current-context) ceph status
 
       - name: Mon restore
-        env:
-          ROOK_PLUGIN_SKIP_PROMPTS: true
         run: |
           set -ex
           # test the mon restore to restore to mon a, delete mons b and c, then add d and e

@@ -112,6 +112,22 @@ jobs:
           kubectl -n rook-ceph scale deployment rook-ceph-osd-0 --replicas 0
           kubectl rook-ceph rook purge-osd 0 --force
 
+      - name: Restore CRD without CRName
+        run: |
+          # First let's delete the cephCluster
+          kubectl -n rook-ceph delete cephcluster my-cluster --timeout 3s --wait=false
+
+          kubectl rook-ceph -n rook-ceph restore-deleted cephcluster
+          tests/github-action-helper.sh wait_for_crd_to_be_ready_default
+
+      - name: Restore CRD with CRName
+        run: |
+          # First let's delete the cephCluster
+          kubectl -n rook-ceph delete cephcluster my-cluster --timeout 3s --wait=false
+
+          kubectl rook-ceph -n rook-ceph restore-deleted cephcluster my-cluster
+          tests/github-action-helper.sh wait_for_crd_to_be_ready_default
+
       - name: collect common logs
         if: always()
         uses: ./.github/workflows/collect-logs

@@ -126,6 +142,8 @@ jobs:
 
   custom-namespace:
     runs-on: ubuntu-20.04
+    env:
+      ROOK_PLUGIN_SKIP_PROMPTS: true
     steps:
       - name: checkout
         uses: actions/checkout@v4

@@ -166,8 +184,6 @@ jobs:
           kubectl rook-ceph --operator-namespace test-operator -n test-cluster --context=$(kubectl config current-context) ceph status
 
       - name: Mon restore
-        env:
-          ROOK_PLUGIN_SKIP_PROMPTS: true
         run: |
           set -ex
           # test the mon restore to restore to mon a, delete mons b and c, then add d and e

@@ -225,6 +241,22 @@ jobs:
           kubectl -n test-cluster scale deployment rook-ceph-osd-0 --replicas 0
           kubectl rook-ceph --operator-namespace test-operator -n test-cluster rook purge-osd 0 --force
 
+      - name: Restore CRD without CRName
+        run: |
+          # First let's delete the cephCluster
+          kubectl -n test-cluster delete cephcluster my-cluster --timeout 3s --wait=false
+
+          kubectl rook-ceph --operator-namespace test-operator -n test-cluster restore-deleted cephcluster
+          tests/github-action-helper.sh wait_for_crd_to_be_ready_custom
+
+      - name: Restore CRD with CRName
+        run: |
+          # First let's delete the cephCluster
+          kubectl -n test-cluster delete cephcluster my-cluster --timeout 3s --wait=false
+
+          kubectl rook-ceph --operator-namespace test-operator -n test-cluster restore-deleted cephcluster my-cluster
+          tests/github-action-helper.sh wait_for_crd_to_be_ready_custom
+
       - name: collect common logs
         if: always()
         uses: ./.github/workflows/collect-logs
```

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -101,6 +101,7 @@ Visit docs below for complete details about each command and their flags uses.
 1. [Debug OSDs and Mons](docs/debug.md)
 1. [Restore mon quorum](docs/mons.md#restore-quorum)
 1. [Disaster Recovery](docs/dr-health.md)
+1. [Restore deleted CRs](docs/crd.md)
 
 ## Examples
 
```
cmd/commands/restore.go

Lines changed: 34 additions & 0 deletions
```go
/*
Copyright 2023 The Rook Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package command

import (
	"github.com/rook/kubectl-rook-ceph/pkg/restore"
	"github.com/spf13/cobra"
)

// RestoreCmd represents the restore-deleted command
var RestoreCmd = &cobra.Command{
	Use:   "restore-deleted",
	Short: "Restores a CR that was accidentally deleted and is still in terminating state. Ex: restore-deleted cephcluster <my-cluster>",
	Args:  cobra.MinimumNArgs(1),
	Run: func(cmd *cobra.Command, args []string) {
		clientsets := GetClientsets(cmd.Context())
		VerifyOperatorPodIsRunning(cmd.Context(), clientsets, OperatorNamespace, CephClusterNamespace)
		// args[0] is the CRD type; an optional args[1] selects a specific CR.
		restore.RestoreCrd(cmd.Context(), clientsets, OperatorNamespace, CephClusterNamespace, args)
	},
}
```

cmd/main.go

Lines changed: 1 addition & 0 deletions
```diff
@@ -38,5 +38,6 @@ func addcommands() {
 		command.DebugCmd,
 		command.Health,
 		command.DrCmd,
+		command.RestoreCmd,
 	)
 }
```

docs/crd.md

Lines changed: 87 additions & 0 deletions
# Restoring Deleted CRs

When a Rook CR is deleted, the Rook operator responds to the deletion event and attempts to clean up the cluster resources. If any data is still present in the cluster, Rook refuses to delete the CR so that data is not lost: the operator will not remove the finalizer on the CR until the underlying data is deleted.

While the underlying Ceph data and daemons remain available, the CRs are stuck indefinitely in a Deleting state, in which the operator no longer ensures cluster health. Upgrades are blocked, and further updates to the CRs are prevented. Since Kubernetes does not allow undeleting resources, the command below repairs the CRs without necessarily incurring any cluster downtime.

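The plugin automates every repair step shown below; the sketch that follows is only to illustrate the central trick. It clears `metadata.finalizers` on a stuck CR with client-go's dynamic client, assuming a CephCluster named `my-cluster` in the `rook-ceph` namespace. It is not the plugin's actual implementation, and clearing finalizers by hand is only safe after the CR has been backed up, because deletion completes immediately.

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	dyn, err := dynamic.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	// GVR of the CephCluster custom resource.
	gvr := schema.GroupVersionResource{Group: "ceph.rook.io", Version: "v1", Resource: "cephclusters"}

	// Null out metadata.finalizers so the API server can complete the pending
	// delete. The operator must be scaled down and the CR backed up first,
	// because the deletion finishes the moment this patch lands.
	patch := []byte(`{"metadata":{"finalizers":null}}`)
	if _, err := dyn.Resource(gvr).Namespace("rook-ceph").
		Patch(context.TODO(), "my-cluster", types.MergePatchType, patch, metav1.PatchOptions{}); err != nil {
		panic(err)
	}
	fmt.Println("cephcluster.ceph.rook.io/my-cluster patched")
}
```
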
## Restore Command

- `<CRD>`: the CRD type to restore, such as CephCluster, CephFilesystem, CephBlockPool, and so on.
- `[CRName]`: the name of the specific CR to restore, since there can be multiple instances under the same CRD. For example, if multiple CephFilesystems are stuck in a deleting state, a specific filesystem can be restored: `restore-deleted cephfilesystem filesystem-2`.

```bash
kubectl rook-ceph restore-deleted <CRD> [CRName]

Info: Detecting which resources to restore for crd "cephcluster"
Info: Restoring CR my-cluster
Warning: The resource my-cluster was found deleted. Do you want to restore it? yes | no

Info: skipped prompt since ROOK_PLUGIN_SKIP_PROMPTS=true
Info: Scaling down the operator to 0
Info: Backing up kubernetes and crd resources
---
---
---
Info: Removing owner references for service rook-ceph-mgr
Info: Removed ownerReference for service: rook-ceph-mgr

Info: Removing owner references for service rook-ceph-mgr-dashboard
Info: Removed ownerReference for service: rook-ceph-mgr-dashboard

Info: Removing owner references for service rook-ceph-mon-a
Info: Removed ownerReference for service: rook-ceph-mon-a
---
---
---
Info: Removing finalizers from cephcluster/my-cluster
Info: cephcluster.ceph.rook.io/my-cluster patched

Info: Re-creating the CR cephcluster from file cephcluster-my-cluster.yaml created above
Info: cephcluster.ceph.rook.io/my-cluster created

Info: Scaling up the operator to 1
Info: CR is successfully restored. Please watch the operator logs and check the crd
```

## CephCluster Restore Example

```bash
kubectl rook-ceph restore-deleted cephcluster [cephClusterName]

Info: Detecting which resources to restore for crd "cephcluster"
Info: Restoring CR my-cluster
Warning: The resource my-cluster was found deleted. Do you want to restore it? yes | no

Info: skipped prompt since ROOK_PLUGIN_SKIP_PROMPTS=true
Info: Scaling down the operator to 0
Info: Backing up kubernetes and crd resources
Info: Backed up crd cephcluster/my-cluster in file cephcluster-my-cluster.yaml
Info: Deleting validating webhook rook-ceph-webhook if present
Info: Fetching the UID for cephcluster/my-cluster
Info: Successfully fetched uid 9575a4c9-7d0f-4b98-93d5-0b7dab7efeb4 from cephcluster/my-cluster
Info: Removing ownerreferences from resources with matching uid 9575a4c9-7d0f-4b98-93d5-0b7dab7efeb4
Info: Removing owner references for secret cluster-peer-token-my-cluster
Info: Removed ownerReference for Secret: cluster-peer-token-my-cluster

Info: Removing owner references for secret rook-ceph-admin-keyring
Info: Removed ownerReference for Secret: rook-ceph-admin-keyring
---
---
---
Info: Removing owner references for service rook-ceph-mgr-dashboard
Info: Removed ownerReference for service: rook-ceph-mgr-dashboard

Info: Removing owner references for service rook-ceph-mon-a
Info: Removed ownerReference for service: rook-ceph-mon-a
---
---
---
Info: Removing finalizers from cephcluster/my-cluster
Info: cephcluster.ceph.rook.io/my-cluster patched

Info: Re-creating the CR cephcluster from file cephcluster-my-cluster.yaml created above
Info: cephcluster.ceph.rook.io/my-cluster created

Info: Scaling up the operator to 1
Info: CR is successfully restored. Please watch the operator logs and check the crd
```
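
The `Removing ownerreferences from resources with matching uid ...` lines above correspond to stripping the deleted CR's UID out of the `ownerReferences` of dependent objects, so they are not garbage-collected and can be re-adopted by the re-created CR. A rough client-go sketch, illustrative only: `removeOwnerRefsFromSecrets` is a hypothetical helper covering Secrets alone, while the plugin also walks Services and other dependents.

```go
package restoreutil

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)

// removeOwnerRefsFromSecrets drops every ownerReference whose UID matches the
// deleted CR from all Secrets in the namespace, updating only changed objects.
func removeOwnerRefsFromSecrets(ctx context.Context, cs kubernetes.Interface, namespace string, uid types.UID) error {
	secrets, err := cs.CoreV1().Secrets(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}
	for i := range secrets.Items {
		secret := &secrets.Items[i]
		kept := secret.OwnerReferences[:0] // filter in place
		changed := false
		for _, ref := range secret.OwnerReferences {
			if ref.UID == uid {
				changed = true // drop the reference to the deleted CR
				continue
			}
			kept = append(kept, ref)
		}
		if !changed {
			continue
		}
		secret.OwnerReferences = kept
		if _, err := cs.CoreV1().Secrets(namespace).Update(ctx, secret, metav1.UpdateOptions{}); err != nil {
			return err
		}
	}
	return nil
}
```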

pkg/debug/start_debug.go

Lines changed: 3 additions & 32 deletions
```diff
@@ -25,7 +25,6 @@ import (
 	"github.com/rook/kubectl-rook-ceph/pkg/logging"
 
 	appsv1 "k8s.io/api/apps/v1"
-	autoscalingv1 "k8s.io/api/autoscaling/v1"
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"

@@ -39,7 +38,7 @@ func StartDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterN
 }
 
 func startDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterNamespace, deploymentName, alternateImageValue string) error {
-	originalDeployment, err := GetDeployment(ctx, k8sclientset, clusterNamespace, deploymentName)
+	originalDeployment, err := k8sutil.GetDeployment(ctx, k8sclientset, clusterNamespace, deploymentName)
 	if err != nil {
 		return fmt.Errorf("Missing mon or osd deployment name %s. %v\n", deploymentName, err)
 	}

@@ -69,7 +68,7 @@ func startDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterN
 		return err
 	}
 
-	if err := SetDeploymentScale(ctx, k8sclientset, clusterNamespace, deployment.Name, 0); err != nil {
+	if err := k8sutil.SetDeploymentScale(ctx, k8sclientset, clusterNamespace, deployment.Name, 0); err != nil {
 		return err
 	}
 
@@ -96,7 +95,7 @@ func startDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterN
 	}
 	logging.Info("ensure the debug deployment %s is scaled up\n", deploymentName)
 
-	if err := SetDeploymentScale(ctx, k8sclientset, clusterNamespace, debugDeployment.Name, 1); err != nil {
+	if err := k8sutil.SetDeploymentScale(ctx, k8sclientset, clusterNamespace, debugDeployment.Name, 1); err != nil {
 		return err
 	}
 
@@ -109,34 +108,6 @@ func startDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterN
 	return nil
 }
 
-func SetDeploymentScale(ctx context.Context, k8sclientset kubernetes.Interface, namespace, deploymentName string, scaleCount int) error {
-	scale := &autoscalingv1.Scale{
-		ObjectMeta: v1.ObjectMeta{
-			Name:      deploymentName,
-			Namespace: namespace,
-		},
-		Spec: autoscalingv1.ScaleSpec{
-			Replicas: int32(scaleCount),
-		},
-	}
-	_, err := k8sclientset.AppsV1().Deployments(namespace).UpdateScale(ctx, deploymentName, scale, v1.UpdateOptions{})
-	if err != nil {
-		return fmt.Errorf("failed to update scale of deployment %s. %v\n", deploymentName, err)
-	}
-	return nil
-}
-
-func GetDeployment(ctx context.Context, k8sclientset kubernetes.Interface, clusterNamespace, deploymentName string) (*appsv1.Deployment, error) {
-	logging.Info("fetching the deployment %s to be running\n", deploymentName)
-	deployment, err := k8sclientset.AppsV1().Deployments(clusterNamespace).Get(ctx, deploymentName, v1.GetOptions{})
-	if err != nil {
-		return nil, err
-	}
-
-	logging.Info("deployment %s exists\n", deploymentName)
-	return deployment, nil
-}
-
 func waitForPodDeletion(ctx context.Context, k8sclientset kubernetes.Interface, clusterNamespace, podName string) error {
 	for i := 0; i < 60; i++ {
 		_, err := k8sclientset.CoreV1().Pods(clusterNamespace).Get(ctx, podName, v1.GetOptions{})
```

pkg/debug/stop_debug.go

Lines changed: 3 additions & 2 deletions
```diff
@@ -21,6 +21,7 @@ import (
 	"fmt"
 	"strings"
 
+	"github.com/rook/kubectl-rook-ceph/pkg/k8sutil"
 	"github.com/rook/kubectl-rook-ceph/pkg/logging"
 
 	kerrors "k8s.io/apimachinery/pkg/api/errors"

@@ -41,7 +42,7 @@ func stopDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterNa
 		deploymentName = deploymentName + "-debug"
 	}
 
-	debugDeployment, err := GetDeployment(ctx, k8sclientset, clusterNamespace, deploymentName)
+	debugDeployment, err := k8sutil.GetDeployment(ctx, k8sclientset, clusterNamespace, deploymentName)
 	if err != nil {
 		return fmt.Errorf("Missing mon or osd debug deployment name %s. %v\n", deploymentName, err)
 	}

@@ -53,7 +54,7 @@ func stopDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterNa
 	}
 
 	original_deployment_name := strings.ReplaceAll(deploymentName, "-debug", "")
-	if err := SetDeploymentScale(ctx, k8sclientset, clusterNamespace, original_deployment_name, 1); err != nil {
+	if err := k8sutil.SetDeploymentScale(ctx, k8sclientset, clusterNamespace, original_deployment_name, 1); err != nil {
 		return err
 	}
 	logging.Info("Successfully deleted debug deployment and restored deployment %q", original_deployment_name)
```

pkg/exec/bash.go

Lines changed: 2 additions & 3 deletions
```diff
@@ -17,19 +17,18 @@ limitations under the License.
 package exec
 
 import (
+	"os"
 	"os/exec"
 
 	"github.com/rook/kubectl-rook-ceph/pkg/logging"
 )
 
 func ExecuteBashCommand(command string) string {
 	cmd := exec.Command("/bin/bash",
-		"-x", // Print commands and their arguments as they are executed
-		"-e", // Exit immediately if a command exits with a non-zero status.
-		"-m", // Terminal job control, allows job to be terminated by SIGTERM
 		"-c", // Command to run
 		command,
 	)
+	cmd.Stderr = os.Stderr
 	stdout, err := cmd.Output()
 	if err != nil {
 		logging.Fatal(err)
```
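
Dropping `-x`, `-e`, and `-m` and attaching `cmd.Stderr` to the terminal means the bash trace no longer leaks into the captured stdout, and error output from the invoked tools is shown to the user as it happens — presumably the motivation for this change. A hypothetical caller (the kubectl invocation is just an example):

```go
package main

import (
	"fmt"

	"github.com/rook/kubectl-rook-ceph/pkg/exec"
)

func main() {
	// Stdout is returned for parsing; any stderr output from kubectl now
	// streams directly to the terminal instead of being swallowed.
	out := exec.ExecuteBashCommand("kubectl -n rook-ceph get cephcluster -o name")
	fmt.Println(out)
}
```
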
pkg/k8sutil/k8sutil.go

Lines changed: 30 additions & 0 deletions
```diff
@@ -23,6 +23,8 @@ import (
 
 	"github.com/rook/kubectl-rook-ceph/pkg/logging"
 
+	appsv1 "k8s.io/api/apps/v1"
+	autoscalingv1 "k8s.io/api/autoscaling/v1"
 	corev1 "k8s.io/api/core/v1"
 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"

@@ -74,3 +76,31 @@ func UpdateConfigMap(ctx context.Context, k8sclientset kubernetes.Interface, nam
 
 	logging.Info("configmap/%s patched\n", configMapName)
 }
+
+func SetDeploymentScale(ctx context.Context, k8sclientset kubernetes.Interface, namespace, deploymentName string, scaleCount int) error {
+	scale := &autoscalingv1.Scale{
+		ObjectMeta: v1.ObjectMeta{
+			Name:      deploymentName,
+			Namespace: namespace,
+		},
+		Spec: autoscalingv1.ScaleSpec{
+			Replicas: int32(scaleCount),
+		},
+	}
+	_, err := k8sclientset.AppsV1().Deployments(namespace).UpdateScale(ctx, deploymentName, scale, v1.UpdateOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to update scale of deployment %s. %v\n", deploymentName, err)
+	}
+	return nil
+}
+
+func GetDeployment(ctx context.Context, k8sclientset kubernetes.Interface, clusterNamespace, deploymentName string) (*appsv1.Deployment, error) {
+	logging.Info("fetching the deployment %s to be running\n", deploymentName)
+	deployment, err := k8sclientset.AppsV1().Deployments(clusterNamespace).Get(ctx, deploymentName, v1.GetOptions{})
+	if err != nil {
+		return nil, err
+	}
+
+	logging.Info("deployment %s exists\n", deploymentName)
+	return deployment, nil
+}
```
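
With `GetDeployment` and `SetDeploymentScale` now shared in `k8sutil`, the debug and restore code paths can scale deployments the same way. A minimal sketch of a caller, assuming the conventional `rook-ceph-operator` deployment name; `pauseOperator` is a hypothetical helper, not code from this commit:

```go
package restoreutil

import (
	"context"

	"github.com/rook/kubectl-rook-ceph/pkg/k8sutil"
	"k8s.io/client-go/kubernetes"
)

// pauseOperator scales the Rook operator to zero replicas so it cannot race
// against a manual CR repair, reusing the shared k8sutil helpers above.
func pauseOperator(ctx context.Context, clientset kubernetes.Interface, operatorNamespace string) error {
	// Confirm the deployment exists before touching its scale subresource.
	if _, err := k8sutil.GetDeployment(ctx, clientset, operatorNamespace, "rook-ceph-operator"); err != nil {
		return err
	}
	return k8sutil.SetDeploymentScale(ctx, clientset, operatorNamespace, "rook-ceph-operator", 0)
}
```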
