
Commit 4cd49d0

restore: restore CRD after accidental deletion

With these changes we'll be able to restore any CR that is stuck in deletion, due to some dependencies, after accidental deletion. Example: kubectl rook-ceph -n <ns> restore-deleted <cr> <cr_name>

Signed-off-by: subhamkrai <[email protected]>
1 parent bacdc39 commit 4cd49d0

12 files changed

Lines changed: 457 additions & 51 deletions


.github/workflows/go-test.yaml

Lines changed: 36 additions & 4 deletions
```diff
@@ -15,6 +15,8 @@ concurrency:
 jobs:
   default-namespace:
     runs-on: ubuntu-20.04
+    env:
+      ROOK_PLUGIN_SKIP_PROMPTS: true
     steps:
       - name: checkout
         uses: actions/checkout@v4

@@ -53,8 +55,6 @@ jobs:
           kubectl rook-ceph --context=$(kubectl config current-context) ceph status
 
       - name: Mon restore
-        env:
-          ROOK_PLUGIN_SKIP_PROMPTS: true
         run: |
           set -ex
           # test the mon restore to restore to mon a, delete mons b and c, then add d and e

@@ -112,6 +112,22 @@ jobs:
           kubectl -n rook-ceph scale deployment rook-ceph-osd-0 --replicas 0
           kubectl rook-ceph rook purge-osd 0 --force
 
+      - name: Restore CRD without CRName
+        run: |
+          # First let's delete the cephCluster
+          kubectl -n rook-ceph delete cephcluster my-cluster --timeout 3s --wait=false
+
+          kubectl rook-ceph -n rook-ceph restore-deleted cephcluster
+          tests/github-action-helper.sh wait_for_crd_to_be_ready_default
+
+      - name: Restore CRD with CRName
+        run: |
+          # First let's delete the cephCluster
+          kubectl -n rook-ceph delete cephcluster my-cluster --timeout 3s --wait=false
+
+          kubectl rook-ceph -n rook-ceph restore-deleted cephcluster my-cluster
+          tests/github-action-helper.sh wait_for_crd_to_be_ready_default
+
       - name: collect common logs
         if: always()
         uses: ./.github/workflows/collect-logs

@@ -126,6 +142,8 @@ jobs:
 
   custom-namespace:
     runs-on: ubuntu-20.04
+    env:
+      ROOK_PLUGIN_SKIP_PROMPTS: true
     steps:
       - name: checkout
         uses: actions/checkout@v4

@@ -166,8 +184,6 @@ jobs:
           kubectl rook-ceph --operator-namespace test-operator -n test-cluster --context=$(kubectl config current-context) ceph status
 
       - name: Mon restore
-        env:
-          ROOK_PLUGIN_SKIP_PROMPTS: true
         run: |
           set -ex
           # test the mon restore to restore to mon a, delete mons b and c, then add d and e

@@ -225,6 +241,22 @@ jobs:
           kubectl -n test-cluster scale deployment rook-ceph-osd-0 --replicas 0
           kubectl rook-ceph --operator-namespace test-operator -n test-cluster rook purge-osd 0 --force
 
+      - name: Restore CRD without CRName
+        run: |
+          # First let's delete the cephCluster
+          kubectl -n test-cluster delete cephcluster my-cluster --timeout 3s --wait=false
+
+          kubectl rook-ceph --operator-namespace test-operator -n test-cluster restore-deleted cephcluster
+          tests/github-action-helper.sh wait_for_crd_to_be_ready_custom
+
+      - name: Restore CRD with CRName
+        run: |
+          # First let's delete the cephCluster
+          kubectl -n test-cluster delete cephcluster my-cluster --timeout 3s --wait=false
+
+          kubectl rook-ceph --operator-namespace test-operator -n test-cluster restore-deleted cephcluster my-cluster
+          tests/github-action-helper.sh wait_for_crd_to_be_ready_custom
+
       - name: collect common logs
         if: always()
         uses: ./.github/workflows/collect-logs
```

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -101,6 +101,7 @@ Visit docs below for complete details about each command and their flags uses.
 1. [Debug OSDs and Mons](docs/debug.md)
 1. [Restore mon quorum](docs/mons.md#restore-quorum)
 1. [Disaster Recovery](docs/dr-health.md)
+1. [Restore deleted CRs](docs/crd.md)
 
 ## Examples
 
```
cmd/commands/restore.go

Lines changed: 34 additions & 0 deletions
```go
/*
Copyright 2023 The Rook Authors. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package command

import (
	"github.com/rook/kubectl-rook-ceph/pkg/restore"
	"github.com/spf13/cobra"
)

// RestoreCmd represents the restore-deleted command
var RestoreCmd = &cobra.Command{
	Use:   "restore-deleted",
	Short: "Restores a CR that was accidentally deleted and is still in terminating state. Ex: restore-deleted cephcluster <my-cluster>",
	Args:  cobra.MinimumNArgs(1),
	Run: func(cmd *cobra.Command, args []string) {
		clientsets := GetClientsets(cmd.Context())
		VerifyOperatorPodIsRunning(cmd.Context(), clientsets, OperatorNamespace, CephClusterNamespace)
		// args[0] is the CRD type; an optional args[1] selects a specific CR.
		restore.RestoreCrd(cmd.Context(), clientsets, OperatorNamespace, CephClusterNamespace, args)
	},
}
```

cmd/main.go

Lines changed: 1 addition & 0 deletions
```diff
@@ -38,5 +38,6 @@ func addcommands() {
 		command.DebugCmd,
 		command.Health,
 		command.DrCmd,
+		command.RestoreCmd,
 	)
 }
```

docs/crd.md

Lines changed: 87 additions & 0 deletions
# Restoring Deleted CRs

When a Rook CR is deleted, the Rook operator responds to the deletion event and attempts to clean up the cluster resources. If any data is still present in the cluster, Rook refuses to delete the CR so that data is not lost: the operator will not remove the finalizer on the CR until the underlying data is deleted.

While the underlying Ceph data and daemons remain available, the CRs are stuck indefinitely in a Deleting state, in which the operator no longer ensures cluster health. Upgrades are blocked, and further updates to the CRs are prevented. Since Kubernetes does not allow undeleting resources, the command below repairs the CRs without necessarily incurring any cluster downtime.

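The plugin automates every repair step shown below; the sketch that follows is only to illustrate the central trick. It clears `metadata.finalizers` on a stuck CR with client-go's dynamic client, assuming a CephCluster named `my-cluster` in the `rook-ceph` namespace. It is not the plugin's actual implementation, and clearing finalizers by hand is only safe after the CR has been backed up, because deletion completes immediately.

```go
package main

import (
	"context"
	"fmt"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime/schema"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/dynamic"
	"k8s.io/client-go/tools/clientcmd"
)

func main() {
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	dyn, err := dynamic.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}

	// GVR of the CephCluster custom resource.
	gvr := schema.GroupVersionResource{Group: "ceph.rook.io", Version: "v1", Resource: "cephclusters"}

	// Null out metadata.finalizers so the API server can complete the pending
	// delete. The operator must be scaled down and the CR backed up first,
	// because the deletion finishes the moment this patch lands.
	patch := []byte(`{"metadata":{"finalizers":null}}`)
	if _, err := dyn.Resource(gvr).Namespace("rook-ceph").
		Patch(context.TODO(), "my-cluster", types.MergePatchType, patch, metav1.PatchOptions{}); err != nil {
		panic(err)
	}
	fmt.Println("cephcluster.ceph.rook.io/my-cluster patched")
}
```
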
## Restore Command

- `<CRD>`: the CRD type to restore, such as CephCluster, CephFilesystem, CephBlockPool, and so on.
- `[CRName]`: the name of the specific CR to restore, since there can be multiple instances under the same CRD. For example, if multiple CephFilesystems are stuck in a deleting state, a specific filesystem can be restored: `restore-deleted cephfilesystem filesystem-2`.

```bash
kubectl rook-ceph restore-deleted <CRD> [CRName]

Info: Detecting which resources to restore for crd "cephcluster"
Info: Restoring CR my-cluster
Warning: The resource my-cluster was found deleted. Do you want to restore it? yes | no

Info: skipped prompt since ROOK_PLUGIN_SKIP_PROMPTS=true
Info: Scaling down the operator to 0
Info: Backing up kubernetes and crd resources
---
---
---
Info: Removing owner references for service rook-ceph-mgr
Info: Removed ownerReference for service: rook-ceph-mgr

Info: Removing owner references for service rook-ceph-mgr-dashboard
Info: Removed ownerReference for service: rook-ceph-mgr-dashboard

Info: Removing owner references for service rook-ceph-mon-a
Info: Removed ownerReference for service: rook-ceph-mon-a
---
---
---
Info: Removing finalizers from cephcluster/my-cluster
Info: cephcluster.ceph.rook.io/my-cluster patched

Info: Re-creating the CR cephcluster from file cephcluster-my-cluster.yaml created above
Info: cephcluster.ceph.rook.io/my-cluster created

Info: Scaling up the operator to 1
Info: CR is successfully restored. Please watch the operator logs and check the crd
```

## CephCluster Restore Example

```bash
kubectl rook-ceph restore-deleted cephcluster [cephClusterName]

Info: Detecting which resources to restore for crd "cephcluster"
Info: Restoring CR my-cluster
Warning: The resource my-cluster was found deleted. Do you want to restore it? yes | no

Info: skipped prompt since ROOK_PLUGIN_SKIP_PROMPTS=true
Info: Scaling down the operator to 0
Info: Backing up kubernetes and crd resources
Info: Backed up crd cephcluster/my-cluster in file cephcluster-my-cluster.yaml
Info: Deleting validating webhook rook-ceph-webhook if present
Info: Fetching the UID for cephcluster/my-cluster
Info: Successfully fetched uid 9575a4c9-7d0f-4b98-93d5-0b7dab7efeb4 from cephcluster/my-cluster
Info: Removing ownerreferences from resources with matching uid 9575a4c9-7d0f-4b98-93d5-0b7dab7efeb4
Info: Removing owner references for secret cluster-peer-token-my-cluster
Info: Removed ownerReference for Secret: cluster-peer-token-my-cluster

Info: Removing owner references for secret rook-ceph-admin-keyring
Info: Removed ownerReference for Secret: rook-ceph-admin-keyring
---
---
---
Info: Removing owner references for service rook-ceph-mgr-dashboard
Info: Removed ownerReference for service: rook-ceph-mgr-dashboard

Info: Removing owner references for service rook-ceph-mon-a
Info: Removed ownerReference for service: rook-ceph-mon-a
---
---
---
Info: Removing finalizers from cephcluster/my-cluster
Info: cephcluster.ceph.rook.io/my-cluster patched

Info: Re-creating the CR cephcluster from file cephcluster-my-cluster.yaml created above
Info: cephcluster.ceph.rook.io/my-cluster created

Info: Scaling up the operator to 1
Info: CR is successfully restored. Please watch the operator logs and check the crd
```
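
The `Removing ownerreferences from resources with matching uid ...` lines above correspond to stripping the deleted CR's UID out of the `ownerReferences` of dependent objects, so they are not garbage-collected and can be re-adopted by the re-created CR. A rough client-go sketch, illustrative only: `removeOwnerRefsFromSecrets` is a hypothetical helper covering Secrets alone, while the plugin also walks Services and other dependents.

```go
package restoreutil

import (
	"context"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/kubernetes"
)

// removeOwnerRefsFromSecrets drops every ownerReference whose UID matches the
// deleted CR from all Secrets in the namespace, updating only changed objects.
func removeOwnerRefsFromSecrets(ctx context.Context, cs kubernetes.Interface, namespace string, uid types.UID) error {
	secrets, err := cs.CoreV1().Secrets(namespace).List(ctx, metav1.ListOptions{})
	if err != nil {
		return err
	}
	for i := range secrets.Items {
		secret := &secrets.Items[i]
		kept := secret.OwnerReferences[:0] // filter in place
		changed := false
		for _, ref := range secret.OwnerReferences {
			if ref.UID == uid {
				changed = true // drop the reference to the deleted CR
				continue
			}
			kept = append(kept, ref)
		}
		if !changed {
			continue
		}
		secret.OwnerReferences = kept
		if _, err := cs.CoreV1().Secrets(namespace).Update(ctx, secret, metav1.UpdateOptions{}); err != nil {
			return err
		}
	}
	return nil
}
```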

pkg/debug/start_debug.go

Lines changed: 3 additions & 32 deletions
```diff
@@ -25,7 +25,6 @@ import (
 	"github.com/rook/kubectl-rook-ceph/pkg/logging"
 
 	appsv1 "k8s.io/api/apps/v1"
-	autoscalingv1 "k8s.io/api/autoscaling/v1"
 	kerrors "k8s.io/apimachinery/pkg/api/errors"
 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/client-go/kubernetes"

@@ -39,7 +38,7 @@ func StartDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterN
 }
 
 func startDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterNamespace, deploymentName, alternateImageValue string) error {
-	originalDeployment, err := GetDeployment(ctx, k8sclientset, clusterNamespace, deploymentName)
+	originalDeployment, err := k8sutil.GetDeployment(ctx, k8sclientset, clusterNamespace, deploymentName)
 	if err != nil {
 		return fmt.Errorf("Missing mon or osd deployment name %s. %v\n", deploymentName, err)
 	}

@@ -69,7 +68,7 @@ func startDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterN
 		return err
 	}
 
-	if err := SetDeploymentScale(ctx, k8sclientset, clusterNamespace, deployment.Name, 0); err != nil {
+	if err := k8sutil.SetDeploymentScale(ctx, k8sclientset, clusterNamespace, deployment.Name, 0); err != nil {
 		return err
 	}
 
@@ -96,7 +95,7 @@ func startDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterN
 	}
 	logging.Info("ensure the debug deployment %s is scaled up\n", deploymentName)
 
-	if err := SetDeploymentScale(ctx, k8sclientset, clusterNamespace, debugDeployment.Name, 1); err != nil {
+	if err := k8sutil.SetDeploymentScale(ctx, k8sclientset, clusterNamespace, debugDeployment.Name, 1); err != nil {
 		return err
 	}
 
@@ -109,34 +108,6 @@ func startDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterN
 	return nil
 }
 
-func SetDeploymentScale(ctx context.Context, k8sclientset kubernetes.Interface, namespace, deploymentName string, scaleCount int) error {
-	scale := &autoscalingv1.Scale{
-		ObjectMeta: v1.ObjectMeta{
-			Name:      deploymentName,
-			Namespace: namespace,
-		},
-		Spec: autoscalingv1.ScaleSpec{
-			Replicas: int32(scaleCount),
-		},
-	}
-	_, err := k8sclientset.AppsV1().Deployments(namespace).UpdateScale(ctx, deploymentName, scale, v1.UpdateOptions{})
-	if err != nil {
-		return fmt.Errorf("failed to update scale of deployment %s. %v\n", deploymentName, err)
-	}
-	return nil
-}
-
-func GetDeployment(ctx context.Context, k8sclientset kubernetes.Interface, clusterNamespace, deploymentName string) (*appsv1.Deployment, error) {
-	logging.Info("fetching the deployment %s to be running\n", deploymentName)
-	deployment, err := k8sclientset.AppsV1().Deployments(clusterNamespace).Get(ctx, deploymentName, v1.GetOptions{})
-	if err != nil {
-		return nil, err
-	}
-
-	logging.Info("deployment %s exists\n", deploymentName)
-	return deployment, nil
-}
-
 func waitForPodDeletion(ctx context.Context, k8sclientset kubernetes.Interface, clusterNamespace, podName string) error {
 	for i := 0; i < 60; i++ {
 		_, err := k8sclientset.CoreV1().Pods(clusterNamespace).Get(ctx, podName, v1.GetOptions{})
```

pkg/debug/stop_debug.go

Lines changed: 3 additions & 2 deletions
```diff
@@ -21,6 +21,7 @@ import (
 	"fmt"
 	"strings"
 
+	"github.com/rook/kubectl-rook-ceph/pkg/k8sutil"
 	"github.com/rook/kubectl-rook-ceph/pkg/logging"
 
 	kerrors "k8s.io/apimachinery/pkg/api/errors"

@@ -41,7 +42,7 @@ func stopDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterNa
 		deploymentName = deploymentName + "-debug"
 	}
 
-	debugDeployment, err := GetDeployment(ctx, k8sclientset, clusterNamespace, deploymentName)
+	debugDeployment, err := k8sutil.GetDeployment(ctx, k8sclientset, clusterNamespace, deploymentName)
 	if err != nil {
 		return fmt.Errorf("Missing mon or osd debug deployment name %s. %v\n", deploymentName, err)
 	}

@@ -53,7 +54,7 @@ func stopDebug(ctx context.Context, k8sclientset kubernetes.Interface, clusterNa
 	}
 
 	original_deployment_name := strings.ReplaceAll(deploymentName, "-debug", "")
-	if err := SetDeploymentScale(ctx, k8sclientset, clusterNamespace, original_deployment_name, 1); err != nil {
+	if err := k8sutil.SetDeploymentScale(ctx, k8sclientset, clusterNamespace, original_deployment_name, 1); err != nil {
 		return err
 	}
 	logging.Info("Successfully deleted debug deployment and restored deployment %q", original_deployment_name)
```

pkg/exec/bash.go

Lines changed: 2 additions & 3 deletions
```diff
@@ -17,19 +17,18 @@ limitations under the License.
 package exec
 
 import (
+	"os"
 	"os/exec"
 
 	"github.com/rook/kubectl-rook-ceph/pkg/logging"
 )
 
 func ExecuteBashCommand(command string) string {
 	cmd := exec.Command("/bin/bash",
-		"-x", // Print commands and their arguments as they are executed
-		"-e", // Exit immediately if a command exits with a non-zero status.
-		"-m", // Terminal job control, allows job to be terminated by SIGTERM
 		"-c", // Command to run
 		command,
 	)
+	cmd.Stderr = os.Stderr
 	stdout, err := cmd.Output()
 	if err != nil {
 		logging.Fatal(err)
```
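
Dropping `-x`, `-e`, and `-m` and attaching `cmd.Stderr` to the terminal means the bash trace no longer leaks into the captured stdout, and error output from the invoked tools is shown to the user as it happens — presumably the motivation for this change. A hypothetical caller (the kubectl invocation is just an example):

```go
package main

import (
	"fmt"

	"github.com/rook/kubectl-rook-ceph/pkg/exec"
)

func main() {
	// Stdout is returned for parsing; any stderr output from kubectl now
	// streams directly to the terminal instead of being swallowed.
	out := exec.ExecuteBashCommand("kubectl -n rook-ceph get cephcluster -o name")
	fmt.Println(out)
}
```
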
pkg/k8sutil/k8sutil.go

Lines changed: 30 additions & 0 deletions
```diff
@@ -23,6 +23,8 @@ import (
 
 	"github.com/rook/kubectl-rook-ceph/pkg/logging"
 
+	appsv1 "k8s.io/api/apps/v1"
+	autoscalingv1 "k8s.io/api/autoscaling/v1"
 	corev1 "k8s.io/api/core/v1"
 	v1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/types"

@@ -74,3 +76,31 @@ func UpdateConfigMap(ctx context.Context, k8sclientset kubernetes.Interface, nam
 
 	logging.Info("configmap/%s patched\n", configMapName)
 }
+
+func SetDeploymentScale(ctx context.Context, k8sclientset kubernetes.Interface, namespace, deploymentName string, scaleCount int) error {
+	scale := &autoscalingv1.Scale{
+		ObjectMeta: v1.ObjectMeta{
+			Name:      deploymentName,
+			Namespace: namespace,
+		},
+		Spec: autoscalingv1.ScaleSpec{
+			Replicas: int32(scaleCount),
+		},
+	}
+	_, err := k8sclientset.AppsV1().Deployments(namespace).UpdateScale(ctx, deploymentName, scale, v1.UpdateOptions{})
+	if err != nil {
+		return fmt.Errorf("failed to update scale of deployment %s. %v\n", deploymentName, err)
+	}
+	return nil
+}
+
+func GetDeployment(ctx context.Context, k8sclientset kubernetes.Interface, clusterNamespace, deploymentName string) (*appsv1.Deployment, error) {
+	logging.Info("fetching the deployment %s to be running\n", deploymentName)
+	deployment, err := k8sclientset.AppsV1().Deployments(clusterNamespace).Get(ctx, deploymentName, v1.GetOptions{})
+	if err != nil {
+		return nil, err
+	}
+
+	logging.Info("deployment %s exists\n", deploymentName)
+	return deployment, nil
+}
```
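
With `GetDeployment` and `SetDeploymentScale` now shared in `k8sutil`, the debug and restore code paths can scale deployments the same way. A minimal sketch of a caller, assuming the conventional `rook-ceph-operator` deployment name; `pauseOperator` is a hypothetical helper, not code from this commit:

```go
package restoreutil

import (
	"context"

	"github.com/rook/kubectl-rook-ceph/pkg/k8sutil"
	"k8s.io/client-go/kubernetes"
)

// pauseOperator scales the Rook operator to zero replicas so it cannot race
// against a manual CR repair, reusing the shared k8sutil helpers above.
func pauseOperator(ctx context.Context, clientset kubernetes.Interface, operatorNamespace string) error {
	// Confirm the deployment exists before touching its scale subresource.
	if _, err := k8sutil.GetDeployment(ctx, clientset, operatorNamespace, "rook-ceph-operator"); err != nil {
		return err
	}
	return k8sutil.SetDeploymentScale(ctx, clientset, operatorNamespace, "rook-ceph-operator", 0)
}
```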
