diff --git a/.github/workflows/copy-stackgres-cluster.yml b/.github/workflows/copy-stackgres-cluster.yml new file mode 100644 index 00000000000..e47185d3268 --- /dev/null +++ b/.github/workflows/copy-stackgres-cluster.yml @@ -0,0 +1,237 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Copy StackGres Citus Cluster + +concurrency: + group: ${{ github.workflow }}-${{ inputs.target_cluster_name }} + cancel-in-progress: true + +on: + workflow_call: + inputs: + source_cluster_location: + type: string + default: "us-central1" + required: true + source_cluster_name: + type: string + default: "mainnet-na" + required: true + source_project: + type: string + default: "prod" + required: true + target_cluster_location: + type: string + default: "us-central1" + required: true + target_cluster_name: + type: string + default: "mainnet-staging-na" + required: true + target_default_pool: + type: string + default: "mainnet-staging-na" + required: true + target_project: + type: string + default: "nonprod" + required: true + teardown_target: + type: boolean + default: true + required: true + secrets: + GH_ACTIONS_KUBECTL_MAIN_PROJECT_ID: + required: true + GH_ACTIONS_KUBECTL_GCP_SERVICE_ACCOUNT: + required: true + GH_ACTIONS_KUBECTL_STAGING_PROJECT_ID: + required: true + GH_ACTIONS_KUBECTL_WORKLOAD_ID_PROVIDER: + required: true + workflow_dispatch: + inputs: + source_cluster_location: + default: "us-central1" + description: "Source: GKE location (zone or region)" + required: true + source_cluster_name: + description: "Source: GKE cluster name" + default: "mainnet-na" + required: true + source_project: + default: prod + description: "Source: GCP project" + options: [prod, nonprod] + required: true + type: choice + target_cluster_location: + default: "us-central1" + description: "Target: GKE location (zone or region)" + required: true + target_cluster_name: + default: "mainnet-staging-na" + description: "Target: GKE cluster name" + required: true + target_default_pool: + default: "mainnet-staging-na" + description: "Target: default pool name" + required: true + target_project: + default: nonprod + description: "Target: GCP project" + options: [prod, nonprod] + required: true + type: choice + teardown_target: + default: true + description: "Tear down target cluster after k6 tests run" + required: true + type: boolean + +permissions: + id-token: write + contents: read + +jobs: + run-copy: + name: Copy Citus from SOURCE ➜ TARGET + runs-on: hiero-mirror-node-linux-medium + env: + FLUX_VERSION: "2.3.0" + GCP_SNAPSHOT_PROJECT: ${{ inputs.source_project == 'prod' && secrets.GH_ACTIONS_KUBECTL_MAIN_PROJECT_ID || secrets.GH_ACTIONS_KUBECTL_STAGING_PROJECT_ID }} + GCP_K8S_SOURCE_CLUSTER_NAME: ${{ inputs.source_cluster_name }} + GCP_K8S_SOURCE_CLUSTER_REGION: ${{ inputs.source_cluster_location }} + GCP_K8S_TARGET_CLUSTER_NAME: ${{ inputs.target_cluster_name }} + GCP_K8S_TARGET_CLUSTER_REGION: ${{ inputs.target_cluster_location }} + GCP_TARGET_PROJECT: ${{ inputs.target_project == 'prod' && secrets.GH_ACTIONS_KUBECTL_MAIN_PROJECT_ID || secrets.GH_ACTIONS_KUBECTL_STAGING_PROJECT_ID }} + K8S_SOURCE_CLUSTER_CONTEXT: "source_gke_context" + K8S_TARGET_CLUSTER_CONTEXT: "target_gke_context" + PINNED_KUBECONFIG: ${{ github.workspace }}/.kube/config + SA_EMAIL: ${{ secrets.GH_ACTIONS_KUBECTL_GCP_SERVICE_ACCOUNT }} + WIF_PROVIDER: ${{ secrets.GH_ACTIONS_KUBECTL_WORKLOAD_ID_PROVIDER }} + + steps: + - name: Checkout + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 + + - name: Ensure jq is available + run: jq 
--version || (sudo apt-get update && sudo apt-get install -y jq) + + - name: Setup gcloud + kubectl + GKE auth plugin + uses: google-github-actions/setup-gcloud@e427ad8a34f8676edf47cf7d7925499adf3eb74f + with: + install_components: gke-gcloud-auth-plugin, kubectl + + - name: Create application default credentials + shell: bash + run: | + set -euo pipefail + : "${ACTIONS_ID_TOKEN_REQUEST_URL:?missing OIDC URL}" + : "${ACTIONS_ID_TOKEN_REQUEST_TOKEN:?missing OIDC token}" + ADC_DIR="${RUNNER_TEMP}/wif-adc" + mkdir -p "$ADC_DIR" + SUBJECT_TOKEN_FILE="${ADC_DIR}/subject.jwt" + ADC_JSON="${ADC_DIR}/adc.json" + : > "$SUBJECT_TOKEN_FILE" + AUD="//iam.googleapis.com/${WIF_PROVIDER}" + cat >"$ADC_JSON" <> "$GITHUB_ENV" + ENC_AUD="$(jq -rn --arg s "$AUD" '$s|@uri')" + TOKEN_JSON="$(curl -sSf -H "Authorization: Bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=${ENC_AUD}")" + OIDC="$(jq -r '.value // empty' <<<"$TOKEN_JSON")" + test -n "$OIDC" + printf '%s' "$OIDC" > "$SUBJECT_TOKEN_FILE" + gcloud auth application-default print-access-token >/dev/null + gcloud config set container/use_application_default_credentials true + + - name: Get GKE credentials (source) + shell: bash + run: | + set -euo pipefail + gcloud container clusters get-credentials "${GCP_K8S_SOURCE_CLUSTER_NAME}" \ + --region "${GCP_K8S_SOURCE_CLUSTER_REGION}" \ + --project "${GCP_SNAPSHOT_PROJECT}" + SRC_CURR="$(kubectl config current-context)" + kubectl config rename-context "${SRC_CURR}" "${K8S_SOURCE_CLUSTER_CONTEXT}" + + - name: Get GKE credentials (target) + shell: bash + run: | + set -euo pipefail + gcloud container clusters get-credentials "${GCP_K8S_TARGET_CLUSTER_NAME}" \ + --region "${GCP_K8S_TARGET_CLUSTER_REGION}" \ + --project "${GCP_TARGET_PROJECT}" + TGT_CURR="$(kubectl config current-context)" + kubectl config rename-context "${TGT_CURR}" "${K8S_TARGET_CLUSTER_CONTEXT}" + + - name: Cache Cloud SDK paths + shell: bash + run: | + set -euo pipefail + GCLOUD_BIN="$(command -v gcloud)" + KUBECTL_BIN="$(command -v kubectl)" + CLOUDSDK_BIN_DIR="$(dirname "$GCLOUD_BIN")" + echo "GCLOUD_BIN=$GCLOUD_BIN" >> "$GITHUB_ENV" + echo "KUBECTL_BIN=$KUBECTL_BIN" >> "$GITHUB_ENV" + echo "CLOUDSDK_BIN_DIR=$CLOUDSDK_BIN_DIR" >> "$GITHUB_ENV" + + - name: Setup Testkube CLI + uses: kubeshop/setup-testkube@970d643ec9ecbe5707049c1d65b851da72aab3d9 + with: + version: v2.3.0 + + - name: Setup Flux CLI + uses: fluxcd/flux2/action@ca29bb1a41d662495cbf3a8ee6dba7f088ae7310 + with: + version: v2.3.0 + + - name: Execute copy script + shell: bash + env: + AUTO_CONFIRM: "true" + DEFAULT_POOL_NAME: ${{ inputs.target_default_pool }} + WAIT_FOR_K6: "true" + run: | + set -Eeuo pipefail + export PATH="${CLOUDSDK_BIN_DIR}:${PATH}" + hash -r + ( + set -euo pipefail + while true; do + ENC_AUD="$(jq -rn --arg s "$AUD" '$s|@uri')" + if TOKEN_JSON="$(curl -sS -H "Authorization: Bearer ${ACTIONS_ID_TOKEN_REQUEST_TOKEN}" "${ACTIONS_ID_TOKEN_REQUEST_URL}&audience=${ENC_AUD}")"; then + OIDC="$(jq -r '.value // empty' <<<"$TOKEN_JSON")" + if [[ -n "$OIDC" ]]; then + printf '%s' "$OIDC" > "$SUBJECT_TOKEN_FILE" + gcloud auth application-default print-access-token >/dev/null 2>&1 || true + fi + fi + kubectl --context "${K8S_SOURCE_CLUSTER_CONTEXT}" --request-timeout=10s get --raw=/version >/dev/null 2>&1 || true + kubectl --context "${K8S_TARGET_CLUSTER_CONTEXT}" --request-timeout=10s get --raw=/version >/dev/null 2>&1 || true + sleep 120 + done + ) & + REFRESH_PID=$! 
+ trap 'kill "$REFRESH_PID" 2>/dev/null || true' EXIT INT TERM + cd ./tools/cluster-management/ + ./copy-live-environment.sh diff --git a/.github/workflows/trigger-staging-deploy.yml b/.github/workflows/trigger-staging-deploy.yml new file mode 100644 index 00000000000..ef5c753779c --- /dev/null +++ b/.github/workflows/trigger-staging-deploy.yml @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: Apache-2.0 + +name: Trigger Staging on Deploy Change + +on: + push: + branches: [deploy] + paths: + - clusters/mainnet-staging-na/mainnet-citus/helmrelease.yaml + +permissions: + id-token: write + contents: read + +jobs: + run-reusable-copy: + uses: ./.github/workflows/copy-stackgres-cluster.yml + with: + source_cluster_location: "us-central1" + source_cluster_name: "mainnet-na" + source_project: "prod" + target_cluster_location: "us-central1" + target_cluster_name: "mainnet-staging-na" + target_default_pool: "mainnet-staging-na" + target_project: "nonprod" + teardown_target: true + secrets: + GH_ACTIONS_KUBECTL_MAIN_PROJECT_ID: ${{ secrets.GH_ACTIONS_KUBECTL_MAIN_PROJECT_ID }} + GH_ACTIONS_KUBECTL_GCP_SERVICE_ACCOUNT: ${{ secrets.GH_ACTIONS_KUBECTL_GCP_SERVICE_ACCOUNT }} + GH_ACTIONS_KUBECTL_STAGING_PROJECT_ID: ${{ secrets.GH_ACTIONS_KUBECTL_STAGING_PROJECT_ID }} + GH_ACTIONS_KUBECTL_WORKLOAD_ID_PROVIDER: ${{ secrets.GH_ACTIONS_KUBECTL_WORKLOAD_ID_PROVIDER }} diff --git a/docs/runbook/change-citus-node-pool-machine-type.md b/docs/runbook/change-citus-node-pool-machine-type.md index b62105e0143..745cb343f4c 100644 --- a/docs/runbook/change-citus-node-pool-machine-type.md +++ b/docs/runbook/change-citus-node-pool-machine-type.md @@ -8,7 +8,7 @@ Need to Change Machine Type for Citus Node Pool(s) - Have `jq` and `yq` installed - kubectl is pointing to the cluster you want to change the machine type for -- All bash commands assume your working directory is `docs/runbook/scripts` +- All bash commands assume your working directory is `tools/cluster-management` ## Solution diff --git a/docs/runbook/copy-live-environment.md b/docs/runbook/copy-live-environment.md index 829b91fb833..43020af1130 100644 --- a/docs/runbook/copy-live-environment.md +++ b/docs/runbook/copy-live-environment.md @@ -6,14 +6,15 @@ Need to copy live environment with zero downtime on source ## Prerequisites -- Have `jq`, `yq`, and `ksd`(kubernetes secret decrypter) installed +- Have `jq`, `yq`, and `base64` installed +- Have `testkube` kubectl plugin installed - The source and target have compatible versions of postgres - The `target cluster` has a running Citus cluster deployed with `hedera-mirror` chart - The `target cluster` you are restoring to doesn't have any pvcs with a size larger than the size of the pvc in the snapshot. You can't decrease the size of a pvc. If needed, you can delete the existing cluster in the `target cluster` and redeploy the `hedera-mirror` chart with the default disk sizes.
- If you have multiple Citus clusters in the `target cluster`, you will need to restore all of them -- All bash commands assume your working directory is `docs/runbook/scripts` +- All bash commands assume your working directory is `tools/cluster-management` - Only a single citus cluster is installed per namespace ## Steps diff --git a/docs/runbook/create-disk-snapshot-for-citus-cluster.md b/docs/runbook/create-disk-snapshot-for-citus-cluster.md index a065cbd891c..444eb118704 100644 --- a/docs/runbook/create-disk-snapshot-for-citus-cluster.md +++ b/docs/runbook/create-disk-snapshot-for-citus-cluster.md @@ -8,7 +8,7 @@ Need to create disk snapshots for Citus cluster(s) - Have access to a running Citus cluster deployed by the `hedera-mirror` chart - Have `jq` and `yq` installed -- All bash commands assume your working directory is `docs/runbook/scripts` +- All bash commands assume your working directory is `tools/cluster-management` - The kubectl context is set to the cluster you want to create snapshots from ## Solution diff --git a/docs/runbook/restore-citus-from-disk-snapshot.md b/docs/runbook/restore-citus-from-disk-snapshot.md index 4075538391d..0816fb41206 100644 --- a/docs/runbook/restore-citus-from-disk-snapshot.md +++ b/docs/runbook/restore-citus-from-disk-snapshot.md @@ -7,14 +7,14 @@ Need to restore Citus cluster from disk snapshots ## Prerequisites - Snapshots of disks were created by following the [create snapshot](create-disk-snapshot-for-citus-cluster.md) runbook -- Have `jq`, `yq`, and `ksd`(kubernetes secret decrypter) installed +- Have `jq`, `yq`, and `base64` installed - The snapshots are from a compatible version of `postgres` - The `target cluster` has a running Citus cluster deployed with `hedera-mirror` chart - The `target cluster` you are restoring to doesn't have any pvcs with a size larger than the size of the pvc in the snapshot. You can't decrease the size of a pvc. If needed, you can delete the existing cluster in the `target cluster` and redeploy the `hedera-mirror` chart with the default disk sizes. 
- If you have multiple Citus clusters in the `target cluster`, you will need to restore all of them -- All bash commands assume your working directory is `docs/runbook/scripts` +- All bash commands assume your working directory is `tools/cluster-management` - Only a single citus cluster is installed per namespace - The kubectl context is set to the cluster you want to restore snapshots to diff --git a/docs/runbook/restore-citus-from-stackgres-backup.md b/docs/runbook/restore-citus-from-stackgres-backup.md index 78d46caf4ea..923dd451b27 100644 --- a/docs/runbook/restore-citus-from-stackgres-backup.md +++ b/docs/runbook/restore-citus-from-stackgres-backup.md @@ -6,10 +6,10 @@ Need to restore Citus cluster from a StackGres sharded backup ## Prerequisites -- Have `jq`, `yq`, and `ksd`(kubernetes secret decrypter) installed +- Have `jq`, `yq`, and `base64` installed - The cluster has a running Citus cluster deployed with `hedera-mirror` chart - StackGresShardedCluster backup is enabled -- All bash commands assume your working directory is `docs/runbook/scripts` +- All bash commands assume your working directory is `tools/cluster-management` - Only a single citus cluster is installed per namespace - The kubectl context is set to the cluster you want to restore backup to and the namespace is set to the one `hedera-mirror` chart is installed in diff --git a/docs/runbook/upgrade-k8s-version-citus-nodepool.md b/docs/runbook/upgrade-k8s-version-citus-nodepool.md index bc0c1e084a9..6c8e4fa0885 100644 --- a/docs/runbook/upgrade-k8s-version-citus-nodepool.md +++ b/docs/runbook/upgrade-k8s-version-citus-nodepool.md @@ -8,7 +8,7 @@ Need to update k8s version for clusters with citus installed - Have `jq` and `yq` installed - The kubectl context is set to the cluster you want to upgrade -- All bash commands assume your working directory is `docs/runbook/scripts` +- All bash commands assume your working directory is `tools/cluster-management` ## Solution diff --git a/docs/runbook/scripts/change-machine-type.sh b/tools/cluster-management/change-machine-type.sh similarity index 99% rename from docs/runbook/scripts/change-machine-type.sh rename to tools/cluster-management/change-machine-type.sh index a088de11f86..dfa348763e8 100755 --- a/docs/runbook/scripts/change-machine-type.sh +++ b/tools/cluster-management/change-machine-type.sh @@ -4,7 +4,7 @@ set -euo pipefail -source ./utils.sh +source ./utils/utils.sh GCP_TARGET_PROJECT="$(readUserInput "Enter GCP Project for target: ")" if [[ -z "${GCP_TARGET_PROJECT}" ]]; then diff --git a/docs/runbook/scripts/copy-live-environment.sh b/tools/cluster-management/copy-live-environment.sh similarity index 80% rename from docs/runbook/scripts/copy-live-environment.sh rename to tools/cluster-management/copy-live-environment.sh index 64c64d4e20a..b269d620c49 100755 --- a/docs/runbook/scripts/copy-live-environment.sh +++ b/tools/cluster-management/copy-live-environment.sh @@ -2,11 +2,11 @@ # SPDX-License-Identifier: Apache-2.0 -set -euo pipefail +set -Eeuo pipefail -source ./utils.sh -source ./input-utils.sh -source ./snapshot-utils.sh +source ./utils/utils.sh +source ./utils/input-utils.sh +source ./utils/snapshot-utils.sh CREATE_NEW_BACKUPS="${CREATE_NEW_BACKUPS:-true}" DEFAULT_POOL_MAX_PER_ZONE="${DEFAULT_POOL_MAX_PER_ZONE:-5}" @@ -277,6 +277,8 @@ function patchBackupPaths() { } function scaleupResources() { + waitForClusterOperations "${DEFAULT_POOL_NAME}" + gcloud container clusters resize "${GCP_K8S_TARGET_CLUSTER_NAME}" \ --location="${GCP_K8S_TARGET_CLUSTER_REGION}" 
\ --node-pool="${DEFAULT_POOL_NAME}" \ @@ -315,6 +317,7 @@ function restoreTarget() { changeContext "${K8S_TARGET_CLUSTER_CONTEXT}" configureAndValidateSnapshotRestore scaleupResources + resumeCommonChart patchBackupPaths replaceDisks } @@ -341,21 +344,73 @@ function deleteSnapshots() { function scaleDownNodePools() { resizeCitusNodePools 0 + waitForClusterOperations "${DEFAULT_POOL_NAME}" gcloud container node-pools update "${DEFAULT_POOL_NAME}" \ --cluster="${GCP_K8S_TARGET_CLUSTER_NAME}" \ --location="${GCP_K8S_TARGET_CLUSTER_REGION}" \ --project="${GCP_TARGET_PROJECT}" \ --no-enable-autoscaling \ --quiet - gcloud container clusters resize "${GCP_K8S_TARGET_CLUSTER_NAME}" \ + + sleep 10 + + local poolNodes + log "Cordoning nodes in pool ${DEFAULT_POOL_NAME}" + mapfile -t poolNodes < <( + kubectl --context "${K8S_TARGET_CLUSTER_CONTEXT}" \ + get nodes -l "cloud.google.com/gke-nodepool=${DEFAULT_POOL_NAME}" \ + -o name + ) + + for node in "${poolNodes[@]}"; do + [[ -z "$node" ]] && continue + if ! kubectl --context "${K8S_TARGET_CLUSTER_CONTEXT}" get "$node" >/dev/null 2>&1; then + log "Node ${node} no longer exists; skipping cordon" + continue + fi + kubectl --context "${K8S_TARGET_CLUSTER_CONTEXT}" cordon "$node" || true + done + + log "Draining nodes in pool ${DEFAULT_POOL_NAME}" + for node in "${poolNodes[@]}"; do + [[ -z "$node" ]] && continue + + while true; do + if ! kubectl --context "${K8S_TARGET_CLUSTER_CONTEXT}" get "$node" >/dev/null 2>&1; then + log "Node ${node} disappeared before/while draining; skipping" + break + fi + + if kubectl --context "${K8S_TARGET_CLUSTER_CONTEXT}" drain "$node" \ + --ignore-daemonsets \ + --delete-emptydir-data \ + --grace-period=60 \ + --timeout=15m \ + --force; then + log "Drained ${node}" + break + fi + + log "Failed to drain ${node}, retrying" + kubectl --context "${K8S_TARGET_CLUSTER_CONTEXT}" get pods -A -o wide --field-selector spec.nodeName="${node#node/}" || true + kubectl --context "${K8S_TARGET_CLUSTER_CONTEXT}" cordon "$node" || true + sleep 30 + done + done + + log "Scaling down default pool to 0 nodes" + until gcloud container clusters resize "${GCP_K8S_TARGET_CLUSTER_NAME}" \ --location="${GCP_K8S_TARGET_CLUSTER_REGION}" \ --node-pool="${DEFAULT_POOL_NAME}" \ --project="${GCP_TARGET_PROJECT}" \ --num-nodes=0 \ - --quiet \ - --async + --quiet; do + log "Failed to scale down default pool, retrying" + sleep 5 + done } + function removeDisks() { local disksToDelete diskJson diskName diskZone zoneLink disksToDelete="$(getCitusDiskNames "${GCP_TARGET_PROJECT}" "${DISK_PREFIX}")" @@ -381,19 +436,25 @@ function teardownResources() { log "Tearing down resources" for namespace in "${CITUS_NAMESPACES[@]}"; do unrouteTraffic "${namespace}" + pauseCitus "${namespace}" "true" + kubectl delete pdb -n "${namespace}" --all --ignore-not-found 1>&2 done + suspendCommonChart + kubectl delete pdb -n "${COMMON_NAMESPACE}" --all --ignore-not-found 1>&2 + scaleDownNodePools removeDisks } function waitForK6PodExecution() { local testName="$1" - local job + local job out - while true; do - job="$( - kubectl get jobs -n "${TEST_KUBE_NAMESPACE}" -l "executor=k6-custom-executor" -o json \ + job="" + until { + if out="$( + kubectl get jobs -n "${TEST_KUBE_NAMESPACE}" -l "executor=k6-custom-executor" -o json 2>/dev/null \ | jq -r --arg testName "$testName" ' .items[] | select(.metadata.labels["test-name"] != null @@ -401,24 +462,43 @@ function waitForK6PodExecution() { | .metadata.name ' \ | head -n1 - )" - if [[ -n "${job}" ]]; then - log "Found job ${job} for 
test ${testName}" - break + )"; then + [[ -n "$out" ]] + else + false fi + }; do log "waiting for test ${testName} to start" sleep 30 done - log "Waiting on job for test ${testName} to complete" - kubectl wait -n "${TEST_KUBE_NAMESPACE}" --for=condition=complete "job/${job}" --timeout=-1s + job="$out" + log "Found job ${job} for test ${testName}" + until kubectl wait -n "${TEST_KUBE_NAMESPACE}" --for=condition=complete "job/${job}" --timeout=10m > /dev/null 2>&1; do + log "Waiting for job ${job} to complete for test ${testName}" + sleep 1 + done + until kubectl get job -n "${TEST_KUBE_NAMESPACE}" "${job}-scraper" >/dev/null 2>&1; do log "Waiting for scraper" sleep 1 done - kubectl wait -n "${TEST_KUBE_NAMESPACE}" --for=condition=complete "job/${job}-scraper" --timeout=-1s - sleep 5 - kubectl testkube download artifacts "${job}" + + until kubectl wait -n "${TEST_KUBE_NAMESPACE}" --for=condition=complete "job/${job}-scraper" --timeout=10m > /dev/null 2>&1; do + log "Waiting for scraper job to complete" + sleep 1 + done + + log "downloading artifacts for job ${job}" + until { + rm -f artifacts/report.md 2>/dev/null || true + kubectl testkube download artifacts "${job}" >/dev/null 2>&1 + [[ -s artifacts/report.md ]] + }; do + log "Waiting for artifacts to be available" + sleep 5 + done + cat artifacts/report.md } @@ -437,7 +517,7 @@ snapshotSource restoreTarget deleteSnapshots -if [[ "${WAIT_FOR_K6}" ]]; then +if [[ "${WAIT_FOR_K6}" == "true" ]]; then log "Awaiting k6 results" waitForK6PodExecution "rest" waitForK6PodExecution "rest-java" diff --git a/docs/runbook/scripts/reduce-citus-disk-size.sh b/tools/cluster-management/reduce-citus-disk-size.sh similarity index 99% rename from docs/runbook/scripts/reduce-citus-disk-size.sh rename to tools/cluster-management/reduce-citus-disk-size.sh index 29773189c9e..fd315ab3a2b 100755 --- a/docs/runbook/scripts/reduce-citus-disk-size.sh +++ b/tools/cluster-management/reduce-citus-disk-size.sh @@ -4,7 +4,7 @@ set -euo pipefail -source ./utils.sh +source ./utils/utils.sh ONE_GI_BYTES=1073741824 EPOCH_SECONDS=$(date +%s) diff --git a/docs/runbook/scripts/restore-stackgres-backup.sh b/tools/cluster-management/restore-stackgres-backup.sh similarity index 99% rename from docs/runbook/scripts/restore-stackgres-backup.sh rename to tools/cluster-management/restore-stackgres-backup.sh index 4a843108266..184070d66c1 100755 --- a/docs/runbook/scripts/restore-stackgres-backup.sh +++ b/tools/cluster-management/restore-stackgres-backup.sh @@ -4,7 +4,7 @@ set -euox pipefail -source ./utils.sh +source ./utils/utils.sh BACKUP_TO_RESTORE= CLUSTER= diff --git a/docs/runbook/scripts/restore-volume-snapshot.sh b/tools/cluster-management/restore-volume-snapshot.sh similarity index 66% rename from docs/runbook/scripts/restore-volume-snapshot.sh rename to tools/cluster-management/restore-volume-snapshot.sh index 6b3e77f4660..5d247c07002 100755 --- a/docs/runbook/scripts/restore-volume-snapshot.sh +++ b/tools/cluster-management/restore-volume-snapshot.sh @@ -4,8 +4,9 @@ set -euo pipefail -source ./utils.sh -source ./snapshot-utils.sh +source ./utils/utils.sh +source ./utils/input-utils.sh +source ./utils/snapshot-utils.sh REPLACE_DISKS="${REPLACE_DISKS:-true}" diff --git a/docs/runbook/scripts/upgrade-k8s-version-citus.sh b/tools/cluster-management/upgrade-k8s-version-citus.sh similarity index 99% rename from docs/runbook/scripts/upgrade-k8s-version-citus.sh rename to tools/cluster-management/upgrade-k8s-version-citus.sh index 4d85dc98299..45c3e33d426 100755 --- 
a/docs/runbook/scripts/upgrade-k8s-version-citus.sh +++ b/tools/cluster-management/upgrade-k8s-version-citus.sh @@ -4,7 +4,7 @@ set -euo pipefail -source ./utils.sh +source ./utils/utils.sh versionGreater() { local raw1="$1" diff --git a/docs/runbook/scripts/input-utils.sh b/tools/cluster-management/utils/input-utils.sh similarity index 96% rename from docs/runbook/scripts/input-utils.sh rename to tools/cluster-management/utils/input-utils.sh index 092f7a1080d..c22b4b1d113 100755 --- a/docs/runbook/scripts/input-utils.sh +++ b/tools/cluster-management/utils/input-utils.sh @@ -70,8 +70,8 @@ function promptSnapshotId() { gcloud compute snapshots list \ --project "${GCP_SNAPSHOT_PROJECT}" \ --format="table(name, diskSizeGb, sourceDisk, description, creationTimestamp)" \ - --filter="name~.*[0-9]{10,}$" \ - --sort-by="~creationTimestamp" + --filter="name~.*[0-9]{10,}\$" \ + --sort-by="~creationTimestamp" >& 2 local snapshotId snapshotId="$(readUserInput "Enter snapshot id (the epoch suffix of the snapshot group): ")" diff --git a/docs/runbook/scripts/snapshot-utils.sh b/tools/cluster-management/utils/snapshot-utils.sh similarity index 97% rename from docs/runbook/scripts/snapshot-utils.sh rename to tools/cluster-management/utils/snapshot-utils.sh index a90bc925f89..dbe5ad2a7ef 100755 --- a/docs/runbook/scripts/snapshot-utils.sh +++ b/tools/cluster-management/utils/snapshot-utils.sh @@ -4,7 +4,18 @@ set -euo pipefail -source ./utils.sh +source ./utils/utils.sh + +normalizeGceSnapshotName() { + local s="$1" max=63 + + s="${s,,}" # lowercase + s="${s//[^a-z0-9-]/-}" # only [a-z0-9-] + (( ${#s} > max )) && s="${s: -max}" # last 63 + [[ $s =~ ^[a-z] ]] || s="a${s#?}" # start with letter + while [[ $s == *- ]]; do s="${s%-}"; done # no trailing '-' + printf '%s' "$s" +} function setupZfsVolumeForRecovery() { local namespace="${1}" pvcName="${2}" backupLabel="${3}" @@ -216,6 +227,7 @@ function snapshotCitusDisks() { diskNodeId="${diskName#"$diskPrefix"-}" diskNodeId="${diskNodeId%"-zfs"}" snapshotName="${diskName}-${epochSeconds}" + snapshotName="$(normalizeGceSnapshotName "$snapshotName")" snapshotRegion=$(echo "${diskNodeId}" | cut -d '-' -f 2-3) diskZone=$(echo "${diskNodeId}" | cut -d '-' -f 2-4) nodeVolumes=$(echo "${zfsVolumes}" | jq -r --arg diskNodeId "${diskNodeId}" 'map(select(.nodeId == $diskNodeId))') @@ -578,7 +590,9 @@ function deleteZfsSnapshots() { log "No snapshots found for ${ZFS_POOL_NAME} on node ${nodeId}" else log "Deleting all snapshots for ${ZFS_POOL_NAME} on node ${nodeId}" - kubectl_common exec "${pod}" -c openebs-zfs-plugin -- zfs destroy -r ${snapshots} + while IFS= read -r snap; do + kubectl_common exec "${pod}" -c openebs-zfs-plugin -- zfs destroy -r "$snap" + done <<< "${snapshots}" fi } diff --git a/docs/runbook/scripts/utils.sh b/tools/cluster-management/utils/utils.sh similarity index 85% rename from docs/runbook/scripts/utils.sh rename to tools/cluster-management/utils/utils.sh index f1d80c15041..be451b91e84 100755 --- a/docs/runbook/scripts/utils.sh +++ b/tools/cluster-management/utils/utils.sh @@ -12,6 +12,31 @@ function backgroundErrorHandler() { exit 1 } +mask() { + set +x + if [[ "${GITHUB_ACTIONS:-}" == "true" && -n "${1:-}" ]]; then + printf '::add-mask::%s\n' "$1" + fi +} + +maskJsonValues() { + set +x + local json input + json="${1:-}" + + [[ -z "$json" ]] && return 0 + + while IFS= read -r value; do + mask "$value" + + if decoded="$(printf '%s' "$value" | base64 -d 2>/dev/null)"; then + if [[ "$decoded" != *$'\x00'* && "$decoded" == *[[:print:]]* ]]; then
+ mask "$decoded" + fi + fi + done < <(jq -r '.. | strings' <<< "$json") +} + trap backgroundErrorHandler INT function watchInBackground() { @@ -251,6 +276,46 @@ function scaleDeployment() { fi } +function suspendCommonChart() { + if kubectl get helmrelease -n "${COMMON_NAMESPACE}" "${HELM_RELEASE_NAME}" >/dev/null; then + log "Suspending helm release ${HELM_RELEASE_NAME} in namespace ${COMMON_NAMESPACE}" + flux suspend helmrelease -n "${COMMON_NAMESPACE}" "${HELM_RELEASE_NAME}" + fi +} + +function resumeCommonChart() { + if ! kubectl get helmrelease -n "${COMMON_NAMESPACE}" "${HELM_RELEASE_NAME}" >/dev/null 2>&1; then + log "HelmRelease ${HELM_RELEASE_NAME} not found in namespace ${COMMON_NAMESPACE}; nothing to resume" + return 0 + fi + + local suspended + suspended="$(kubectl -n "${COMMON_NAMESPACE}" get helmrelease "${HELM_RELEASE_NAME}" -o jsonpath='{.spec.suspend}' 2>/dev/null || echo '')" + if [[ "${suspended}" == "true" ]]; then + log "Resuming helm release ${HELM_RELEASE_NAME} in namespace ${COMMON_NAMESPACE}" + flux resume helmrelease "${HELM_RELEASE_NAME}" -n "${COMMON_NAMESPACE}" || true + else + log "HelmRelease ${HELM_RELEASE_NAME} is not suspended; proceeding to reconcile & wait" + fi + + local deadline=$((SECONDS+1800)) + until kubectl wait -n "${COMMON_NAMESPACE}" \ + --for=condition=Ready "helmrelease/${HELM_RELEASE_NAME}" \ + --timeout=10m >/dev/null 2>&1; do + if (( SECONDS >= deadline )); then + log "Timed out waiting for helmrelease/${HELM_RELEASE_NAME} to become Ready" + flux get helmreleases -n "${COMMON_NAMESPACE}" "${HELM_RELEASE_NAME}" || true + kubectl -n "${COMMON_NAMESPACE}" describe helmrelease "${HELM_RELEASE_NAME}" || true + return 1 + fi + log "Waiting for helmrelease/${HELM_RELEASE_NAME} to become Ready… retrying reconcile" + flux reconcile helmrelease "${HELM_RELEASE_NAME}" -n "${COMMON_NAMESPACE}" --with-source >/dev/null 2>&1 || true + done + + log "HelmRelease ${HELM_RELEASE_NAME} is Ready" +} + + function unrouteTraffic() { local namespace="${1}" if [[ "${AUTO_UNROUTE}" == "true" ]]; then @@ -316,7 +381,7 @@ function routeTraffic() { doContinue flux resume helmrelease -n "${namespace}" "${HELM_RELEASE_NAME}" --timeout 30m else - log "No helm release found in namespace ${namespace}. Skipping suspend" + log "No helm release found in namespace ${namespace}. 
Skipping resume" fi scaleDeployment "${namespace}" 1 "app.kubernetes.io/component=monitor" fi @@ -608,6 +673,34 @@ function getZFSVolumes() { )' } +function waitForClusterOperations() { + local pool="${1}" + local ops + while true; do + ops="$( + gcloud container operations list \ + --project "${GCP_TARGET_PROJECT}" \ + --location "${GCP_K8S_TARGET_CLUSTER_REGION}" \ + --filter="status=RUNNING AND (targetLink~clusters/${GCP_K8S_TARGET_CLUSTER_NAME} OR targetLink~nodePools/${pool})" \ + --format="value(name)" \ + --verbosity=none 2>/dev/null | awk 'NF' | sort -u + )" + + [[ -z "$ops" ]] && break + + while IFS= read -r op; do + [[ -z "$op" ]] && continue + log "Waiting for in-flight operation ${op} before resizing pool ${pool}…" + gcloud container operations wait "${op}" \ + --project "${GCP_TARGET_PROJECT}" \ + --location "${GCP_K8S_TARGET_CLUSTER_REGION}" \ + --verbosity=none || true + done <<< "$ops" + + sleep 5 + done +} + function resizeCitusNodePools() { local numNodes="${1}" @@ -628,6 +721,7 @@ function resizeCitusNodePools() { for pool in "${citusPools[@]}"; do log "Scaling pool ${pool} to ${numNodes} nodes" + waitForClusterOperations "${pool}" gcloud container clusters resize "${GCP_K8S_TARGET_CLUSTER_NAME}" \ --node-pool="${pool}" \ @@ -650,36 +744,39 @@ function updateStackgresCreds() { local cluster="${1}" local namespace="${2}" local sgPasswords=$(kubectl get secret -n "${namespace}" "${cluster}" -o json | - ksd | - jq -r '.stringData') - local superuserUsername=$(echo "${sgPasswords}" | jq -r '.["superuser-username"]') - local superuserPassword=$(echo "${sgPasswords}" | jq -r '.["superuser-password"]') - local replicationUsername=$(echo "${sgPasswords}" | jq -r '.["replication-username"]') - local replicationPassword=$(echo "${sgPasswords}" | jq -r '.["replication-password"]') - local authenticatorUsername=$(echo "${sgPasswords}" | jq -r '.["authenticator-username"]') - local authenticatorPassword=$(echo "${sgPasswords}" | jq -r '.["authenticator-password"]') + jq -r '.data') + maskJsonValues "${sgPasswords}" + + local superuserUsername=$(echo "${sgPasswords}" | jq -r '.["superuser-username"]' | base64 -d) + local superuserPassword=$(echo "${sgPasswords}" | jq -r '.["superuser-password"]'| base64 -d) + local replicationUsername=$(echo "${sgPasswords}" | jq -r '.["replication-username"]'| base64 -d) + local replicationPassword=$(echo "${sgPasswords}" | jq -r '.["replication-password"]'| base64 -d) + local authenticatorUsername=$(echo "${sgPasswords}" | jq -r '.["authenticator-username"]'| base64 -d) + local authenticatorPassword=$(echo "${sgPasswords}" | jq -r '.["authenticator-password"]'| base64 -d) # Mirror Node Passwords local mirrorNodePasswords=$(kubectl get secret -n "${namespace}" "${HELM_RELEASE_NAME}-passwords" -o json | - ksd | - jq -r '.stringData') - local graphqlUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_GRAPHQL_DB_USERNAME') - local graphqlPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_GRAPHQL_DB_PASSWORD') - local grpcUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_GRPC_DB_USERNAME') - local grpcPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_GRPC_DB_PASSWORD') - local importerUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_USERNAME') - local importerPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_PASSWORD') - local ownerUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_OWNER') - local ownerPassword=$(echo 
"${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_OWNERPASSWORD') - local restUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_REST_DB_USERNAME') - local restPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_REST_DB_PASSWORD') - local restJavaUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_RESTJAVA_DB_USERNAME') - local restJavaPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_RESTJAVA_DB_PASSWORD') - local rosettaUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_ROSETTA_DB_USERNAME') - local rosettaPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_ROSETTA_DB_PASSWORD') - local web3Username=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_WEB3_DB_USERNAME') - local web3Password=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_WEB3_DB_PASSWORD') - local dbName=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_NAME') + jq -r '.data') + maskJsonValues "${mirrorNodePasswords}" + + local graphqlUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_GRAPHQL_DB_USERNAME'| base64 -d) + local graphqlPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_GRAPHQL_DB_PASSWORD'| base64 -d) + local grpcUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_GRPC_DB_USERNAME'| base64 -d) + local grpcPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_GRPC_DB_PASSWORD'| base64 -d) + local importerUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_USERNAME'| base64 -d) + local importerPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_PASSWORD'| base64 -d) + local ownerUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_OWNER'| base64 -d) + local ownerPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_OWNERPASSWORD'| base64 -d) + local restUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_REST_DB_USERNAME'| base64 -d) + local restPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_REST_DB_PASSWORD'| base64 -d) + local restJavaUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_RESTJAVA_DB_USERNAME'| base64 -d) + local restJavaPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_RESTJAVA_DB_PASSWORD'| base64 -d) + local rosettaUsername=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_ROSETTA_DB_USERNAME'| base64 -d) + local rosettaPassword=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_ROSETTA_DB_PASSWORD'| base64 -d) + local web3Username=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_WEB3_DB_USERNAME'| base64 -d) + local web3Password=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_WEB3_DB_PASSWORD'| base64 -d) + local dbName=$(echo "${mirrorNodePasswords}" | jq -r '.HIERO_MIRROR_IMPORTER_DB_NAME'| base64 -d) + local sql=$( cat <