Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
136 changes: 136 additions & 0 deletions .github/workflows/cicd-approve-test-queue.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Queue manager: periodically approves "CICD NeMo" runs that are waiting on a
# protected-environment review, keeping at most MAX_CONCURRENCY runs queued or
# in progress at any time.
name: Approve Test Queue

on:
  schedule:
    - cron: '*/5 * * * *'  # Runs every 5 minutes
  workflow_dispatch: {}  # Allows manual triggering

jobs:
  approve-queue:
    runs-on: ubuntu-latest
    environment: main
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install requests

      - name: Approve waiting deployments
        env:
          GITHUB_TOKEN: ${{ secrets.PAT }}
          MAX_CONCURRENCY: ${{ vars.MAX_CONCURRENCY || 1 }}
        run: |
          # Quoted heredoc delimiter: the script reads its configuration via
          # os.environ, so no shell interpolation of the Python source is
          # wanted (an unquoted delimiter would let the shell mangle it).
          python - <<'EOF'
          import os

          import requests

          # GitHub API configuration (injected via the step's `env` block).
          GITHUB_TOKEN = os.environ["GITHUB_TOKEN"]
          REPO = os.environ["GITHUB_REPOSITORY"]
          MAX_CONCURRENCY = int(os.environ["MAX_CONCURRENCY"])
          API_BASE = f"https://api.github.com/repos/{REPO}"
          WORKFLOW_NAME = "CICD NeMo"

          # Headers for the GitHub REST API.
          headers = {
              "Authorization": f"token {GITHUB_TOKEN}",
              "Accept": "application/vnd.github.v3+json",
              "X-GitHub-Api-Version": "2022-11-28",
          }

          def make_request(endpoint, method="GET", data=None):
              """Call the GitHub API; return parsed JSON, or None on any error.

              Callers MUST handle a None return — a transient API failure must
              not crash the queue manager (it reruns every 5 minutes anyway).
              """
              url = f"{API_BASE}/{endpoint}"
              try:
                  if method == "GET":
                      response = requests.get(url, headers=headers, timeout=30)
                  else:
                      response = requests.post(url, headers=headers, json=data, timeout=30)
                  response.raise_for_status()
                  return response.json()
              except requests.exceptions.RequestException as e:
                  print(f"Error making request to {endpoint}: {str(e)}")
                  # e.response is None for connection-level failures (no HTTP
                  # response was ever received).
                  if e.response is not None:
                      print(f"Response: {e.response.text}")
                  return None

          def list_runs(status):
              """Return this repo's WORKFLOW_NAME runs with `status`; [] on API error.

              per_page=100 (the API maximum) so a busy queue is not silently
              truncated at the default page size of 30.
              """
              result = make_request(f"actions/runs?status={status}&per_page=100")
              runs = (result or {}).get("workflow_runs", [])
              return [run for run in runs if run["name"] == WORKFLOW_NAME]

          # Count currently queued and running workflows.
          print("Fetching workflow runs...")
          queued_workflows = len(list_runs("queued"))
          in_progress_workflows = len(list_runs("in_progress"))

          total_workflows = queued_workflows + in_progress_workflows
          print(f"Current queued workflows: {queued_workflows}")
          print(f"Current running workflows: {in_progress_workflows}")
          print(f"Total workflows: {total_workflows}")
          print(f"Max concurrency: {MAX_CONCURRENCY}")

          if total_workflows >= MAX_CONCURRENCY:
              print("Maximum concurrency reached, no new approvals will be made")
              raise SystemExit(0)

          # Runs blocked on a protected-environment review are reported with
          # status "waiting"; approve them oldest-first until the concurrency
          # budget is used up.
          print("Fetching deployments...")
          pending_workflows = sorted(list_runs("waiting"), key=lambda x: x["created_at"])

          print("Processing ...")
          for workflow in pending_workflows:
              if total_workflows >= MAX_CONCURRENCY:
                  print("Maximum concurrency reached, stopping approvals")
                  break

              workflow_id = workflow["id"]
              workflow_name = workflow["display_title"]
              # `pull_requests` can be empty (e.g. runs not tied to a PR), so
              # never index it unconditionally.
              pull_requests = workflow.get("pull_requests") or []
              pr_label = f"PR #{pull_requests[0]['number']}" if pull_requests else "unknown PR"
              print(f"Approving workflow {workflow_name} for {pr_label}")

              deployment_url = f"actions/runs/{workflow_id}/pending_deployments"
              # The API call may fail (None) or return an empty list if the
              # run was approved/cancelled since we listed it — skip, don't crash.
              pending_deployments = make_request(deployment_url) or []
              if not pending_deployments:
                  print(f"No pending deployments for run {workflow_id}, skipping")
                  continue
              deployment = pending_deployments[0]
              environment_id = deployment["environment"]["id"]

              # Approve the deployment.
              status_data = {
                  "environment_ids": [environment_id],
                  "state": "approved",
                  "comment": "Automatically approved by queue manager",
              }
              result = make_request(deployment_url, method="POST", data=status_data)

              if result:
                  total_workflows += 1
              else:
                  print(f"Failed to approve deployment {deployment['id']}")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are the reasons for a failed deployment? What else could we do in that situation? Can we just retry? Or would it be better to send a team alert? I guess in this situation we have a free slot but we’re not making use of it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Well, a retry is probably just something that happens in 5 minutes anyway. Probably we can send an alert to the channel. I'll see if I can add it.


EOF
19 changes: 14 additions & 5 deletions .github/workflows/cicd-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,18 @@ jobs:
- name: Select tests to run
id: test_to_run
run: |
# For manual dispatch, we replace `all` with the actual job names
# For manual dispatch, we replace `all` with the actual job names
if [[ "$EVENT_NAME" == "workflow_dispatch" ]]; then
TESTS_TO_RUN=$TESTS_TO_RUN

# For correctly labeled PR, we replace `all` with the actual job names
# For correctly labeled PR, we replace `all` with the actual job names
elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" == "true" ]]; then
TESTS_TO_RUN=all

# For incorrectly labeled PR, run no tests
elif [[ "$EVENT_NAME" == "pull_request" && "$HAS_LABEL" != "true" ]]; then
TESTS_TO_RUN=""

# For push events, run all tests. This is so that we can generate coverage
# on branch `main`.
elif [[ "$EVENT_NAME" == "push" ]]; then
Expand All @@ -82,10 +82,19 @@ jobs:
needs: [pre-flight]
uses: ./.github/workflows/code-linting.yml

# Queue gate: this job targets the protected `test` environment, so the run
# pauses here as a pending deployment until it is approved — NOTE(review):
# presumably by the scheduled "Approve Test Queue" workflow added in this PR,
# which approves waiting "CICD NeMo" runs; confirm the environment protection
# rules on `test`. Jobs that list this job in `needs` are held back until
# approval, which serialises CI runs.
cicd-wait-in-queue:
needs: [pre-flight]
runs-on: ubuntu-latest
environment: test
steps:
# Placeholder step: the job's effect is the environment gate above, not
# this echo.
- name: Running CI tests
run: |
echo "Running CI tests"

cicd-test-container-build:
uses: ./.github/workflows/_build_container.yml
if: ${{ needs.pre-flight.outputs.test_to_run != '[]' }}
needs: [pre-flight, code-linting]
needs: [pre-flight, code-linting, cicd-wait-in-queue]
with:
image-name: nemo_container
dockerfile: Dockerfile.ci
Expand Down Expand Up @@ -186,7 +195,7 @@ jobs:
NUM_FAILED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "failure") | .name] | length')
NUM_CANCELLED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "cancelled") | .name] | length')
NUM_SKIPPED=$(gh run view $RUN_ID --json jobs -q '[.jobs[] | select(.conclusion == "skipped") | .name] | length')

if [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -eq 0 && $NUM_SKIPPED -eq 0 ]]; then
RESULT="success"
elif [[ $NUM_FAILED -eq 0 && $NUM_CANCELLED -gt 0 && $NUM_SKIPPED -eq 0 ]]; then
Expand Down
Loading