Skip to content

Use tagging instead of job name to find the training run #139

Use tagging instead of job name to find the training run

Use tagging instead of job name to find the training run #139

Workflow file for this run

# Workflow that builds the training image, provisions infrastructure, and
# runs the Azure ML pipeline.
name: Train and Deploy ML Model

# Triggers: pushes to main, pull requests against main that are closed
# (the job-level `if` decides what to do with them), and manual dispatch.
on:
  push:
    branches:
      - main
  pull_request:
    types:
      - closed
    branches:
      - main
  workflow_dispatch:
jobs:
  # Builds and pushes the Docker image, applies the Terraform configuration,
  # then validates, submits, and polls the Azure ML pipeline job.
  train-deploy:
    # Only act on main: direct pushes, pull requests merged into main, and
    # manual dispatches started from main.
    if: >-
      (github.event_name == 'push' && github.ref_name == 'main') ||
      (github.event_name == 'pull_request' &&
      github.event.pull_request.merged == true &&
      github.base_ref == 'main') ||
      (github.event_name == 'workflow_dispatch' && github.ref_name == 'main')
    runs-on: ubuntu-latest
    # A merged pull request fires both the push and the pull_request trigger,
    # so two runs can start together. Queue them instead of letting two
    # `terraform apply` invocations race against each other.
    concurrency:
      group: train-deploy-main
      cancel-in-progress: false
    permissions:
      id-token: write  # lets the job request the OIDC token used for Azure login
      contents: read
    env:
      ACR_NAME: transformerloadacr
      IMAGE_NAME: transformer-load-image
      IMAGE_TAG: latest
      RESOURCE_GROUP: transformer-load-rg
      WORKSPACE_NAME: transformer-load-ws2
      # Quoted so the value is the string "true" under any YAML loader.
      ARM_USE_OIDC: "true"
      ARM_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
      ARM_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
      ARM_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v4

      - name: Azure Login
        uses: azure/login@v2
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
          enable-AzPSSession: false

      - name: Log in to ACR
        run: az acr login --name "$ACR_NAME"

      # Export the fully qualified image reference for the build/push steps.
      - name: Set FULL_IMAGE
        run: |
          echo "FULL_IMAGE=${ACR_NAME}.azurecr.io/${IMAGE_NAME}:${IMAGE_TAG}" >> "$GITHUB_ENV"

      - name: Build Docker image
        run: docker build -t "$FULL_IMAGE" .

      - name: Push Docker image to ACR
        run: docker push "$FULL_IMAGE"

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v3
        with:
          terraform_version: "1.6.2"

      - name: Terraform Init
        run: terraform init -input=false
        working-directory: ./terraform-ml-deploy

      - name: Terraform Format Check
        run: terraform fmt -check -recursive
        working-directory: ./terraform-ml-deploy

      - name: Terraform Validate
        run: terraform validate
        working-directory: ./terraform-ml-deploy

      # Save the plan so the apply step executes exactly what was planned
      # instead of computing a fresh plan of its own.
      - name: Terraform Plan
        run: terraform plan -input=false -out=tfplan
        working-directory: ./terraform-ml-deploy

      # Applying a saved plan file does not prompt for approval.
      - name: Terraform Apply
        run: terraform apply -input=false tfplan
        working-directory: ./terraform-ml-deploy

      # - name: Wait for role assignment propagation
      #   run: sleep 60

      - name: Confirm Azure ML workspace
        run: |
          az ml workspace show \
            --name "$WORKSPACE_NAME" \
            --resource-group "$RESOURCE_GROUP"

      - name: Validate pipeline YAML
        shell: bash
        run: |
          VALIDATION_OUTPUT=$(az ml job validate --file src/pipeline.yml \
            --resource-group "$RESOURCE_GROUP" \
            --workspace-name "$WORKSPACE_NAME")
          echo "$VALIDATION_OUTPUT"
          # Extract the result in its own command so that output jq cannot
          # parse aborts the step instead of being reported as a success.
          VALIDATION_RESULT=$(jq -r '.result' <<< "$VALIDATION_OUTPUT")
          if [[ "$VALIDATION_RESULT" == "Failed" ]]; then
            echo "Pipeline validation failed. Halting workflow."
            exit 1
          else
            echo "Pipeline validation succeeded."
          fi

      - name: Submit Azure ML pipeline job
        id: submit_job
        run: |
          # The epoch-seconds suffix makes the job name differ between runs;
          # exporting it lets the monitoring step look the job up.
          JOB_NAME="transformer-load-job-$(date +%s)"
          echo "JOB_NAME=$JOB_NAME" >> "$GITHUB_ENV"
          az ml job create --file src/pipeline.yml \
            --resource-group "$RESOURCE_GROUP" \
            --workspace-name "$WORKSPACE_NAME" \
            --set name="$JOB_NAME" \
            --set experiment_name=transformer-load-exp \
            --set jobs.train_job.environment_variables.MANAGED_IDENTITY_CLIENT_ID="$ARM_CLIENT_ID" \
            --set jobs.predict_job.environment_variables.MANAGED_IDENTITY_CLIENT_ID="$ARM_CLIENT_ID"

      # - name: Validate job identity
      #   run: |
      #     EXPECTED_IDENTITY="/subscriptions/${{ secrets.AZURE_SUBSCRIPTION_ID }}/resourceGroups/${{ env.RESOURCE_GROUP }}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/transformer-load-identity"
      #     ACTUAL_IDENTITY=$(az ml job show --name ${{ env.JOB_NAME }} \
      #       --resource-group ${{ env.RESOURCE_GROUP }} \
      #       --workspace-name ${{ env.WORKSPACE_NAME }} \
      #       --query "identity.userAssignedIdentities" -o tsv)
      #     echo "Expected identity: $EXPECTED_IDENTITY"
      #     echo "Actual identity: $ACTUAL_IDENTITY"
      #     if [[ "$ACTUAL_IDENTITY" != *"$EXPECTED_IDENTITY"* ]]; then
      #       echo "Managed identity not correctly applied to job."
      #       exit 1
      #     else
      #       echo "Managed identity verified."
      #     fi

      # Poll the submitted job every 30 seconds until it reports Completed,
      # Failed, or Canceled. There is no explicit timeout in this loop; any
      # other status keeps it polling.
      - name: Monitor Azure ML job
        run: |
          while true; do
            STATUS=$(az ml job show --name "$JOB_NAME" \
              --resource-group "$RESOURCE_GROUP" \
              --workspace-name "$WORKSPACE_NAME" \
              --query "status" -o tsv)
            echo "Job status: $STATUS"
            if [[ "$STATUS" == "Completed" ]]; then
              break
            elif [[ "$STATUS" == "Failed" || "$STATUS" == "Canceled" ]]; then
              echo "Azure ML job failed or was canceled."
              exit 1
            fi
            sleep 30
          done