Use tagging instead of job name to find the training run #139
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow that builds the training image, applies the Terraform stack, and
# runs the Azure ML pipeline for the main branch.
name: Train and Deploy ML Model

on:
  # A pull request merged into main already produces a push event on main.
  # Listening for `pull_request: closed` as well started a second run for the
  # same merge (two Terraform applies and two training jobs), so only the push
  # trigger is kept.
  push:
    branches: [main]
  # Manual runs from the Actions tab or the API.
  workflow_dispatch:
jobs:
  # Builds and pushes the Docker image to ACR, applies the Terraform stack,
  # then validates, submits, and polls the Azure ML pipeline job.
  train-deploy:
    # Proceed only for pushes to main, merged pull requests targeting main,
    # and manual dispatches started from main.
    if: (github.event_name == 'push' && github.ref_name == 'main') ||
      (github.event_name == 'pull_request' && github.event.pull_request.merged == true && github.base_ref == 'main') ||
      (github.event_name == 'workflow_dispatch' && github.ref_name == 'main')
    runs-on: ubuntu-latest
    permissions:
      # id-token: write lets the job request an OIDC token for Azure login.
      id-token: write
      contents: read
    env:
      ACR_NAME: transformerloadacr
      IMAGE_NAME: transformer-load-image
      # NOTE(review): "latest" is a mutable tag, so a training run cannot be
      # traced back to the commit that built its image. Confirm whether
      # src/pipeline.yml or the Terraform stack depend on this tag before
      # pinning it.
      IMAGE_TAG: latest
      RESOURCE_GROUP: transformer-load-rg
      WORKSPACE_NAME: transformer-load-ws2
      # ARM_* variables are exported to every step; presumably consumed by the
      # Terraform azurerm provider for OIDC authentication - TODO confirm.
      ARM_USE_OIDC: true
      ARM_CLIENT_ID: ${{ secrets.AZURE_CLIENT_ID }}
      ARM_TENANT_ID: ${{ secrets.AZURE_TENANT_ID }}
      ARM_SUBSCRIPTION_ID: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
    steps:
      - name: Checkout repo
        uses: actions/checkout@v3

      - name: Azure Login
        uses: azure/login@v1
        with:
          client-id: ${{ secrets.AZURE_CLIENT_ID }}
          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
          subscription-id: ${{ secrets.AZURE_SUBSCRIPTION_ID }}
          enable-AzPSSession: false

      - name: Log in to ACR
        run: az acr login --name "$ACR_NAME"

      # Export the fully qualified image reference for the build/push steps.
      - name: Set FULL_IMAGE
        run: |
          echo "FULL_IMAGE=${ACR_NAME}.azurecr.io/${IMAGE_NAME}:${IMAGE_TAG}" >> "$GITHUB_ENV"

      - name: Build Docker image
        run: docker build -t "$FULL_IMAGE" .

      - name: Push Docker image to ACR
        run: docker push "$FULL_IMAGE"

      - name: Setup Terraform
        uses: hashicorp/setup-terraform@v2
        with:
          terraform_version: 1.6.2

      - name: Terraform Init
        run: terraform init
        working-directory: ./terraform-ml-deploy

      - name: Terraform Format Check
        run: terraform fmt -check -recursive
        working-directory: ./terraform-ml-deploy

      - name: Terraform Validate
        run: terraform validate
        working-directory: ./terraform-ml-deploy

      # Save the plan so the apply step executes exactly what was planned
      # (and logged) here instead of computing a fresh, unreviewed plan.
      - name: Terraform Plan
        run: terraform plan -input=false -out=tfplan
        working-directory: ./terraform-ml-deploy

      # Applying a saved plan file never prompts, so -auto-approve is not needed.
      - name: Terraform Apply
        run: terraform apply -input=false tfplan
        working-directory: ./terraform-ml-deploy

      #- name: Wait for role assignment propagation
      #  run: sleep 60

      - name: Confirm Azure ML workspace
        run: |
          az ml workspace show \
            --name "$WORKSPACE_NAME" \
            --resource-group "$RESOURCE_GROUP"

      - name: Validate pipeline YAML
        shell: bash
        run: |
          VALIDATION_OUTPUT=$(az ml job validate --file src/pipeline.yml \
            --resource-group "$RESOURCE_GROUP" \
            --workspace-name "$WORKSPACE_NAME")
          echo "$VALIDATION_OUTPUT"
          # Fail closed: output that jq cannot parse must not be reported as a
          # successful validation.
          if ! VALIDATION_RESULT=$(jq -r '.result // empty' <<<"$VALIDATION_OUTPUT"); then
            echo "Could not parse pipeline validation output. Halting workflow."
            exit 1
          fi
          if [[ "$VALIDATION_RESULT" == "Failed" ]]; then
            echo "Pipeline validation failed. Halting workflow."
            exit 1
          fi
          echo "Pipeline validation succeeded."

      - name: Submit Azure ML pipeline job
        id: submit_job
        run: |
          # Timestamp suffix gives each submission a distinct job name; the
          # name is exported through GITHUB_ENV so later steps can look it up.
          JOB_NAME="transformer-load-job-$(date +%s)"
          echo "JOB_NAME=$JOB_NAME" >> "$GITHUB_ENV"
          az ml job create --file src/pipeline.yml \
            --resource-group "$RESOURCE_GROUP" \
            --workspace-name "$WORKSPACE_NAME" \
            --set name="$JOB_NAME" \
            --set experiment_name=transformer-load-exp \
            --set jobs.train_job.environment_variables.MANAGED_IDENTITY_CLIENT_ID="$ARM_CLIENT_ID" \
            --set jobs.predict_job.environment_variables.MANAGED_IDENTITY_CLIENT_ID="$ARM_CLIENT_ID"

      #- name: Validate job identity
      #  run: |
      #    EXPECTED_IDENTITY="/subscriptions/${{ secrets.AZURE_SUBSCRIPTION_ID }}/resourceGroups/${{ env.RESOURCE_GROUP }}/providers/Microsoft.ManagedIdentity/userAssignedIdentities/transformer-load-identity"
      #    ACTUAL_IDENTITY=$(az ml job show --name ${{ env.JOB_NAME }} \
      #      --resource-group ${{ env.RESOURCE_GROUP }} \
      #      --workspace-name ${{ env.WORKSPACE_NAME }} \
      #      --query "identity.userAssignedIdentities" -o tsv)
      #    echo "Expected identity: $EXPECTED_IDENTITY"
      #    echo "Actual identity: $ACTUAL_IDENTITY"
      #    if [[ "$ACTUAL_IDENTITY" != *"$EXPECTED_IDENTITY"* ]]; then
      #      echo "Managed identity not correctly applied to job."
      #      exit 1
      #    else
      #      echo "Managed identity verified."
      #    fi

      - name: Monitor Azure ML job
        run: |
          # Poll every 30 seconds until the job reaches a terminal state.
          # NOTE(review): the loop has no deadline of its own, so a job that
          # never leaves a non-terminal state is only stopped by the job-level
          # timeout.
          while true; do
            STATUS=$(az ml job show --name "$JOB_NAME" \
              --resource-group "$RESOURCE_GROUP" \
              --workspace-name "$WORKSPACE_NAME" \
              --query "status" -o tsv)
            echo "Job status: $STATUS"
            if [[ "$STATUS" == "Completed" ]]; then
              break
            elif [[ "$STATUS" == "Failed" || "$STATUS" == "Canceled" ]]; then
              echo "Azure ML job failed or was canceled."
              exit 1
            fi
            sleep 30
          done