From 9801c0f57736290186b3a87d8e3cde11a2b9fff6 Mon Sep 17 00:00:00 2001
From: chunangli <chunang_li@163.com>
Date: Thu, 31 Oct 2024 14:53:07 +0800
Subject: [PATCH 1/7] Enhance elastictest template: use bash scripts instead of
 AzureCLI tasks; fix az login and token retrieval when requesting APIs

Signed-off-by: Chun'ang Li <chunangli@microsoft.com>
---
 .../run-test-elastictest-template.yml         | 336 ++++++++----------
 .azure-pipelines/test_plan.py                 | 319 ++++++++---------
 2 files changed, 302 insertions(+), 353 deletions(-)

diff --git a/.azure-pipelines/run-test-elastictest-template.yml b/.azure-pipelines/run-test-elastictest-template.yml
index 595a6cb3136..ebd09be86b2 100644
--- a/.azure-pipelines/run-test-elastictest-template.yml
+++ b/.azure-pipelines/run-test-elastictest-template.yml
@@ -184,206 +184,176 @@ steps:
       fi
     displayName: "Install azure-cli"
 
-  - task: AzureCLI@2
-    inputs:
-      azureSubscription: "SONiC-Automation"
-      scriptType: 'bash'
-      scriptLocation: 'inlineScript'
-      inlineScript: |
-        set -e
-
-        pip install PyYAML
-
-        rm -f new_test_plan_id.txt
-
-        python ./.azure-pipelines/test_plan.py create \
-        -t ${{ parameters.TOPOLOGY }} \
-        -o new_test_plan_id.txt \
-        --min-worker ${{ parameters.MIN_WORKER }} \
-        --max-worker ${{ parameters.MAX_WORKER }} \
-        --lock-wait-timeout-seconds ${{ parameters.LOCK_WAIT_TIMEOUT_SECONDS }} \
-        --test-set ${{ parameters.TEST_SET }} \
-        --kvm-build-id $(KVM_BUILD_ID) \
-        --kvm-image-branch "${{ parameters.KVM_IMAGE_BRANCH }}" \
-        --deploy-mg-extra-params="${{ parameters.DEPLOY_MG_EXTRA_PARAMS }}" \
-        --common-extra-params="${{ parameters.COMMON_EXTRA_PARAMS }}" \
-        --vm-type ${{ parameters.VM_TYPE }} --num-asic ${{ parameters.NUM_ASIC }} \
-        --ptf_image_tag ${{ parameters.PTF_IMAGE_TAG }} \
-        --image_url ${{ parameters.IMAGE_URL }} \
-        --upgrade-image-param="${{ parameters.UPGRADE_IMAGE_PARAM }}" \
-        --hwsku ${{ parameters.HWSKU }} \
-        --test-plan-type ${{ parameters.TEST_PLAN_TYPE }} \
-        --platform ${{ parameters.PLATFORM }} \
-        --testbed-name "${{ parameters.TESTBED_NAME }}" \
-        --scripts "${{ parameters.SCRIPTS }}" \
-        --features "${{ parameters.FEATURES }}" \
-        --scripts-exclude "${{ parameters.SCRIPTS_EXCLUDE }}" \
-        --features-exclude "${{ parameters.FEATURES_EXCLUDE }}" \
-        --specific-param='${{ parameters.SPECIFIC_PARAM }}' \
-        --affinity='${{ parameters.AFFINITY }}' \
-        --build-reason ${{ parameters.BUILD_REASON }} \
-        --repo-name ${{ parameters.REPO_NAME }} \
-        --mgmt-branch ${{ parameters.MGMT_BRANCH }} \
-        --stop-on-failure ${{ parameters.STOP_ON_FAILURE }} \
-        --retry-times ${{ parameters.RETRY_TIMES }} \
-        --dump-kvm-if-fail ${{ parameters.DUMP_KVM_IF_FAIL }} \
-        --requester "${{ parameters.REQUESTER }}" \
-        --max-execute-seconds $((${{ parameters.MAX_RUN_TEST_MINUTES }} * 60)) \
-        --test-plan-num ${{ parameters.TEST_PLAN_NUM }}
-
-        TEST_PLAN_ID_LIST=( $(cat new_test_plan_id.txt) )
-        echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
-        for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
-        do
-            echo "Created test plan $TEST_PLAN_ID"
-            echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
-            echo -n "$(FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
-            echo -e "\033[33mfor detailed test plan progress \033[0m"
-        done
-        TEST_PLAN_ID_LIST_STRING=$(printf "%s," "${TEST_PLAN_ID_LIST[@]}")
-        TEST_PLAN_ID_LIST_STRING=${TEST_PLAN_ID_LIST_STRING%,}
-        echo "##vso[task.setvariable variable=TEST_PLAN_ID_LIST_STRING]$TEST_PLAN_ID_LIST_STRING"
+  - script: |
+      set -e
+
+      pip install PyYAML
+
+      rm -f new_test_plan_id.txt
+
+      python ./.azure-pipelines/test_plan.py create \
+      -t ${{ parameters.TOPOLOGY }} \
+      -o new_test_plan_id.txt \
+      --min-worker ${{ parameters.MIN_WORKER }} \
+      --max-worker ${{ parameters.MAX_WORKER }} \
+      --lock-wait-timeout-seconds ${{ parameters.LOCK_WAIT_TIMEOUT_SECONDS }} \
+      --test-set ${{ parameters.TEST_SET }} \
+      --kvm-build-id $(KVM_BUILD_ID) \
+      --kvm-image-branch "${{ parameters.KVM_IMAGE_BRANCH }}" \
+      --deploy-mg-extra-params="${{ parameters.DEPLOY_MG_EXTRA_PARAMS }}" \
+      --common-extra-params="${{ parameters.COMMON_EXTRA_PARAMS }}" \
+      --vm-type ${{ parameters.VM_TYPE }} --num-asic ${{ parameters.NUM_ASIC }} \
+      --ptf_image_tag ${{ parameters.PTF_IMAGE_TAG }} \
+      --image_url ${{ parameters.IMAGE_URL }} \
+      --upgrade-image-param="${{ parameters.UPGRADE_IMAGE_PARAM }}" \
+      --hwsku ${{ parameters.HWSKU }} \
+      --test-plan-type ${{ parameters.TEST_PLAN_TYPE }} \
+      --platform ${{ parameters.PLATFORM }} \
+      --testbed-name "${{ parameters.TESTBED_NAME }}" \
+      --scripts "${{ parameters.SCRIPTS }}" \
+      --features "${{ parameters.FEATURES }}" \
+      --scripts-exclude "${{ parameters.SCRIPTS_EXCLUDE }}" \
+      --features-exclude "${{ parameters.FEATURES_EXCLUDE }}" \
+      --specific-param='${{ parameters.SPECIFIC_PARAM }}' \
+      --affinity='${{ parameters.AFFINITY }}' \
+      --build-reason ${{ parameters.BUILD_REASON }} \
+      --repo-name ${{ parameters.REPO_NAME }} \
+      --mgmt-branch ${{ parameters.MGMT_BRANCH }} \
+      --stop-on-failure ${{ parameters.STOP_ON_FAILURE }} \
+      --retry-times ${{ parameters.RETRY_TIMES }} \
+      --dump-kvm-if-fail ${{ parameters.DUMP_KVM_IF_FAIL }} \
+      --requester "${{ parameters.REQUESTER }}" \
+      --max-execute-seconds $((${{ parameters.MAX_RUN_TEST_MINUTES }} * 60)) \
+      --test-plan-num ${{ parameters.TEST_PLAN_NUM }}
+
+      TEST_PLAN_ID_LIST=( $(cat new_test_plan_id.txt) )
+      echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
+      for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
+      do
+          echo "Created test plan $TEST_PLAN_ID"
+          echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
+          echo -n "$(ELASTICTEST_FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
+          echo -e "\033[33mfor detailed test plan progress \033[0m"
+      done
+      TEST_PLAN_ID_LIST_STRING=$(printf "%s," "${TEST_PLAN_ID_LIST[@]}")
+      TEST_PLAN_ID_LIST_STRING=${TEST_PLAN_ID_LIST_STRING%,}
+      echo "##vso[task.setvariable variable=TEST_PLAN_ID_LIST_STRING]$TEST_PLAN_ID_LIST_STRING"
 
     displayName: "Trigger test"
 
-  - task: AzureCLI@2
-    inputs:
-      azureSubscription: "SONiC-Automation"
-      scriptType: 'bash'
-      scriptLocation: 'inlineScript'
-      inlineScript: |
-        set -o
-        echo "Lock testbed"
-
-        echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
-        IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
-        failure_count=0
-        for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
-        do
-            echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
-            echo -n "$(FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
-            echo -e "\033[33mfor detailed test plan progress \033[0m"
-            # When "LOCK_TESTBED" finish, it changes into "PREPARE_TESTBED"
-            echo "[test_plan.py] poll LOCK_TESTBED status"
-            python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state LOCK_TESTBED
-            RET=$?
-            if [ $RET -ne 0 ]; then
-                ((failure_count++))
-            fi
-        done
-
-        if [ $failure_count -eq ${#TEST_PLAN_ID_LIST[@]} ]; then
-            echo "All testplan failed, cancel following steps"
-            exit 3
-        fi
+  - script: |
+      set -o
+      echo "Lock testbed"
+
+      echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
+      IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
+      failure_count=0
+      for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
+      do
+          echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
+          echo -n "$(ELASTICTEST_FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
+          echo -e "\033[33mfor detailed test plan progress \033[0m"
+          # When "LOCK_TESTBED" finish, it changes into "PREPARE_TESTBED"
+          echo "[test_plan.py] poll LOCK_TESTBED status"
+          python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state LOCK_TESTBED
+          RET=$?
+          if [ $RET -ne 0 ]; then
+              ((failure_count++))
+          fi
+      done
+
+      if [ $failure_count -eq ${#TEST_PLAN_ID_LIST[@]} ]; then
+          echo "All testplan failed, cancel following steps"
+          exit 3
+      fi
 
     displayName: "Lock testbed"
 
-  - task: AzureCLI@2
-    inputs:
-      azureSubscription: "SONiC-Automation"
-      scriptType: 'bash'
-      scriptLocation: 'inlineScript'
-      inlineScript: |
-        set -o
-        echo "Prepare testbed"
-        echo "Preparing the testbed(add-topo, deploy-mg) may take 15-30 minutes. Before the testbed is ready, the progress of the test plan keeps displayed as 0, please be patient"
-
-        echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
-        IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
-        failure_count=0
-        for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
-        do
-            echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
-            echo -n "$(FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
-            echo -e "\033[33mfor detailed test plan progress \033[0m"
-            # When "PREPARE_TESTBED" finish, it changes into "EXECUTING"
-            echo "[test_plan.py] poll PREPARE_TESTBED status"
-            python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state PREPARE_TESTBED
-            RET=$?
-            if [ $RET -ne 0 ]; then
-                ((failure_count++))
-            fi
-        done
-
-        if [ "$failure_count" -eq ${#TEST_PLAN_ID_LIST[@]} ]; then
-            echo "All testplan failed, cancel following steps"
-            exit 3
-        fi
+  - script: |
+      set -o
+      echo "Prepare testbed"
+      echo "Preparing the testbed(add-topo, deploy-mg) may take 15-30 minutes. Before the testbed is ready, the progress of the test plan keeps displayed as 0, please be patient"
+
+      echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
+      IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
+      failure_count=0
+      for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
+      do
+          echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
+          echo -n "$(ELASTICTEST_FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
+          echo -e "\033[33mfor detailed test plan progress \033[0m"
+          # When "PREPARE_TESTBED" finish, it changes into "EXECUTING"
+          echo "[test_plan.py] poll PREPARE_TESTBED status"
+          python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state PREPARE_TESTBED
+          RET=$?
+          if [ $RET -ne 0 ]; then
+              ((failure_count++))
+          fi
+      done
+
+      if [ "$failure_count" -eq ${#TEST_PLAN_ID_LIST[@]} ]; then
+          echo "All testplan failed, cancel following steps"
+          exit 3
+      fi
 
     displayName: "Prepare testbed"
 
-  - task: AzureCLI@2
-    inputs:
-      azureSubscription: "SONiC-Automation"
-      scriptType: 'bash'
-      scriptLocation: 'inlineScript'
-      inlineScript: |
-        set -o
-        echo "Run test"
-
-        echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
-        IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
-        failure_count=0
-        for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
-        do
-            echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
-            echo -n "$(FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
-            echo -e "\033[33mfor detailed test plan progress \033[0m"
-            # When "EXECUTING" finish, it changes into "KVMDUMP", "FAILED", "CANCELLED" or "FINISHED"
-            echo "[test_plan.py] poll EXECUTING status"
-            python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state EXECUTING --expected-result ${{ parameters.EXPECTED_RESULT }}
-            RET=$?
-            if [ $RET -ne 0 ]; then
-                ((failure_count++))
-            fi
-        done
-
-        if [ $failure_count -eq ${#TEST_PLAN_ID_LIST[@]} ]; then
-            echo "All testplan failed, cancel following steps"
-            exit 3
-        fi
+  - script: |
+      set -o
+      echo "Run test"
+
+      echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
+      IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
+      failure_count=0
+      for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
+      do
+          echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
+          echo -n "$(ELASTICTEST_FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
+          echo -e "\033[33mfor detailed test plan progress \033[0m"
+          # When "EXECUTING" finish, it changes into "KVMDUMP", "FAILED", "CANCELLED" or "FINISHED"
+          echo "[test_plan.py] poll EXECUTING status"
+          python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state EXECUTING --expected-result ${{ parameters.EXPECTED_RESULT }}
+          RET=$?
+          if [ $RET -ne 0 ]; then
+              ((failure_count++))
+          fi
+      done
+
+      if [ $failure_count -eq ${#TEST_PLAN_ID_LIST[@]} ]; then
+          echo "All testplan failed, cancel following steps"
+          exit 3
+      fi
 
     displayName: "Run test"
     timeoutInMinutes: ${{ parameters.MAX_RUN_TEST_MINUTES }}
 
   - ${{ if eq(parameters.DUMP_KVM_IF_FAIL, 'True') }}:
-      - task: AzureCLI@2
-        inputs:
-          azureSubscription: "SONiC-Automation"
-          scriptType: 'bash'
-          scriptLocation: 'inlineScript'
-          inlineScript: |
-            set -e
-            echo "KVM dump"
-
-            echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
-            IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
-            for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
-            do
-                echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
-                echo -n "$(FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
-                echo -e "\033[33mfor detailed test plan progress \033[0m"
-                # When "KVMDUMP" finish, it changes into "FAILED", "CANCELLED" or "FINISHED"
-                echo "##[group][test_plan.py] poll KVMDUMP status"
-                python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state KVMDUMP
-            done
+      - script: |
+          set -e
+          echo "KVM dump"
+
+          echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
+          IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
+          for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
+          do
+              echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
+              echo -n "$(ELASTICTEST_FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
+              echo -e "\033[33mfor detailed test plan progress \033[0m"
+              # When "KVMDUMP" finish, it changes into "FAILED", "CANCELLED" or "FINISHED"
+              echo "##[group][test_plan.py] poll KVMDUMP status"
+              python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state KVMDUMP
+          done
 
         condition: succeededOrFailed()
         displayName: "KVM dump"
 
-  - task: AzureCLI@2
-    inputs:
-      azureSubscription: "SONiC-Automation"
-      scriptType: 'bash'
-      scriptLocation: 'inlineScript'
-      inlineScript: |
-        set -e
-        echo "Try to cancel test plan $TEST_PLAN_ID, cancelling finished test plan has no effect."
-        IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
-        for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
-        do
-            python ./.azure-pipelines/test_plan.py cancel -i $TEST_PLAN_ID
-        done
+  - script: |
+      set -e
+      echo "Try to cancel test plan $TEST_PLAN_ID, cancelling finished test plan has no effect."
+      IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
+      for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
+      do
+          python ./.azure-pipelines/test_plan.py cancel -i $TEST_PLAN_ID
+      done
     condition: always()
     displayName: "Finalize running test plan"
diff --git a/.azure-pipelines/test_plan.py b/.azure-pipelines/test_plan.py
index f4b07bb2d18..7ec72614d10 100644
--- a/.azure-pipelines/test_plan.py
+++ b/.azure-pipelines/test_plan.py
@@ -8,7 +8,7 @@
 import subprocess
 import copy
 import time
-from datetime import datetime, timedelta
+from datetime import datetime, timezone
 
 import requests
 import yaml
@@ -22,8 +22,7 @@
 INTERNAL_SONIC_MGMT_REPO = "https://dev.azure.com/mssonic/internal/_git/sonic-mgmt-int"
 PR_TEST_SCRIPTS_FILE = "pr_test_scripts.yaml"
 SPECIFIC_PARAM_KEYWORD = "specific_param"
-TOLERATE_HTTP_EXCEPTION_TIMES = 20
-TOKEN_EXPIRE_HOURS = 1
+MAX_POLL_RETRY_TIMES = 10
 MAX_GET_TOKEN_RETRY_TIMES = 3
 TEST_PLAN_STATUS_UNSUCCESSFUL_FINISHED = ["FAILED", "CANCELLED"]
 TEST_PLAN_STEP_STATUS_UNFINISHED = ["EXECUTING", None]
@@ -83,13 +82,15 @@ def __init__(self, status):
     def get_status(self):
         return self.status.value
 
-    def print_logs(self, test_plan_id, resp_data, start_time):
+    def print_logs(self, test_plan_id, resp_data, expected_status, start_time):
         status = resp_data.get("status", None)
         current_status = test_plan_status_factory(status).get_status()
 
         if current_status == self.get_status():
-            print("Test plan id: {}, status: {},  elapsed: {:.0f} seconds"
-                  .format(test_plan_id, resp_data.get("status", None), time.time() - start_time))
+            print(
+                f"Test plan id: {test_plan_id}, status: {resp_data.get('status', None)}, "
+                f"expected_status: {expected_status}, elapsed: {time.time() - start_time:.0f} seconds"
+            )
 
 
 class InitStatus(AbstractStatus):
@@ -111,10 +112,12 @@ class ExecutingStatus(AbstractStatus):
     def __init__(self):
         super(ExecutingStatus, self).__init__(TestPlanStatus.EXECUTING)
 
-    def print_logs(self, test_plan_id, resp_data, start_time):
-        print("Test plan id: {}, status: {}, progress: {:.2f}%, elapsed: {:.0f} seconds"
-              .format(test_plan_id, resp_data.get("status", None),
-                      resp_data.get("progress", 0) * 100, time.time() - start_time))
+    def print_logs(self, test_plan_id, resp_data, expected_status, start_time):
+        print(
+            f"Test plan id: {test_plan_id}, status: {resp_data.get('status', None)}, "
+            f"expected_status: {expected_status}, progress: {resp_data.get('progress', 0) * 100:.2f}%, "
+            f"elapsed: {time.time() - start_time:.0f} seconds"
+        )
 
 
 class KvmDumpStatus(AbstractStatus):
@@ -150,74 +153,81 @@ def parse_list_from_str(s):
             if single_str.strip()]
 
 
+def run_cmd(cmd):
+    process = subprocess.Popen(
+        cmd.split(),
+        shell=False,
+        stdout=subprocess.PIPE,
+        stderr=subprocess.PIPE
+    )
+    stdout, stderr = process.communicate()
+    return_code = process.returncode
+
+    if return_code != 0:
+        raise Exception(f'Command {cmd} execution failed, rc={return_code}, error={stderr}')
+    return stdout, stderr, return_code
+
+
 class TestPlanManager(object):
 
-    def __init__(self, scheduler_url, community_url, frontend_url, client_id=None):
+    def __init__(self, scheduler_url, frontend_url, elastictest_msal_client_id, sonic_automation_umi):
         self.scheduler_url = scheduler_url
-        self.community_url = community_url
         self.frontend_url = frontend_url
-        self.client_id = client_id
-        self.with_auth = False
-        self._token = None
-        self._token_expires_on = None
-        if self.client_id:
-            self.with_auth = True
-            self.get_token()
-
-    def cmd(self, cmds):
-        process = subprocess.Popen(
-            cmds,
-            shell=False,
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE
-        )
-        stdout, stderr = process.communicate()
-        return_code = process.returncode
-
-        return stdout, stderr, return_code
-
-    def az_run(self, cmd):
-        stdout, stderr, retcode = self.cmd(cmd.split())
-        if retcode != 0:
-            raise Exception(f'Command {cmd} execution failed, rc={retcode}, error={stderr}')
-        return stdout, stderr, retcode
+        self.elastictest_msal_client_id = elastictest_msal_client_id
+        self.sonic_automation_umi = sonic_automation_umi
 
     def get_token(self):
 
-        token_is_valid = \
-            self._token_expires_on is not None and \
-            (self._token_expires_on - datetime.now()) > timedelta(hours=TOKEN_EXPIRE_HOURS)
+        # 1. Run az login with re-try
+        az_login_cmd = f"az login --identity --username {self.sonic_automation_umi}"
+        az_login_attempts = 0
+        while az_login_attempts < MAX_GET_TOKEN_RETRY_TIMES:
+            try:
+                stdout, _, _ = run_cmd(az_login_cmd)
+                print(f"Az login successfully. Login time: {datetime.now(timezone.utc)}")
+                break
+            except Exception as exception:
+                az_login_attempts += 1
+                print(
+                    f"Failed to az login with exception: {repr(exception)}. "
+                    f"Retry {MAX_GET_TOKEN_RETRY_TIMES - az_login_attempts} times to login."
+                )
 
-        if self._token is not None and token_is_valid:
-            return self._token
+        # If every az login attempt failed, raise an exception
+        if az_login_attempts >= MAX_GET_TOKEN_RETRY_TIMES:
+            raise Exception(f"Failed to az login after {MAX_GET_TOKEN_RETRY_TIMES} attempts.")
 
-        cmd = 'az account get-access-token --resource {}'.format(self.client_id)
-        attempt = 0
-        while attempt < MAX_GET_TOKEN_RETRY_TIMES:
+        # 2. Get access token with re-try
+        get_token_cmd = f"az account get-access-token --resource {self.elastictest_msal_client_id}"
+        get_token_attempts = 0
+        while get_token_attempts < MAX_GET_TOKEN_RETRY_TIMES:
             try:
-                stdout, _, _ = self.az_run(cmd)
+                stdout, _, _ = run_cmd(get_token_cmd)
 
                 token = json.loads(stdout.decode("utf-8"))
-                self._token = token.get("accessToken", None)
-                if not self._token:
-                    raise Exception("Parse token from stdout failed")
+                access_token = token.get("accessToken", None)
+                if not access_token:
+                    raise Exception("Parse token from stdout failed, accessToken is None.")
 
                 # Parse token expires time from string
                 token_expires_on = token.get("expiresOn", "")
-                self._token_expires_on = datetime.strptime(token_expires_on, "%Y-%m-%d %H:%M:%S.%f")
-                print("Get token successfully.")
-                return self._token
+                if token_expires_on:
+                    print(f"Get token successfully. Token will expire on {token_expires_on}.")
+
+                return access_token
 
             except Exception as exception:
-                attempt += 1
-                print("Failed to get token with exception: {}".format(repr(exception)))
+                get_token_attempts += 1
+                print(f"Failed to get token with exception: {repr(exception)}.")
 
-        raise Exception("Failed to get token after {} attempts".format(MAX_GET_TOKEN_RETRY_TIMES))
+        # If every attempt to get the token failed, raise an exception
+        if get_token_attempts >= MAX_GET_TOKEN_RETRY_TIMES:
+            raise Exception(f"Failed to get token after {MAX_GET_TOKEN_RETRY_TIMES} attempts")
 
     def create(self, topology, test_plan_name="my_test_plan", deploy_mg_extra_params="", kvm_build_id="",
                min_worker=None, max_worker=None, pr_id="unknown", output=None,
                common_extra_params="", **kwargs):
-        tp_url = "{}/test_plan".format(self.scheduler_url)
+        tp_url = f"{self.scheduler_url}/test_plan"
         testbed_name = parse_list_from_str(kwargs.get("testbed_name", None))
         image_url = kwargs.get("image_url", None)
         hwsku = kwargs.get("hwsku", None)
@@ -229,8 +239,10 @@ def create(self, topology, test_plan_name="my_test_plan", deploy_mg_extra_params
         features_exclude = parse_list_from_str(kwargs.get("features_exclude", None))
         ptf_image_tag = kwargs.get("ptf_image_tag", None)
 
-        print("Creating test plan, topology: {}, name: {}, build info:{} {} {}".format(topology, test_plan_name,
-                                                                                       repo_name, pr_id, build_id))
+        print(
+            f"Creating test plan, topology: {topology}, name: {test_plan_name}, "
+            f"build info:{repo_name} {pr_id} {build_id}"
+        )
         print("Test scripts to be covered in this test plan:")
         print(json.dumps(scripts, indent=4))
 
@@ -320,10 +332,9 @@ def create(self, topology, test_plan_name="my_test_plan", deploy_mg_extra_params
             "extra_params": {},
             "priority": 10
         }
-        print('Creating test plan with payload:\n{}'.format(json.dumps(payload, indent=4)))
+        print(f"Creating test plan with payload:\n{json.dumps(payload, indent=4)}")
         headers = {
-            "Authorization": "Bearer {}".format(self.get_token()),
-            "scheduler-site": "PRTest",
+            "Authorization": f"Bearer {self.get_token()}",
             "Content-Type": "application/json"
         }
         raw_resp = {}
@@ -331,17 +342,16 @@ def create(self, topology, test_plan_name="my_test_plan", deploy_mg_extra_params
             raw_resp = requests.post(tp_url, headers=headers, data=json.dumps(payload), timeout=10)
             resp = raw_resp.json()
         except Exception as exception:
-            raise Exception("HTTP execute failure, url: {}, raw_resp: {}, exception: {}"
-                            .format(tp_url, str(raw_resp), str(exception)))
+            raise Exception(f"HTTP execute failure, url: {tp_url}, raw_resp: {raw_resp}, exception: {str(exception)}")
         if not resp["data"]:
-            raise Exception("Pre deploy action failed with error: {}".format(resp["errmsg"]))
+            raise Exception(f"Create test plan failed with error: {resp['errmsg']}")
         if not resp["success"]:
-            raise Exception("Create test plan failed with error: {}".format(resp["errmsg"]))
+            raise Exception(f"Create test plan failed with error: {resp['errmsg']}")
 
-        print("Result of creating test plan: {}".format(str(resp["data"])))
+        print(f"Result of creating test plan: {str(resp['data'])}")
 
         if output:
-            print("Store new test plan id to file {}".format(output))
+            print(f"Store new test plan id to file {output}")
             with open(output, "a") as f:
                 f.write(str(resp["data"]) + "\n")
 
@@ -349,15 +359,14 @@ def create(self, topology, test_plan_name="my_test_plan", deploy_mg_extra_params
 
     def cancel(self, test_plan_id):
 
-        tp_url = "{}/test_plan/{}".format(self.scheduler_url, test_plan_id)
-        cancel_url = "{}/cancel".format(tp_url)
+        tp_url = f"{self.scheduler_url}/test_plan/{test_plan_id}"
+        cancel_url = f"{tp_url}/cancel"
 
-        print("Cancelling test plan at {}".format(cancel_url))
+        print(f"Cancelling test plan at {cancel_url}")
 
         payload = json.dumps({})
         headers = {
-            "Authorization": "Bearer {}".format(self.get_token()),
-            "scheduler-site": "PRTest",
+            "Authorization": f"Bearer {self.get_token()}",
             "Content-Type": "application/json"
         }
 
@@ -366,73 +375,57 @@ def cancel(self, test_plan_id):
             raw_resp = requests.post(cancel_url, headers=headers, data=payload, timeout=10)
             resp = raw_resp.json()
         except Exception as exception:
-            raise Exception("HTTP execute failure, url: {}, raw_resp: {}, exception: {}"
-                            .format(cancel_url, str(raw_resp), str(exception)))
+            raise Exception(f"HTTP execute failure, url: {cancel_url}, raw_resp: {str(raw_resp)}, "
+                            f"exception: {str(exception)}")
         if not resp["success"]:
-            raise Exception("Cancel test plan failed with error: {}".format(resp["errmsg"]))
+            raise Exception(f"Cancel test plan failed with error: {resp['errmsg']}")
 
-        print("Result of cancelling test plan at {}:".format(tp_url))
+        print(f"Result of cancelling test plan at {tp_url}:")
         print(str(resp["data"]))
 
     def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expected_result=None):
-        print("Polling progress and status of test plan at {}/scheduler/testplan/{}"
-              .format(self.frontend_url, test_plan_id))
-        print("Polling interval: {} seconds".format(interval))
+        print(f"Polling progress and status of test plan at {self.frontend_url}/scheduler/testplan/{test_plan_id}")
+        print(f"Polling interval: {interval} seconds")
 
-        poll_url = "{}/test_plan/{}/get_test_plan_status".format(self.scheduler_url, test_plan_id)
-        poll_url_no_auth = "{}/get_test_plan_status/{}".format(self.community_url, test_plan_id)
+        poll_url = f"{self.scheduler_url}/test_plan/{test_plan_id}/get_test_plan_status"
+        # Fetch the token once before the polling loop to avoid requesting a new one on every poll.
+        # A task running over 24h may outlive the token, so it is refreshed below whenever a poll fails.
         headers = {
+            "Authorization": f"Bearer {self.get_token()}",
             "Content-Type": "application/json"
         }
         start_time = time.time()
-        http_exception_times = 0
-        http_exception_times_no_auth = 0
-        failed_poll_auth_url = False
+        poll_retry_times = 0
         while timeout < 0 or (time.time() - start_time) < timeout:
             resp = None
-            # To make the transition smoother, first try to access the original API
-            if not failed_poll_auth_url:
-                try:
-                    if self.with_auth:
-                        headers["Authorization"] = "Bearer {}".format(self.get_token())
-                    resp = requests.get(poll_url, headers=headers, timeout=10).json()
-                except Exception as exception:
-                    print("HTTP execute failure, url: {}, raw_resp: {}, exception: {}".format(poll_url, resp,
-                                                                                              str(exception)))
-                    http_exception_times = http_exception_times + 1
-                    if http_exception_times >= TOLERATE_HTTP_EXCEPTION_TIMES:
-                        failed_poll_auth_url = True
-                    else:
-                        time.sleep(interval)
-                    continue
-
-            # If failed on poll auth url(most likely token has expired), try with no-auth url
-            else:
-                print("Polling test plan status failed with auth url, try with no-auth url.")
-                try:
-                    resp = requests.get(poll_url_no_auth, headers={"Content-Type": "application/json"},
-                                        timeout=10).json()
-                except Exception as e:
-                    print("HTTP execute failure, url: {}, raw_resp: {}, exception: {}".format(poll_url_no_auth, resp,
-                                                                                              repr(e)))
-                    http_exception_times_no_auth = http_exception_times_no_auth + 1
-                    if http_exception_times_no_auth >= TOLERATE_HTTP_EXCEPTION_TIMES:
-                        raise Exception(
-                            "HTTP execute failure, url: {}, raw_resp: {}, exception: {}".format(poll_url_no_auth, resp,
-                                                                                                repr(e)))
-                    else:
-                        time.sleep(interval)
-                        continue
+            try:
+                resp = requests.get(poll_url, headers=headers, timeout=10).json()
+
+                if not resp:
+                    raise Exception("Poll test plan status failed with request error, no response!")
+
+                if not resp["success"]:
+                    raise Exception(f"Get test plan status failed with error: {resp['errmsg']}")
+
+                resp_data = resp.get("data", None)
+                if not resp_data:
+                    raise Exception("No valid data in response.")
 
-            if not resp:
-                raise Exception("Poll test plan status failed with request error, no response!")
+            except Exception as exception:
+                print(f"Failed to get valid response, url: {poll_url}, raw_resp: {resp}, exception: {str(exception)}")
 
-            if not resp["success"]:
-                raise Exception("Query test plan at {} failed with error: {}".format(poll_url, resp["errmsg"]))
+                # Refresh headers token to address token expiration issue
+                headers = {
+                    "Authorization": f"Bearer {self.get_token()}",
+                    "Content-Type": "application/json"
+                }
 
-            resp_data = resp.get("data", None)
-            if not resp_data:
-                raise Exception("No valid data in response: {}".format(str(resp)))
+                poll_retry_times = poll_retry_times + 1
+                if poll_retry_times >= MAX_POLL_RETRY_TIMES:
+                    raise Exception("Poll test plan status failed, exceeded the maximum number of retries.")
+                else:
+                    time.sleep(interval)
+                continue
 
             current_tp_status = resp_data.get("status", None)
             current_tp_result = resp_data.get("result", None)
@@ -441,11 +434,10 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
                 current_status = test_plan_status_factory(current_tp_status)
                 expected_status = test_plan_status_factory(expected_state)
 
-                print("current test plan status: {}, expected status: {}".format(current_tp_status, expected_state))
+                current_status.print_logs(test_plan_id, resp_data, expected_state, start_time)
 
-                if expected_status.get_status() == current_status.get_status():
-                    current_status.print_logs(test_plan_id, resp_data, start_time)
-                elif expected_status.get_status() < current_status.get_status():
+                # If the test plan has finished the current step, its status is already past the expected status
+                if expected_status.get_status() < current_status.get_status():
                     steps = None
                     step_status = None
                     runtime = resp_data.get("runtime", None)
@@ -460,7 +452,7 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
                     # Print test summary
                     test_summary = resp_data.get("runtime", {}).get("test_summary", None)
                     if test_summary:
-                        print("Test summary:\n{}".format(json.dumps(test_summary, indent=4)))
+                        print(f"Test summary:\n{json.dumps(test_summary, indent=4)}")
 
                     """
                     In below scenarios, need to return false to pipeline.
@@ -477,38 +469,34 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
                         # Print error type and message
                         err_code = resp_data.get("runtime", {}).get("err_code", None)
                         if err_code:
-                            print("Error type: {}".format(err_code))
+                            print(f"Error type: {err_code}")
 
                         err_msg = resp_data.get("runtime", {}).get("message", None)
                         if err_msg:
-                            print("Error message: {}".format(err_msg))
+                            print(f"Error message: {err_msg}")
 
-                        raise Exception("Test plan id: {}, status: {}, result: {}, Elapsed {:.0f} seconds. "
-                                        "Check {}/scheduler/testplan/{} for test plan status"
-                                        .format(test_plan_id, step_status, current_tp_result, time.time() - start_time,
-                                                self.frontend_url,
-                                                test_plan_id))
+                        raise Exception(
+                            f"Test plan id: {test_plan_id}, status: {step_status}, "
+                            f"result: {current_tp_result}, Elapsed {time.time() - start_time:.0f} seconds. "
+                            f"Check {self.frontend_url}/scheduler/testplan/{test_plan_id} for test plan status"
+                        )
                     if expected_result:
                         if current_tp_result != expected_result:
-                            raise Exception("Test plan id: {}, status: {}, result: {} not match expected result: {}, "
-                                            "Elapsed {:.0f} seconds. "
-                                            "Check {}/scheduler/testplan/{} for test plan status"
-                                            .format(test_plan_id, step_status, current_tp_result,
-                                                    expected_result, time.time() - start_time,
-                                                    self.frontend_url,
-                                                    test_plan_id))
-
-                    print("Current step status is {}".format(step_status))
+                            raise Exception(
+                                f"Test plan id: {test_plan_id}, status: {step_status}, "
+                                f"result: {current_tp_result} not match expected result: {expected_result}, "
+                                f"Elapsed {time.time() - start_time:.0f} seconds. "
+                                f"Check {self.frontend_url}/scheduler/testplan/{test_plan_id} for test plan status"
+                            )
+
+                    print(f"Current step status is {step_status}.")
                     return
-                else:
-                    print("Current test plan state is {}, waiting for the expected state {}".format(current_tp_status,
-                                                                                                    expected_state))
 
                 time.sleep(interval)
 
         else:
             raise PollTimeoutException(
-                "Max polling time reached, test plan at {} is not successfully finished or cancelled".format(poll_url)
+                f"Max polling time reached, test plan at {poll_url} is not successfully finished or cancelled"
             )
 
 
@@ -930,30 +918,28 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
         # https://github.com/microsoft/azure-pipelines-tasks/issues/10331
         args.test_plan_id = args.test_plan_id.replace("'", "")
 
-    print("Test plan utils parameters: {}".format(args))
-    auth_env = ["CLIENT_ID"]
-    required_env = ["ELASTICTEST_SCHEDULER_BACKEND_URL"]
+    print(f"Test plan utils parameters: {args}")
 
-    if args.action in ["create", "cancel"]:
-        required_env.extend(auth_env)
+    required_env = ["ELASTICTEST_SCHEDULER_BACKEND_URL", "ELASTICTEST_MSAL_CLIENT_ID", "SONIC_AUTOMATION_UMI"]
 
     env = {
         "elastictest_scheduler_backend_url": os.environ.get("ELASTICTEST_SCHEDULER_BACKEND_URL"),
-        "elastictest_community_url": os.environ.get("ELASTICTEST_COMMUNITY_URL"),
-        "client_id": os.environ.get("ELASTICTEST_MSAL_CLIENT_ID"),
+        "elastictest_msal_client_id": os.environ.get("ELASTICTEST_MSAL_CLIENT_ID"),
         "frontend_url": os.environ.get("ELASTICTEST_FRONTEND_URL", "https://elastictest.org"),
+        "sonic_automation_umi": os.environ.get("SONIC_AUTOMATION_UMI"),
     }
     env_missing = [k.upper() for k, v in env.items() if k.upper() in required_env and not v]
     if env_missing:
-        print("Missing required environment variables: {}".format(env_missing))
+        print(f"Missing required environment variables: {env_missing}")
         sys.exit(1)
 
     try:
         tp = TestPlanManager(
             env["elastictest_scheduler_backend_url"],
-            env["elastictest_community_url"],
             env["frontend_url"],
-            env["client_id"])
+            env["elastictest_msal_client_id"],
+            env["sonic_automation_umi"]
+        )
 
         if args.action == "create":
             pr_id = os.environ.get("SYSTEM_PULLREQUEST_PULLREQUESTNUMBER") or os.environ.get(
@@ -964,14 +950,7 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
             job_name = os.environ.get("SYSTEM_JOBDISPLAYNAME")
             repo_name = args.repo_name if args.repo_name else os.environ.get("BUILD_REPOSITORY_NAME")
 
-            test_plan_prefix = "{repo}_{reason}_PR_{pr_id}_BUILD_{build_id}_JOB_{job_name}" \
-                .format(
-                    repo=repo,
-                    reason=reason,
-                    pr_id=pr_id,
-                    build_id=build_id,
-                    job_name=job_name
-                ).replace(' ', '_')
+            test_plan_prefix = f"{repo}_{reason}_PR_{pr_id}_BUILD_{build_id}_JOB_{job_name}".replace(' ', '_')
 
             scripts = args.scripts
             specific_param = json.loads(args.specific_param)
@@ -989,7 +968,7 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
             for num in range(args.test_plan_num):
                 test_plan_name = copy.copy(test_plan_prefix)
                 if args.test_plan_num > 1:
-                    test_plan_name = "{}_{}".format(test_plan_name, num + 1)
+                    test_plan_name = f"{test_plan_name}_{num + 1}"
 
                 tp.create(
                     args.topology,
@@ -1033,8 +1012,8 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
             tp.cancel(args.test_plan_id)
         sys.exit(0)
     except PollTimeoutException as e:
-        print("Polling test plan failed with exception: {}".format(repr(e)))
+        print(f"Polling test plan failed with exception: {repr(e)}")
         sys.exit(2)
     except Exception as e:
-        print("Operation failed with exception: {}".format(repr(e)))
+        print(f"Operation failed with exception: {repr(e)}")
         sys.exit(3)

From 15d3eed205e438a4b6712e626513a2581283ae2f Mon Sep 17 00:00:00 2001
From: chunangli <chunang_li@163.com>
Date: Thu, 31 Oct 2024 15:55:25 +0800
Subject: [PATCH 2/7] refined

Signed-off-by: Chun'ang Li <chunangli@microsoft.com>
---
 .azure-pipelines/test_plan.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/.azure-pipelines/test_plan.py b/.azure-pipelines/test_plan.py
index 7ec72614d10..961a8730a28 100644
--- a/.azure-pipelines/test_plan.py
+++ b/.azure-pipelines/test_plan.py
@@ -170,16 +170,16 @@ def run_cmd(cmd):
 
 class TestPlanManager(object):
 
-    def __init__(self, scheduler_url, frontend_url, elastictest_msal_client_id, sonic_automation_umi):
+    def __init__(self, scheduler_url, frontend_url, client_id, managed_identity_id):
         self.scheduler_url = scheduler_url
         self.frontend_url = frontend_url
-        self.elastictest_msal_client_id = elastictest_msal_client_id
-        self.sonic_automation_umi = sonic_automation_umi
+        self.client_id = client_id
+        self.managed_identity_id = managed_identity_id
 
     def get_token(self):
 
         # 1. Run az login with re-try
-        az_login_cmd = f"az login --identity --username {self.sonic_automation_umi}"
+        az_login_cmd = f"az login --identity --username {self.managed_identity_id}"
         az_login_attempts = 0
         while az_login_attempts < MAX_GET_TOKEN_RETRY_TIMES:
             try:
@@ -198,7 +198,7 @@ def get_token(self):
             raise Exception(f"Failed to az login after {MAX_GET_TOKEN_RETRY_TIMES} attempts.")
 
         # 2. Get access token with re-try
-        get_token_cmd = f"az account get-access-token --resource {self.elastictest_msal_client_id}"
+        get_token_cmd = f"az account get-access-token --resource {self.client_id}"
         get_token_attempts = 0
         while get_token_attempts < MAX_GET_TOKEN_RETRY_TIMES:
             try:
@@ -920,13 +920,13 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
 
     print(f"Test plan utils parameters: {args}")
 
-    required_env = ["ELASTICTEST_SCHEDULER_BACKEND_URL", "ELASTICTEST_MSAL_CLIENT_ID", "SONIC_AUTOMATION_UMI"]
+    required_env = ["ELASTICTEST_SCHEDULER_BACKEND_URL", "CLIENT_ID", "MANAGED_IDENTITY_ID"]
 
     env = {
         "elastictest_scheduler_backend_url": os.environ.get("ELASTICTEST_SCHEDULER_BACKEND_URL"),
-        "elastictest_msal_client_id": os.environ.get("ELASTICTEST_MSAL_CLIENT_ID"),
+        "client_id": os.environ.get("ELASTICTEST_MSAL_CLIENT_ID"),
         "frontend_url": os.environ.get("ELASTICTEST_FRONTEND_URL", "https://elastictest.org"),
-        "sonic_automation_umi": os.environ.get("SONIC_AUTOMATION_UMI"),
+        "managed_identity_id": os.environ.get("SONIC_AUTOMATION_UMI"),
     }
     env_missing = [k.upper() for k, v in env.items() if k.upper() in required_env and not v]
     if env_missing:
@@ -937,8 +937,8 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
         tp = TestPlanManager(
             env["elastictest_scheduler_backend_url"],
             env["frontend_url"],
-            env["elastictest_msal_client_id"],
-            env["sonic_automation_umi"]
+            env["client_id"],
+            env["managed_identity_id"]
         )
 
         if args.action == "create":

From 2f9d177b11415acbf620691ed3e6c8e9e789f58a Mon Sep 17 00:00:00 2001
From: chunangli <chunang_li@163.com>
Date: Mon, 4 Nov 2024 14:37:25 +0800
Subject: [PATCH 3/7] fix and refined

Signed-off-by: Chun'ang Li <chunangli@microsoft.com>
---
 .azure-pipelines/test_plan.py | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)

diff --git a/.azure-pipelines/test_plan.py b/.azure-pipelines/test_plan.py
index 961a8730a28..77371ba6b11 100644
--- a/.azure-pipelines/test_plan.py
+++ b/.azure-pipelines/test_plan.py
@@ -1,3 +1,10 @@
+"""
+This script manages the creation, polling, and cancellation of test plans on multiple pipelines.
+
+Important!!!
+- Any updates to this script must be tested on all dependent pipelines to ensure compatibility and prevent disruptions.
+"""
+
 from __future__ import print_function, division
 
 import argparse
@@ -920,25 +927,25 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
 
     print(f"Test plan utils parameters: {args}")
 
-    required_env = ["ELASTICTEST_SCHEDULER_BACKEND_URL", "CLIENT_ID", "MANAGED_IDENTITY_ID"]
+    required_env = ["ELASTICTEST_SCHEDULER_BACKEND_URL", "CLIENT_ID", "SONIC_AUTOMATION_UMI"]
 
     env = {
-        "elastictest_scheduler_backend_url": os.environ.get("ELASTICTEST_SCHEDULER_BACKEND_URL"),
-        "client_id": os.environ.get("ELASTICTEST_MSAL_CLIENT_ID"),
-        "frontend_url": os.environ.get("ELASTICTEST_FRONTEND_URL", "https://elastictest.org"),
-        "managed_identity_id": os.environ.get("SONIC_AUTOMATION_UMI"),
+        "ELASTICTEST_SCHEDULER_BACKEND_URL": os.environ.get("ELASTICTEST_SCHEDULER_BACKEND_URL"),
+        "CLIENT_ID": os.environ.get("ELASTICTEST_MSAL_CLIENT_ID"),
+        "FRONTEND_URL": os.environ.get("ELASTICTEST_FRONTEND_URL", "https://elastictest.org"),
+        "SONIC_AUTOMATION_UMI": os.environ.get("SONIC_AUTOMATION_UMI"),
     }
     env_missing = [k.upper() for k, v in env.items() if k.upper() in required_env and not v]
     if env_missing:
-        print(f"Missing required environment variables: {env_missing}")
+        print(f"Missing required environment variables: {env_missing}.")
         sys.exit(1)
 
     try:
         tp = TestPlanManager(
-            env["elastictest_scheduler_backend_url"],
-            env["frontend_url"],
-            env["client_id"],
-            env["managed_identity_id"]
+            env["ELASTICTEST_SCHEDULER_BACKEND_URL"],
+            env["FRONTEND_URL"],
+            env["CLIENT_ID"],
+            env["SONIC_AUTOMATION_UMI"]
         )
 
         if args.action == "create":

From 220e420e95314aa9960fe800c9bf0f1c446cfef5 Mon Sep 17 00:00:00 2001
From: chunangli <chunang_li@163.com>
Date: Mon, 4 Nov 2024 14:58:20 +0800
Subject: [PATCH 4/7] refine description

Signed-off-by: Chun'ang Li <chunangli@microsoft.com>
---
 .azure-pipelines/run-test-elastictest-template.yml | 7 +++++++
 .azure-pipelines/test_plan.py                      | 8 +++++---
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/.azure-pipelines/run-test-elastictest-template.yml b/.azure-pipelines/run-test-elastictest-template.yml
index ebd09be86b2..7dff58bb846 100644
--- a/.azure-pipelines/run-test-elastictest-template.yml
+++ b/.azure-pipelines/run-test-elastictest-template.yml
@@ -1,3 +1,10 @@
+# Description:
+# - This template manages the entire life cycle of the Elastictest test plan, from creation to completion.
+#
+# Important!!!:
+# - This template is referenced in multiple pipelines.
+# - Any updates to this file must be tested on all dependent pipelines to ensure compatibility and prevent disruptions.
+
 parameters:
   - name: TOPOLOGY
     type: string
diff --git a/.azure-pipelines/test_plan.py b/.azure-pipelines/test_plan.py
index 77371ba6b11..1cc48fdbd31 100644
--- a/.azure-pipelines/test_plan.py
+++ b/.azure-pipelines/test_plan.py
@@ -1,8 +1,10 @@
 """
-This script manages the creation, polling, and cancellation of test plans on multiple pipelines.
+Description:
+- This script provides access to Elastictest test plan API, including creating, canceling, and polling status.
 
-Important!!!
-- Any updates to this script must be tested on all dependent pipelines to ensure compatibility and prevent disruptions.
+Important!!!:
+- This script is downloaded in multiple pipelines.
+- Any updates to this file must be tested on all dependent pipelines to ensure compatibility and prevent disruptions.
 """
 
 from __future__ import print_function, division

From 9e1b27c023d5dd2ac1ffe120757f0f5744ed9242 Mon Sep 17 00:00:00 2001
From: chunangli <chunang_li@163.com>
Date: Mon, 4 Nov 2024 15:01:02 +0800
Subject: [PATCH 5/7] refine description

Signed-off-by: Chun'ang Li <chunangli@microsoft.com>
---
 .azure-pipelines/run-test-elastictest-template.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/run-test-elastictest-template.yml b/.azure-pipelines/run-test-elastictest-template.yml
index 7dff58bb846..882ab9ce6b9 100644
--- a/.azure-pipelines/run-test-elastictest-template.yml
+++ b/.azure-pipelines/run-test-elastictest-template.yml
@@ -1,5 +1,5 @@
 # Description:
-# - This template manages the entire life cycle of the Elastictest test plan, from creation to completion.
+# - This template manages the entire life cycle of the Elastictest test plan in test pipelines.
 #
 # Important!!!:
 # - This template is referenced in multiple pipelines.

From 8fd21d52981fe7a890e9ff772e3bd9cb36217e6d Mon Sep 17 00:00:00 2001
From: chunangli <chunang_li@163.com>
Date: Mon, 4 Nov 2024 15:34:09 +0800
Subject: [PATCH 6/7] Directly specify the value of MGMT_BRANCH as master.
 Because dynamic assignment does not take effect immediately for the
 conditional statement of pipeline yaml, the expected value of MGMT_BRANCH
 cannot be obtained, and the locally updated testplan.py cannot be used.

Signed-off-by: Chun'ang Li <chunangli@microsoft.com>
---
 azure-pipelines.yml | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/azure-pipelines.yml b/azure-pipelines.yml
index 5ffbf11de09..7f00e940ce0 100644
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@@ -69,7 +69,7 @@ stages:
         MIN_WORKER: $(T0_INSTANCE_NUM)
         MAX_WORKER: $(T0_INSTANCE_NUM)
         KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-        MGMT_BRANCH: $(BUILD_BRANCH)
+        MGMT_BRANCH: "master"
 
   - job: t0_2vlans_elastictest
     displayName: "kvmtest-t0-2vlans by Elastictest"
@@ -85,7 +85,7 @@ stages:
         MAX_WORKER: $(T0_2VLANS_INSTANCE_NUM)
         DEPLOY_MG_EXTRA_PARAMS: "-e vlan_config=two_vlan_a"
         KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-        MGMT_BRANCH: $(BUILD_BRANCH)
+        MGMT_BRANCH: "master"
 
   - job: t1_lag_elastictest
     displayName: "kvmtest-t1-lag by Elastictest"
@@ -99,7 +99,7 @@ stages:
         MIN_WORKER: $(T1_LAG_INSTANCE_NUM)
         MAX_WORKER: $(T1_LAG_INSTANCE_NUM)
         KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-        MGMT_BRANCH: $(BUILD_BRANCH)
+        MGMT_BRANCH: "master"
 
   - job: dualtor_elastictest
     displayName: "kvmtest-dualtor-t0 by Elastictest"
@@ -114,7 +114,7 @@ stages:
           MAX_WORKER: $(T0_DUALTOR_INSTANCE_NUM)
           COMMON_EXTRA_PARAMS: "--disable_loganalyzer "
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-          MGMT_BRANCH: $(BUILD_BRANCH)
+          MGMT_BRANCH: "master"
 
   - job: multi_asic_elastictest
     displayName: "kvmtest-multi-asic-t1-lag by Elastictest"
@@ -130,7 +130,7 @@ stages:
           MAX_WORKER: $(MULTI_ASIC_INSTANCE_NUM)
           NUM_ASIC: 4
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-          MGMT_BRANCH: $(BUILD_BRANCH)
+          MGMT_BRANCH: "master"
 
   - job: sonic_t0_elastictest
     displayName: "kvmtest-t0-sonic by Elastictest"
@@ -147,7 +147,7 @@ stages:
           COMMON_EXTRA_PARAMS: "--neighbor_type=sonic "
           VM_TYPE: vsonic
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-          MGMT_BRANCH: $(BUILD_BRANCH)
+          MGMT_BRANCH: "master"
 
   - job: dpu_elastictest
     displayName: "kvmtest-dpu by Elastictest"
@@ -161,7 +161,7 @@ stages:
           MIN_WORKER: $(T0_SONIC_INSTANCE_NUM)
           MAX_WORKER: $(T0_SONIC_INSTANCE_NUM)
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-          MGMT_BRANCH: $(BUILD_BRANCH)
+          MGMT_BRANCH: "master"
 
   - job: onboarding_elastictest_t0
     displayName: "onboarding t0 testcases by Elastictest - optional"
@@ -177,7 +177,7 @@ stages:
           MIN_WORKER: $(T0_ONBOARDING_SONIC_INSTANCE_NUM)
           MAX_WORKER: $(T0_ONBOARDING_SONIC_INSTANCE_NUM)
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-          MGMT_BRANCH: $(BUILD_BRANCH)
+          MGMT_BRANCH: "master"
           TEST_SET: onboarding_t0
 
   - job: onboarding_elastictest_t1
@@ -194,7 +194,7 @@ stages:
           MIN_WORKER: $(T1_LAG_ONBOARDING_INSTANCE_NUM)
           MAX_WORKER: $(T1_LAG_ONBOARDING_INSTANCE_NUM)
           KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-          MGMT_BRANCH: $(BUILD_BRANCH)
+          MGMT_BRANCH: "master"
           TEST_SET: onboarding_t1
 
 #  - job: onboarding_elastictest_dualtor
@@ -211,7 +211,7 @@ stages:
 #          MIN_WORKER: $(T0_DUALTOR_INSTANCE_NUM)
 #          MAX_WORKER: $(T0_DUALTOR_INSTANCE_NUM)
 #          KVM_IMAGE_BRANCH: $(BUILD_BRANCH)
-#          MGMT_BRANCH: $(BUILD_BRANCH)
+#          MGMT_BRANCH: "master"
 #          TEST_SET: onboarding_dualtor
 
 #  - job: wan_elastictest

From 37889da6f27fff4e4794708eb663968be7cc1cee Mon Sep 17 00:00:00 2001
From: chunangli <chunang_li@163.com>
Date: Tue, 19 Nov 2024 11:04:28 +0800
Subject: [PATCH 7/7] remove unused dump_kvm param

Signed-off-by: Chun'ang Li <chunangli@microsoft.com>
---
 .../run-test-elastictest-template.yml         | 28 -------------------
 .azure-pipelines/test_plan.py                 | 13 ---------
 2 files changed, 41 deletions(-)

diff --git a/.azure-pipelines/run-test-elastictest-template.yml b/.azure-pipelines/run-test-elastictest-template.yml
index 0d1e74835f9..64a1922edf2 100644
--- a/.azure-pipelines/run-test-elastictest-template.yml
+++ b/.azure-pipelines/run-test-elastictest-template.yml
@@ -126,13 +126,6 @@ parameters:
     type: string
     default: ""
 
-  - name: DUMP_KVM_IF_FAIL
-    type: string
-    default: "False"  # KVM dump has beed deleted
-    values:
-      - "True"
-      - "False"
-
   - name: REQUESTER
     type: string
     default: ""
@@ -250,7 +243,6 @@ steps:
       --mgmt-branch ${{ parameters.MGMT_BRANCH }} \
       --stop-on-failure ${{ parameters.STOP_ON_FAILURE }} \
       --retry-times ${{ parameters.RETRY_TIMES }} \
-      --dump-kvm-if-fail ${{ parameters.DUMP_KVM_IF_FAIL }} \
       --requester "${{ parameters.REQUESTER }}" \
       --max-execute-seconds $((${{ parameters.MAX_RUN_TEST_MINUTES }} * 60)) \
       --test-plan-num ${{ parameters.TEST_PLAN_NUM }}
@@ -356,26 +348,6 @@ steps:
     displayName: "Run test"
     timeoutInMinutes: ${{ parameters.MAX_RUN_TEST_MINUTES }}
 
-  - ${{ if eq(parameters.DUMP_KVM_IF_FAIL, 'True') }}:
-      - script: |
-          set -e
-          echo "KVM dump"
-
-          echo -e "\033[33mSONiC PR system-level test is powered by SONiC Elastictest, for any issue, please send email to sonicelastictest@microsoft.com \033[0m"
-          IFS=',' read -ra TEST_PLAN_ID_LIST <<< "$TEST_PLAN_ID_LIST_STRING"
-          for TEST_PLAN_ID in "${TEST_PLAN_ID_LIST[@]}"
-          do
-              echo -e -n "\033[33mPlease visit Elastictest page \033[0m"
-              echo -n "$(ELASTICTEST_FRONTEND_URL)/scheduler/testplan/$TEST_PLAN_ID "
-              echo -e "\033[33mfor detailed test plan progress \033[0m"
-              # When "KVMDUMP" finish, it changes into "FAILED", "CANCELLED" or "FINISHED"
-              echo "##[group][test_plan.py] poll KVMDUMP status"
-              python ./.azure-pipelines/test_plan.py poll -i $TEST_PLAN_ID --expected-state KVMDUMP
-          done
-
-        condition: succeededOrFailed()
-        displayName: "KVM dump"
-
   - script: |
       set -e
       echo "Try to cancel test plan $TEST_PLAN_ID, cancelling finished test plan has no effect."
diff --git a/.azure-pipelines/test_plan.py b/.azure-pipelines/test_plan.py
index 1cc48fdbd31..7753f6d6e64 100644
--- a/.azure-pipelines/test_plan.py
+++ b/.azure-pipelines/test_plan.py
@@ -329,7 +329,6 @@ def create(self, topology, test_plan_name="my_test_plan", deploy_mg_extra_params
                 "affinity": affinity,
                 "deploy_mg_param": deploy_mg_extra_params,
                 "max_execute_seconds": kwargs.get("max_execute_seconds", None),
-                "dump_kvm_if_fail": kwargs.get("dump_kvm_if_fail", False),
             },
             "type": test_plan_type,
             "trigger": {
@@ -826,17 +825,6 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
         required=False,
         help="Retry times after tests failed."
     )
-    parser_create.add_argument(
-        "--dump-kvm-if-fail",
-        type=ast.literal_eval,
-        dest="dump_kvm_if_fail",
-        nargs='?',
-        const='True',
-        default='True',
-        required=False,
-        choices=[True, False],
-        help="Dump KVM DUT if test plan failed, only supports KVM test plan."
-    )
     parser_create.add_argument(
         "--requester",
         type=str,
@@ -1010,7 +998,6 @@ def poll(self, test_plan_id, interval=60, timeout=-1, expected_state="", expecte
                     platform=args.platform,
                     stop_on_failure=args.stop_on_failure,
                     retry_times=args.retry_times,
-                    dump_kvm_if_fail=args.dump_kvm_if_fail,
                     requester=args.requester,
                     max_execute_seconds=args.max_execute_seconds,
                     lock_wait_timeout_seconds=args.lock_wait_timeout_seconds,