Commit ab99c78

Add a check of GPU utilization before the test.
1 parent 96785c6

2 files changed (+110, -3 lines)

graph_net/paddle/test_compiler.py

Lines changed: 16 additions & 3 deletions
@@ -371,15 +371,15 @@ def test_multi_models(args):
     assert os.path.isfile(args.allow_list)
     graphnet_root = path_utils.get_graphnet_root()
     print(f"graphnet_root: {graphnet_root}", file=sys.stderr, flush=True)
-    verified_samples = []
-    with open(args.verified_samples_list_path, "r") as f:
+    test_samples = []
+    with open(args.allow_list, "r") as f:
         for line in f.readlines():
             test_samples.append(os.path.join(graphnet_root, line.strip()))
 
     sample_idx = 0
     failed_samples = []
     for model_path in path_utils.get_recursively_model_path(args.model_path):
-        if verified_samples is None or os.path.abspath(model_path) in verified_samples:
+        if test_samples is None or os.path.abspath(model_path) in test_samples:
             print(
                 f"[{sample_idx}] test_compiler, model_path: {model_path}",
                 file=sys.stderr,
@@ -416,6 +416,19 @@ def main(args):
     assert os.path.isdir(args.model_path)
     assert args.compiler in {"cinn", "nope"}
 
+    if paddle.device.is_compiled_with_cuda():
+        device_id = int(paddle.device.get_device().split(":")[-1])
+        device_count = paddle.device.cuda.device_count()
+        gpu_util, mem_util = test_compiler_util.get_device_utilization(
+            device_id, device_count, get_synchronizer_func(args)
+        )
+        if gpu_util is not None and mem_util is not None:
+            print(
+                f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
+                file=sys.stderr,
+                flush=True,
+            )
+
     initalize_seed = 123
     set_seed(random_seed=initalize_seed)
 
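For context, a minimal sketch (not part of the commit) of how the renamed allow list is consumed: one repo-relative sample path per line, stripped and joined onto graphnet_root, then matched against each model's absolute path. The root path and entries below are hypothetical.

import os

graphnet_root = "/workspace/GraphNet"  # hypothetical checkout location
allow_list_lines = [
    "samples/paddle/resnet50\n",  # hypothetical entries, one per line
    "samples/paddle/bert_base\n",
]

# Mirrors the loop in test_multi_models: strip each line, join onto the root.
test_samples = [os.path.join(graphnet_root, line.strip()) for line in allow_list_lines]

# A model is tested only if its absolute path appears in the allow list.
model_path = "/workspace/GraphNet/samples/paddle/resnet50"
print(os.path.abspath(model_path) in test_samples)  # True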

graph_net/test_compiler_util.py

Lines changed: 94 additions & 0 deletions
@@ -3,6 +3,8 @@
 import sys
 import json
 import time
+import subprocess
+import shutil
 import numpy as np
 from dataclasses import dataclass
 from contextlib import contextmanager
@@ -23,6 +25,98 @@ def naive_timer(duration_box, synchronizer_func):
     duration_box.value = (end - start) * 1000  # Store in milliseconds
 
 
+def get_device_utilization(device_id, device_count, synchronizer_func):
+    current_pid = os.getpid()
+
+    if shutil.which("nvidia-smi"):
+        try:
+            cuda_devices_str = os.getenv("CUDA_VISIBLE_DEVICES", "")
+            if cuda_devices_str != "":
+                cuda_devices = list(map(int, cuda_devices_str.split(",")))
+            else:
+                cuda_devices = list(range(device_count))
+            selected_gpu_id = cuda_devices[device_id]
+
+            print(
+                f"Check the status of GPU {selected_gpu_id} for 5 times.",
+                file=sys.stderr,
+                flush=True,
+            )
+            selected_gpu_uuid, max_gpu_util, max_mem_util = None, 0.0, 0.0
+            for i in range(5):
+                synchronizer_func()
+                time.sleep(1)
+
+                output = (
+                    subprocess.check_output(
+                        [
+                            "nvidia-smi",
+                            "--query-gpu=index,gpu_uuid,utilization.gpu,memory.used,memory.total",
+                            "--format=csv,noheader,nounits",
+                        ]
+                    )
+                    .decode()
+                    .strip()
+                )
+                for line in output.split("\n"):
+                    if line.strip():
+                        (
+                            gpu_id,
+                            selected_gpu_uuid,
+                            gpu_util,
+                            used_mem,
+                            mem_total,
+                        ) = line.split(", ")
+                        if int(gpu_id) == selected_gpu_id:
+                            break
+
+                gpu_util = float(gpu_util)
+                mem_util = float(used_mem) * 100 / float(mem_total)
+                print(
+                    f"- gpu_id: {selected_gpu_id}, gpu_uuid: {selected_gpu_uuid}, gpu_util: {gpu_util:.2f}%, used_mem: {used_mem}, mem_total: {mem_total}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+
+                max_gpu_util = gpu_util if gpu_util > max_gpu_util else max_gpu_util
+                max_mem_util = mem_util if mem_util > max_mem_util else max_mem_util
+
+            other_tasks = []
+            output = (
+                subprocess.check_output(
+                    [
+                        "nvidia-smi",
+                        "--query-compute-apps=gpu_uuid,pid,used_memory",
+                        "--format=csv,noheader,nounits",
+                    ]
+                )
+                .decode()
+                .strip()
+            )
+            for line in output.split("\n"):
+                if line.strip():
+                    gpu_uuid, pid, used_memory = line.split(", ")
+                    if gpu_uuid == selected_gpu_uuid and int(pid) != current_pid:
+                        other_tasks.append(line)
+            print(
+                f"Note: There are {len(other_tasks)} tasks running on GPU {selected_gpu_id}.",
+                file=sys.stderr,
+                flush=True,
+            )
+            for task in other_tasks:
+                gpu_uuid, pid, used_memory = task.split(", ")
+                print(
+                    f"- gpu_uuid:{gpu_uuid}, pid:{pid}, used_memory:{used_memory}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+            return max_gpu_util, max_mem_util
+        except subprocess.CalledProcessError:
+            pass
+
+    return None, None
+
+
 def get_timing_stats(elapsed_times):
     stats = {
         "mean": float(f"{np.mean(elapsed_times):.6g}"),
