Commit ab99c78

Add a check of GPU utilization before the test.
1 parent 96785c6

2 files changed (+110, -3 lines)

graph_net/paddle/test_compiler.py

Lines changed: 16 additions & 3 deletions
@@ -371,15 +371,15 @@ def test_multi_models(args):
     assert os.path.isfile(args.allow_list)
     graphnet_root = path_utils.get_graphnet_root()
     print(f"graphnet_root: {graphnet_root}", file=sys.stderr, flush=True)
-    verified_samples = []
-    with open(args.verified_samples_list_path, "r") as f:
+    test_samples = []
+    with open(args.allow_list, "r") as f:
         for line in f.readlines():
             test_samples.append(os.path.join(graphnet_root, line.strip()))
 
     sample_idx = 0
     failed_samples = []
     for model_path in path_utils.get_recursively_model_path(args.model_path):
-        if verified_samples is None or os.path.abspath(model_path) in verified_samples:
+        if test_samples is None or os.path.abspath(model_path) in test_samples:
             print(
                 f"[{sample_idx}] test_compiler, model_path: {model_path}",
                 file=sys.stderr,
@@ -416,6 +416,19 @@ def main(args):
     assert os.path.isdir(args.model_path)
     assert args.compiler in {"cinn", "nope"}
 
+    if paddle.device.is_compiled_with_cuda():
+        device_id = int(paddle.device.get_device().split(":")[-1])
+        device_count = paddle.device.cuda.device_count()
+        gpu_util, mem_util = test_compiler_util.get_device_utilization(
+            device_id, device_count, get_synchronizer_func(args)
+        )
+        if gpu_util is not None and mem_util is not None:
+            print(
+                f"Device status: gpu_id {device_id}, gpu_util {gpu_util:.2f}%, mem_util {mem_util:.2f}%",
+                file=sys.stderr,
+                flush=True,
+            )
+
     initalize_seed = 123
     set_seed(random_seed=initalize_seed)
 
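For context, a minimal sketch (not part of the commit) of how the renamed allow list is consumed: one repo-relative sample path per line, stripped and joined onto graphnet_root, then matched against each model's absolute path. The root path and entries below are hypothetical.

import os

graphnet_root = "/workspace/GraphNet"  # hypothetical checkout location
allow_list_lines = [
    "samples/paddle/resnet50\n",  # hypothetical entries, one per line
    "samples/paddle/bert_base\n",
]

# Mirrors the loop in test_multi_models: strip each line, join onto the root.
test_samples = [os.path.join(graphnet_root, line.strip()) for line in allow_list_lines]

# A model is tested only if its absolute path appears in the allow list.
model_path = "/workspace/GraphNet/samples/paddle/resnet50"
print(os.path.abspath(model_path) in test_samples)  # True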

graph_net/test_compiler_util.py

Lines changed: 94 additions & 0 deletions
@@ -3,6 +3,8 @@
 import sys
 import json
 import time
+import subprocess
+import shutil
 import numpy as np
 from dataclasses import dataclass
 from contextlib import contextmanager
@@ -23,6 +25,98 @@ def naive_timer(duration_box, synchronizer_func):
     duration_box.value = (end - start) * 1000  # Store in milliseconds
 
 
+def get_device_utilization(device_id, device_count, synchronizer_func):
+    current_pid = os.getpid()
+
+    if shutil.which("nvidia-smi"):
+        try:
+            cuda_devices_str = os.getenv("CUDA_VISIBLE_DEVICES", "")
+            if cuda_devices_str != "":
+                cuda_devices = list(map(int, cuda_devices_str.split(",")))
+            else:
+                cuda_devices = list(range(device_count))
+            selected_gpu_id = cuda_devices[device_id]
+
+            print(
+                f"Check the status of GPU {selected_gpu_id} for 5 times.",
+                file=sys.stderr,
+                flush=True,
+            )
+            selected_gpu_uuid, max_gpu_util, max_mem_util = None, 0.0, 0.0
+            for i in range(5):
+                synchronizer_func()
+                time.sleep(1)
+
+                output = (
+                    subprocess.check_output(
+                        [
+                            "nvidia-smi",
+                            "--query-gpu=index,gpu_uuid,utilization.gpu,memory.used,memory.total",
+                            "--format=csv,noheader,nounits",
+                        ]
+                    )
+                    .decode()
+                    .strip()
+                )
+                for line in output.split("\n"):
+                    if line.strip():
+                        (
+                            gpu_id,
+                            selected_gpu_uuid,
+                            gpu_util,
+                            used_mem,
+                            mem_total,
+                        ) = line.split(", ")
+                        if int(gpu_id) == selected_gpu_id:
+                            break
+
+                gpu_util = float(gpu_util)
+                mem_util = float(used_mem) * 100 / float(mem_total)
+                print(
+                    f"- gpu_id: {selected_gpu_id}, gpu_uuid: {selected_gpu_uuid}, gpu_util: {gpu_util:.2f}%, used_mem: {used_mem}, mem_total: {mem_total}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+
+                max_gpu_util = gpu_util if gpu_util > max_gpu_util else max_gpu_util
+                max_mem_util = mem_util if mem_util > max_mem_util else max_mem_util
+
+            other_tasks = []
+            output = (
+                subprocess.check_output(
+                    [
+                        "nvidia-smi",
+                        "--query-compute-apps=gpu_uuid,pid,used_memory",
+                        "--format=csv,noheader,nounits",
+                    ]
+                )
+                .decode()
+                .strip()
+            )
+            for line in output.split("\n"):
+                if line.strip():
+                    gpu_uuid, pid, used_memory = line.split(", ")
+                    if gpu_uuid == selected_gpu_uuid and int(pid) != current_pid:
+                        other_tasks.append(line)
+            print(
+                f"Note: There are {len(other_tasks)} tasks running on GPU {selected_gpu_id}.",
+                file=sys.stderr,
+                flush=True,
+            )
+            for task in other_tasks:
+                gpu_uuid, pid, used_memory = task.split(", ")
+                print(
+                    f"- gpu_uuid:{gpu_uuid}, pid:{pid}, used_memory:{used_memory}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+            return max_gpu_util, max_mem_util
+        except subprocess.CalledProcessError:
+            pass
+
+    return None, None
+
+
 def get_timing_stats(elapsed_times):
     stats = {
         "mean": float(f"{np.mean(elapsed_times):.6g}"),
