 import sys
 import json
 import time
+import subprocess
+import shutil
 import numpy as np
 from dataclasses import dataclass
 from contextlib import contextmanager
@@ -23,6 +25,98 @@ def naive_timer(duration_box, synchronizer_func):
     duration_box.value = (end - start) * 1000  # Store in milliseconds


+def get_device_utilization(device_id, device_count, synchronizer_func):
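+    # Sample nvidia-smi a few times to capture the peak GPU and memory
+    # utilization of the selected device, and log any other processes using it.
+    # Returns (None, None) when nvidia-smi is unavailable or the query fails.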
+    current_pid = os.getpid()
+
+    if shutil.which("nvidia-smi"):
+        try:
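+            # Map the logical device index to the physical GPU index,
+            # honoring CUDA_VISIBLE_DEVICES when it is set.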
+            cuda_devices_str = os.getenv("CUDA_VISIBLE_DEVICES", "")
+            if cuda_devices_str != "":
+                cuda_devices = list(map(int, cuda_devices_str.split(",")))
+            else:
+                cuda_devices = list(range(device_count))
+            selected_gpu_id = cuda_devices[device_id]
+
+            print(
+                f"Checking the status of GPU {selected_gpu_id} 5 times.",
+                file=sys.stderr,
+                flush=True,
+            )
+            selected_gpu_uuid, max_gpu_util, max_mem_util = None, 0.0, 0.0
+            for i in range(5):
+                synchronizer_func()
+                time.sleep(1)
+
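+                # Ask nvidia-smi for the utilization and memory usage of every
+                # visible GPU, as headerless, unitless CSV.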
+                output = (
+                    subprocess.check_output(
+                        [
+                            "nvidia-smi",
+                            "--query-gpu=index,gpu_uuid,utilization.gpu,memory.used,memory.total",
+                            "--format=csv,noheader,nounits",
+                        ]
+                    )
+                    .decode()
+                    .strip()
+                )
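+                # Parse the CSV rows and stop at the row for the selected GPU,
+                # remembering its UUID so compute apps can be matched to it later.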
+                for line in output.split("\n"):
+                    if line.strip():
+                        (
+                            gpu_id,
+                            selected_gpu_uuid,
+                            gpu_util,
+                            used_mem,
+                            mem_total,
+                        ) = line.split(", ")
+                        if int(gpu_id) == selected_gpu_id:
+                            break
+
+                gpu_util = float(gpu_util)
+                mem_util = float(used_mem) * 100 / float(mem_total)
+                print(
+                    f"- gpu_id: {selected_gpu_id}, gpu_uuid: {selected_gpu_uuid}, gpu_util: {gpu_util:.2f}%, used_mem: {used_mem}, mem_total: {mem_total}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+
+                max_gpu_util = gpu_util if gpu_util > max_gpu_util else max_gpu_util
+                max_mem_util = mem_util if mem_util > max_mem_util else max_mem_util
+
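+            # List the other processes currently using this GPU so that
+            # contention from concurrent jobs shows up in the logs.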
+            other_tasks = []
+            output = (
+                subprocess.check_output(
+                    [
+                        "nvidia-smi",
+                        "--query-compute-apps=gpu_uuid,pid,used_memory",
+                        "--format=csv,noheader,nounits",
+                    ]
+                )
+                .decode()
+                .strip()
+            )
+            for line in output.split("\n"):
+                if line.strip():
+                    gpu_uuid, pid, used_memory = line.split(", ")
+                    if gpu_uuid == selected_gpu_uuid and int(pid) != current_pid:
+                        other_tasks.append(line)
+            print(
+                f"Note: There are {len(other_tasks)} other tasks running on GPU {selected_gpu_id}.",
+                file=sys.stderr,
+                flush=True,
+            )
+            for task in other_tasks:
+                gpu_uuid, pid, used_memory = task.split(", ")
+                print(
+                    f"- gpu_uuid:{gpu_uuid}, pid:{pid}, used_memory:{used_memory}",
+                    file=sys.stderr,
+                    flush=True,
+                )
+            return max_gpu_util, max_mem_util
+        except subprocess.CalledProcessError:
+            pass
+
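+    # nvidia-smi is missing or the query failed: utilization is unknown.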
+    return None, None
+
+
 def get_timing_stats(elapsed_times):
     stats = {
         "mean": float(f"{np.mean(elapsed_times):.6g}"),