Skip to content

Commit b44224e

Browse files
committed
Gpu memory -> Gpu info: memory + utilization
1 parent d2adc85 commit b44224e

5 files changed

Lines changed: 120 additions & 84 deletions

File tree

examples/contrib/mnist/mnist_with_tqdm_logger.py

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,7 +44,7 @@ def get_data_loaders(train_batch_size, val_batch_size):
4444
return train_loader, val_loader
4545

4646

47-
def run(train_batch_size, val_batch_size, epochs, lr, momentum):
47+
def run(train_batch_size, val_batch_size, epochs, lr, momentum, display_gpu_info):
4848
train_loader, val_loader = get_data_loaders(train_batch_size, val_batch_size)
4949
model = Net()
5050
device = 'cpu'
@@ -61,8 +61,12 @@ def run(train_batch_size, val_batch_size, epochs, lr, momentum):
6161

6262
RunningAverage(output_transform=lambda x: x).attach(trainer, 'loss')
6363

64+
if display_gpu_info:
65+
from ignite.contrib.metrics import GpuInfo
66+
GpuInfo().attach(trainer, name='gpu')
67+
6468
pbar = ProgressBar(persist=True)
65-
pbar.attach(trainer, ['loss'])
69+
pbar.attach(trainer, metric_names=['loss', 'gpu:0 memory', 'gpu:0 util'])
6670

6771
@trainer.on(Events.EPOCH_COMPLETED)
6872
def log_training_results(engine):
@@ -102,7 +106,9 @@ def log_validation_results(engine):
102106
help='learning rate (default: 0.01)')
103107
parser.add_argument('--momentum', type=float, default=0.5,
104108
help='SGD momentum (default: 0.5)')
109+
parser.add_argument('--display_gpu_info', action='store_true',
110+
help='Display gpu usage info. This needs python 3.X and pynvml package')
105111

106112
args = parser.parse_args()
107113

108-
run(args.batch_size, args.val_batch_size, args.epochs, args.lr, args.momentum)
114+
run(args.batch_size, args.val_batch_size, args.epochs, args.lr, args.momentum, args.display_gpu_info)

ignite/contrib/metrics/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
from ignite.contrib.metrics.average_precision import AveragePrecision
22
from ignite.contrib.metrics.roc_auc import ROC_AUC
33
import ignite.contrib.metrics.regression
4-
from ignite.contrib.metrics.gpu_memory import GpuMemory
4+
from ignite.contrib.metrics.gpu_info import GpuInfo

ignite/contrib/metrics/gpu_info.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
# -*- coding: utf-8 -*-
2+
import warnings
3+
4+
import torch
5+
6+
from ignite.metrics import Metric
7+
from ignite.engine import Events
8+
9+
10+
class GpuInfo(Metric):
11+
"""GPU information: a) used / max memory, b) gpu utilization values as Metric.
12+
13+
Examples:
14+
15+
.. code-block:: python
16+
17+
# Default GPU measurement
18+
GpuInfo().attach(trainer) # default metric names are 'gpu info:N memory', 'gpu info:N util'
19+
ProgressBar(persist=True).attach(trainer, metric_names=['gpu info:0 memory', 'gpu info:0 util'])
20+
21+
# Progress bar will look like
22+
# Epoch [2/50]: [64/128] 50%|█████ , gpu info:0 memory=1120/11176 MiB, gpu info:0 util=69% [06:17<12:34]
23+
24+
"""
25+
26+
def __init__(self):
27+
try:
28+
import pynvml
29+
except ImportError:
30+
raise RuntimeError("This contrib module requires pynvml to be installed. "
31+
"Please install it with command: \n pip install pynvml")
32+
# Let's check available devices
33+
if not torch.cuda.is_available():
34+
raise RuntimeError("This contrib module requires available GPU")
35+
36+
from pynvml.smi import nvidia_smi
37+
# Let it fail if no libnvidia drivers or NVML library found
38+
self.nvsmi = nvidia_smi.getInstance()
39+
super(GpuInfo, self).__init__()
40+
41+
def reset(self):
42+
pass
43+
44+
def update(self, output):
45+
pass
46+
47+
def compute(self):
48+
data = self.nvsmi.DeviceQuery('memory.used, memory.total, utilization.gpu')
49+
if len(data) == 0 or ('gpu' not in data):
50+
warnings.warn("No GPU information available")
51+
return []
52+
return data['gpu']
53+
54+
def completed(self, engine, name):
55+
data = self.compute()
56+
if len(data) < 1:
57+
warnings.warn("No GPU information available")
58+
return
59+
60+
for i, data_by_rank in enumerate(data):
61+
mem_name = "{}:{} memory".format(name, i)
62+
63+
if 'fb_memory_usage' not in data_by_rank:
64+
warnings.warn("No GPU memory usage information available in {}".format(data_by_rank))
65+
continue
66+
mem_report = data_by_rank['fb_memory_usage']
67+
if not ('used' in mem_report and 'total' in mem_report):
68+
warnings.warn("GPU memory usage information does not provide used/total "
69+
"memory consumption information in {}".format(mem_report))
70+
continue
71+
72+
engine.state.metrics[mem_name] = "{}/{} MiB".format(int(mem_report['used']), int(mem_report['total']))
73+
74+
util_name = "{}:{} util".format(name, i)
75+
if 'utilization' not in data_by_rank:
76+
warnings.warn("No GPU utilization information available in {}".format(data_by_rank))
77+
continue
78+
util_report = data_by_rank['utilization']
79+
if not ('gpu_util' in util_report):
80+
warnings.warn("GPU utilization information does not provide 'gpu_util' information in "
81+
"{}".format(util_report))
82+
continue
83+
84+
engine.state.metrics[util_name] = "{:02d}%".format(int(util_report['gpu_util']))
85+
86+
def attach(self, engine, name="gpu info", event_name=Events.ITERATION_COMPLETED):
87+
engine.add_event_handler(event_name, self.completed, name)

ignite/contrib/metrics/gpu_memory.py

Lines changed: 0 additions & 66 deletions
This file was deleted.

tests/ignite/contrib/metrics/test_gpu_memory.py renamed to tests/ignite/contrib/metrics/test_gpu_info.py

Lines changed: 23 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import torch
44

55
from ignite.engine import Engine, State
6-
from ignite.contrib.metrics import GpuMemory
6+
from ignite.contrib.metrics import GpuInfo
77

88
import pytest
99

@@ -26,38 +26,47 @@ def no_site_packages():
2626
def test_no_pynvml_package(no_site_packages):
2727

2828
with pytest.raises(RuntimeError, match="This contrib module requires pynvml to be installed."):
29-
GpuMemory()
29+
GpuInfo()
3030

3131

3232
@pytest.mark.skipif(sys.version[0] == "2" or torch.cuda.is_available(), reason="No pynvml for python 2.7")
3333
def test_no_gpu():
3434

3535
with pytest.raises(RuntimeError, match="This contrib module requires available GPU"):
36-
GpuMemory()
36+
GpuInfo()
3737

3838

3939
@pytest.mark.skipif(sys.version[0] == "2" or not (torch.cuda.is_available()),
4040
reason="No pynvml for python 2.7 and no GPU")
4141
def test_gpu_mem_consumption():
4242

43-
gpu_mem = GpuMemory()
43+
gpu_info = GpuInfo()
4444

4545
t = torch.rand(4, 10, 100, 100)
46-
data = gpu_mem.compute()
46+
data = gpu_info.compute()
4747
assert len(data) > 0
4848
assert "fb_memory_usage" in data[0]
49-
report = data[0]['fb_memory_usage']
50-
assert 'used' in report and 'total' in report
51-
assert report['total'] > 0.0
52-
assert report['used'] > t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3] / 1024.0 / 1024.0
49+
mem_report = data[0]['fb_memory_usage']
50+
assert 'used' in mem_report and 'total' in mem_report
51+
assert mem_report['total'] > 0.0
52+
assert mem_report['used'] > t.shape[0] * t.shape[1] * t.shape[2] * t.shape[3] / 1024.0 / 1024.0
53+
54+
assert "utilization" in data[0]
55+
util_report = data[0]['utilization']
56+
assert 'gpu_util' in util_report
5357

5458
# with Engine
5559
engine = Engine(lambda engine, batch: 0.0)
5660
engine.state = State(metrics={})
5761

58-
gpu_mem.completed(engine, name='gpu mem', local_rank=0)
62+
gpu_info.completed(engine, name='gpu info')
63+
64+
assert 'gpu info:0 memory' in engine.state.metrics
65+
assert 'gpu info:0 util' in engine.state.metrics
66+
67+
assert isinstance(engine.state.metrics['gpu info:0 memory'], str)
68+
assert "{}".format(int(mem_report['used'])) in engine.state.metrics['gpu info:0 memory']
69+
assert "{}".format(int(mem_report['total'])) in engine.state.metrics['gpu info:0 memory']
5970

60-
assert 'gpu mem' in engine.state.metrics
61-
assert isinstance(engine.state.metrics['gpu mem'], str)
62-
assert "{}".format(int(report['used'])) in engine.state.metrics['gpu mem']
63-
assert "{}".format(int(report['total'])) in engine.state.metrics['gpu mem']
71+
assert isinstance(engine.state.metrics['gpu info:0 util'], str)
72+
assert "{}".format(int(util_report['gpu_util'])) in engine.state.metrics['gpu info:0 util']

0 commit comments

Comments
 (0)