-
Notifications
You must be signed in to change notification settings - Fork 219
add a CUDA device code sanity check #4692
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 110 commits
e329d46
c8cece2
ee63b8e
de6d49d
0e97868
6b6d2c8
6568909
3d07ef6
f13fca2
bbe189d
caff559
a6408ff
ba960aa
f569ba4
025604b
17dc755
354f071
4634cd4
dd5feda
25fccd1
563ba3a
ed49c46
f2f252d
1e97753
39e5652
8e70838
dbf7a7e
7a919f3
dab2042
be0a990
384c17a
5b0dd43
039542f
ec0683d
84a1905
7508104
213acef
e56cace
1c230d6
24f6b8a
92073b1
aecd62d
74d7349
31dc541
9266344
2a03e2e
931cd8c
a466f36
1bbff1b
22c3c23
4166d34
45bfcda
3b9b386
0688117
050226f
c8a448a
dd2be94
b0d5d5f
5c0adce
9c2167f
4ba7942
8d94d87
0b615e1
f9e99a2
aca934d
7fde91b
2258d91
f782df8
3ba1d7b
79d7084
316e71f
494bd95
8a9ea6d
a2960c2
a62cdaa
11cf157
8d9720e
b426226
e53143e
5f533ea
2ee867b
9c32b67
4fb884b
a02d198
45f659c
f9f3050
1a5cd5d
19cfe04
7e3a2dd
2442d44
97cef2b
c358e27
588c342
09a182f
1787c47
0f85f13
4b78bd7
581767c
f027c51
8b6c40d
d6620d4
39e5561
2bbfff9
f009b7d
a901ba5
e304b17
4273f16
c2cdb87
e9ec501
a17a42c
190156b
b14cceb
abc108b
b6eb063
22858ec
2655a07
ceacffa
e73900c
7e92cd5
5cef2e0
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -29,6 +29,7 @@ | |
|
|
||
| * Jens Timmerman (Ghent University) | ||
| * Ward Poelmans (Ghent University) | ||
| * Jasper Grimm (UoY) | ||
| * Jan Andre Reuter (Forschungszentrum Juelich GmbH) | ||
| """ | ||
| import csv | ||
|
|
@@ -41,6 +42,7 @@ | |
| import platform | ||
| import pwd | ||
| import re | ||
| import shutil | ||
| import struct | ||
| import sys | ||
| import termios | ||
|
|
@@ -64,6 +66,7 @@ | |
| pass | ||
|
|
||
| from easybuild.base import fancylogger | ||
| from easybuild.tools import LooseVersion | ||
| from easybuild.tools.build_log import EasyBuildError, EasyBuildExit, print_warning | ||
| from easybuild.tools.config import IGNORE | ||
| from easybuild.tools.filetools import is_readable, read_file, which | ||
|
|
@@ -998,6 +1001,106 @@ def get_glibc_version(): | |
| return glibc_ver | ||
|
|
||
|
|
||
def get_cuda_object_dump_raw(path):
    """
    Get raw output from command which extracts information from CUDA binary files in a human-readable format,
    or None for files containing no CUDA device code.
    See https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#cuobjdump

    :param path: path to the file to inspect
    :return: output of 'cuobjdump <path>'; None if the file type could not be determined, if the file is not an
             executable, object or archive, or if cuobjdump reports that the file does not contain device code
    :raises EasyBuildError: if the cuobjdump command is not found, or if it fails for any other reason
    """
    # local import: only needed here, to protect paths containing spaces or shell metacharacters
    import shlex

    quoted_path = shlex.quote(path)

    res = run_shell_cmd("file %s" % quoted_path, fail_on_error=False, hidden=True, output_file=False,
                        stream_output=False)
    if res.exit_code != EasyBuildExit.SUCCESS:
        fail_msg = "Failed to run 'file %s': %s" % (path, res.output)
        _log.warning(fail_msg)
        # without a reliable file type there is nothing to inspect;
        # the error text must not be scanned for file type keywords below
        return None

    # 'file' echoes the path in front of the description ("<path>: <description>");
    # drop that prefix so a path containing e.g. 'object' or 'archive' is not mistaken for a matching file type
    file_type = res.output
    path_prefix = path + ':'
    if file_type.startswith(path_prefix):
        file_type = file_type[len(path_prefix):]

    # check that the file is an executable or object (shared library) or archive (static library)
    result = None
    if any(x in file_type for x in ['executable', 'object', 'archive']):
        # Make sure we have a cuobjdump command
        # NOTE(review): a reviewer pointed out that the caller already performs this check;
        # it is kept here so that this function fails with a clear error when it is used on its own
        if not shutil.which('cuobjdump'):
            raise EasyBuildError("Failed to get object dump from CUDA file: cuobjdump command not found")
        cuda_cmd = f"cuobjdump {quoted_path}"
        res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False)
        if res.exit_code == EasyBuildExit.SUCCESS:
            result = res.output
        else:
            # Check and report for the common case that this is simply not a CUDA binary, i.e. does not
            # contain CUDA device code
            no_device_code_match = re.search(r'does not contain device code', res.output)
            if no_device_code_match is not None:
                # File is a regular executable, object or library, but not a CUDA file
                msg = "'%s' does not appear to be a CUDA binary: cuobjdump failed to find device code in this file"
                _log.debug(msg, path)
            else:
                # This should not happen: there was no string saying this was NOT a CUDA file, yet no device code
                # was found at all
                msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'"
                raise EasyBuildError(msg, path, cuda_cmd, res.output)

    return result
|
|
||
|
|
||
def get_cuda_architectures(path, section_type):
    """
    Determine which CUDA architectures are present in the file at 'path', as a sorted list.

    :param path: full path to a CUDA file (symlinks are resolved before the file is inspected)
    :param section_type: kind of section in the cuobjdump output to look at for architectures ('elf' or 'ptx')
    :return: sorted list of unique CUDA compute capabilities (e.g. ['8.6', '9.0']);
             None if no CUDA device code is present in the file or no architecture could be extracted
    """
    # A section in the cuobjdump output is expected to look like this (for device code):
    #
    # Fatbin elf code:
    # ================
    # arch = sm_90
    # code version = [1,7]
    # host = linux
    # compile_size = 64bit
    #
    # and like this for ptx code:
    #
    # Fatbin ptx code:
    # ================
    # arch = sm_90
    # code version = [8,1]
    # host = linux
    # compile_size = 64bit

    # captures the compute capability as (major, minor) for sections of the requested type
    arch_pattern = re.compile(f'Fatbin {section_type} code:\n=+\narch = sm_([0-9]+)([0-9]a?)')

    # work on the actual file rather than on a (non-broken) symlink pointing to it
    if os.path.islink(path) and os.path.exists(path):
        path = os.path.realpath(path)

    dump = get_cuda_object_dump_raw(path)
    if dump is None:
        # nothing to parse: no object dump is available for this file
        return None

    arch_pairs = arch_pattern.findall(dump)
    if arch_pairs:
        # turn the matched tuples into a duplicate-free list of compute capabilities,
        # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0']
        unique_archs = {'.'.join(pair) for pair in arch_pairs}
        return sorted(unique_archs, key=LooseVersion)

    # No architecture was extracted; make the warning say whether the section itself was found or not
    if re.search(f'Fatbin {section_type} code', dump):
        warning = (
            f"Found Fatbin {section_type} code section(s) in cuobjdump output for {path}, "
            "but failed to extract CUDA architecture"
        )
    else:
        # The "Fatbin {section_type} code" section is absent from the binary. A CUDA binary may well contain
        # only device code or only ptx code, but since --cuda-compute-capabilities is supposed to produce both
        # (at least for the highest CC in that list), this is unexpected in an EasyBuild context, hence a warning
        warning = f"Failed to find Fatbin {section_type} code section(s) in cuobjdump output for {path}."
    _log.warning(warning)

    return None
|
|
||
|
|
||
| def get_linked_libs_raw(path): | ||
| """ | ||
| Get raw output from command that reports linked libraries for dynamically linked executables/libraries, | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.