From e329d461957f0d6c05bbfd894634cb9ac19920c1 Mon Sep 17 00:00:00 2001 From: jfgrimm Date: Thu, 24 Oct 2024 16:09:36 +0100 Subject: [PATCH 001/114] sanity check binaries/libraries for device code matching cuda_compute_capabilities when CUDA is used --- easybuild/framework/easyblock.py | 67 +++++++++++++++++++++++++++++++- easybuild/tools/systemtools.py | 59 ++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index cfe0220202..9f0e0a98d7 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -40,6 +40,7 @@ * Maxime Boissonneault (Compute Canada) * Davide Vanzo (Vanderbilt University) * Caspar van Leeuwen (SURF) +* Jasper Grimm (UoY) """ import concurrent import copy @@ -101,8 +102,9 @@ from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar from easybuild.tools.package.utilities import package from easybuild.tools.repository.repository import init_repository -from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_linked_libs_raw -from easybuild.tools.systemtools import get_shared_lib_ext, pick_system_specific_value, use_group +from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures +from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group +from easybuild.tools.toolchain.toolchain import TOOLCHAIN_CAPABILITY_CUDA from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION @@ -3193,6 +3195,59 @@ def _sanity_check_step_multi_deps(self, *args, **kwargs): self.cfg['builddependencies'] = builddeps self.cfg.iterating = False + def 
sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): + """Sanity check that binaries/libraries contain device code for the correct architecture targets.""" + + self.log.info("Checking binaries/libraries for CUDA device code...") + + fails = [] + cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + + if cuda_dirs is None: + cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() + + if not cuda_dirs: + cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS + self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", + cuda_dirs) + else: + self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", + cuda_dirs) + + for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: + if os.path.exists(dirpath): + self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}") + + for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: + self.log.debug("Sanity checking for CUDA device code in %s", path) + + derived_ccs = get_cuda_device_code_architectures(path) + + if derived_ccs is None: + msg = f"No CUDA device code found in {path}, so skipping it in CUDA device code sanity check" + self.log.debug(msg) + else: + # check whether device code architectures match cuda_compute_capabilities + additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) + missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) + + if additional_ccs or missing_ccs: + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " + if additional_ccs: + fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) + if missing_ccs: + fail_msg += "Missing compute capabilities: %s." 
% ', '.join(sorted(missing_ccs)) + self.log.warning(fail_msg) + fails.append(fail_msg) + else: + msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " + "those in cuda_compute_capabilities") + self.log.debug(msg) + else: + self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") + + return fails + def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True): """Sanity check binaries/libraries w.r.t. RPATH linking.""" @@ -3782,6 +3837,14 @@ def xs2str(xs): else: self.log.debug("Skipping RPATH sanity check") + if get_software_root('CUDA'): + cuda_fails = self.sanity_check_cuda() + if cuda_fails: + self.log.warning("CUDA device code sanity check failed!") + self.sanity_check_fail_msgs.extend(cuda_fails) + else: + self.log.debug("Skipping CUDA device code sanity check") + # pass or fail if self.sanity_check_fail_msgs: raise EasyBuildError( diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index def6fbf3f1..9a1a337082 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -29,6 +29,7 @@ * Jens Timmerman (Ghent University) * Ward Poelmans (Ghent University) +* Jasper Grimm (UoY) """ import ctypes import errno @@ -963,6 +964,64 @@ def get_glibc_version(): return glibc_ver +def get_cuda_object_dump_raw(path): + """ + Get raw ouput from command which extracts information from CUDA binary files in a human-readable format, + or None for files containing no CUDA device code. 
+ See https://docs.nvidia.com/cuda/cuda-binary-utilities/index.html#cuobjdump + """ + + res = run_shell_cmd("file %s" % path, fail_on_error=False, hidden=True, output_file=False, stream_output=False) + if res.exit_code != EasyBuildExit.SUCCESS: + fail_msg = "Failed to run 'file %s': %s" % (path, res.output) + _log.warning(fail_msg) + + # check that the file is an executable or library/object + if any(x in res.output for x in ['executable', 'object', 'library']): + cuda_cmd = f"cuobjdump {path}" + else: + return None + + res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False) + if res.exit_code == EasyBuildExit.SUCCESS: + return res.output + else: + msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'" + _log.debug(msg % (path, cuda_cmd, res.output)) + return None + + +def get_cuda_device_code_architectures(path): + """ + Get list of supported CUDA architectures, by inspecting the device code of an executable/library. The format is the + same as cuda_compute_capabilities (e.g. ['8.6', '9.0'] for sm_86 sm_90). + Returns None if no CUDA device code is present in the file. + """ + + # cudaobjdump uses the sm_XY format + device_code_regex = re.compile('(?<=arch = sm_)([0-9])([0-9]+a{0,1})') + + # resolve symlinks + if os.path.islink(path) and os.path.exists(path): + path = os.path.realpath(path) + + cuda_raw = get_cuda_object_dump_raw(path) + if cuda_raw is None: + return None + + # extract unique architectures from raw dump + matches = re.findall(device_code_regex, cuda_raw) + if matches is not None: + # convert match tuples into unique list of cuda compute capabilities + # e.g. 
[('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] + matches = sorted(['.'.join(m) for m in set(matches)]) + else: + fail_msg = f"Failed to determine supported CUDA architectures from {path}" + _log.warning(fail_msg) + + return matches + + def get_linked_libs_raw(path): """ Get raw output from command that reports linked libraries for dynamically linked executables/libraries, From ee63b8e64f834d20c1d695e17346e5315217b64e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 20 Feb 2025 02:40:05 +0100 Subject: [PATCH 002/114] Add check for PTX, more explicit debug logging --- easybuild/framework/easyblock.py | 31 +++++++++-- easybuild/tools/systemtools.py | 88 ++++++++++++++++++++++++++++---- 2 files changed, 104 insertions(+), 15 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index b9a3f341a4..80ecd25bbe 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3322,6 +3322,11 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fails = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + # If there are no CUDA compute capabilities defined, return + if cfg_ccs is None or len(cfg_ccs) == 0: + self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities where configured") + return fails + if cuda_dirs is None: cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() @@ -3340,12 +3345,17 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: self.log.debug("Sanity checking for CUDA device code in %s", path) - derived_ccs = get_cuda_device_code_architectures(path) - - if derived_ccs is None: - msg = f"No CUDA device code found in {path}, so skipping it in CUDA device code sanity check" + res = get_cuda_device_code_architectures(path) + if res is None: + msg = f"{path} does not appear to be a CUDA 
executable (no CUDA device code found), " + msg += "so skipping CUDA sanity check." self.log.debug(msg) + return fails else: + # unpack results + derived_ccs = res.device_code_archs + derived_ptx_ccs = res.ptx_archs + # check whether device code architectures match cuda_compute_capabilities additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) @@ -3362,6 +3372,19 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " "those in cuda_compute_capabilities") self.log.debug(msg) + + # Check whether there is ptx code for the highest CC in cfg_ccs + highest_cc = sorted(cfg_ccs)[-1] + missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) + + if missing_ptx_ccs: + fail_msg = "Configured highest compute capability was '%s', " + fail_msg += "but no PTX code for this compute capability was found in '%s'" + self.log.warning(fail_msg, highest_cc, missing_ptx_ccs) + else: + msg = (f"Output of 'cuobjdump' checked for {path}; ptx code was present for (at least) the" + " highest CUDA compute capability in cuda_compute_capabilities") + self.log.debug(msg) else: self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index cfcdb4bfe4..5afe1717e8 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -43,7 +43,7 @@ import sys import termios import warnings -from collections import OrderedDict +from collections import OrderedDict, namedtuple from ctypes.util import find_library from socket import gethostname @@ -215,6 +215,14 @@ } +# A named tuple, to be returned by e.g. 
`get_cuda_device_code_architectures` +cuda_dev_ptx_archs = namedtuple('cuda_dev_ptx_archs', ('device_code_archs', 'ptx_archs')) +cuda_dev_ptx_archs.__doc__ = """A namedtuple that represents the result of a call to get_cuda_device_code_architectures, +with the following fields: +- device_code_archs: a list of CUDA device compute capabilities for which device code was found +- ptx_archs: a list of CUDA (virtual) device compute capabilities for which ptx code was found +""" + class SystemToolsException(Exception): """raised when systemtools fails""" @@ -986,8 +994,15 @@ def get_cuda_object_dump_raw(path): if res.exit_code == EasyBuildExit.SUCCESS: return res.output else: - msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'" - _log.debug(msg % (path, cuda_cmd, res.output)) + # Check and report for the common case that this is simply not a CUDA binary, i.e. does not + # contain CUDA device code + no_device_code_match = re.search(r'does not contain device code', res.output) + if no_device_code_match is not None: + msg = "'%s' does not appear to be a CUDA binary: cuobjdump failed to find device code in this file" + _log.debug(msg, path) + else: + msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'" + _log.debug(msg, path, cuda_cmd, res.output) return None @@ -998,8 +1013,27 @@ def get_cuda_device_code_architectures(path): Returns None if no CUDA device code is present in the file. 
""" - # cudaobjdump uses the sm_XY format - device_code_regex = re.compile('(?<=arch = sm_)([0-9])([0-9]+a{0,1})') + # Note that typical output for a cuobjdump call will look like this for device code: + # + # Fatbin elf code: + # ================ + # arch = sm_90 + # code version = [1,7] + # host = linux + # compile_size = 64bit + # + # And for ptx code, it will look like this: + # + # Fatbin ptx code: + # ================ + # arch = sm_90 + # code version = [8,1] + # host = linux + # compile_size = 64bit + + # Pattern to extract elf code architectures and ptx code architectures respectively + device_code_regex = re.compile('Fatbin elf code:\n=+\narch = sm_([0-9])([0-9]+a{0,1})') + ptx_code_regex = re.compile('Fatbin ptx code:\n=+\narch = sm_([0-9])([0-9]+a{0,1})') # resolve symlinks if os.path.islink(path) and os.path.exists(path): @@ -1009,17 +1043,49 @@ def get_cuda_device_code_architectures(path): if cuda_raw is None: return None - # extract unique architectures from raw dump - matches = re.findall(device_code_regex, cuda_raw) - if matches is not None: + # extract unique device code architectures from raw dump + device_code_matches = re.findall(device_code_regex, cuda_raw) + if device_code_matches is not None: + # convert match tuples into unique list of cuda compute capabilities + # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] + device_code_matches = sorted(['.'.join(m) for m in set(device_code_matches)]) + else: + # Try to be clear in the warning... did we not find elf code sections at all? or was the arch missing? 
+ device_section_regex = re.compile('Fatbin elf code') + device_section_matches = re.findall(device_section_regex, cuda_raw) + if device_section_matches is not None: + fail_msg = f"Found Fatbin elf code section(s) in cuobjdump output for {path}, " + fail_msg += "but failed to extract CUDA architecture" + else: + # In this case, the cuobjdump command _likely_ already returned a non-zero exit + # This error message would only be displayed if cuobjdump somehow completely successfully + # but still no Fatbin elf code section was found + fail_msg = f"Failed to find Fatbin elf code section(s) in cuobjdump output for {path}, " + fail_msg += "are you sure this is a CUDA binary?" + _log.warning(fail_msg) + + # extract unique ptx code architectures from raw dump + ptx_code_matches = re.findall(ptx_code_regex, cuda_raw) + if ptx_code_matches is not None: # convert match tuples into unique list of cuda compute capabilities # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] - matches = sorted(['.'.join(m) for m in set(matches)]) + ptx_code_matches = sorted(['.'.join(m) for m in set(ptx_code_matches)]) else: - fail_msg = f"Failed to determine supported CUDA architectures from {path}" + # Try to be clear in the warning... did we not find ptx code sections at all? or was the arch missing? + ptx_section_regex = re.compile('Fatbin ptx code') + ptx_section_matches = re.findall(ptx_section_regex, cuda_raw) + if ptx_section_matches is not None: + fail_msg = f"Found Fatbin ptx code section(s) in cuobjdump output for {path}, " + fail_msg += "but failed to extract CUDA architecture" + else: + # In this case, the cuobjdump command _likely_ already returned a non-zero exit + # This error message would only be displayed if cuobjdump somehow completely successfully + # but still no Fatbin ptx code section was found + fail_msg = f"Failed to find Fatbin ptx code section(s) in cuobjdump output for {path}, " + fail_msg += "are you sure this is a CUDA binary?" 
_log.warning(fail_msg) - return matches + return cuda_dev_ptx_archs(ptx_archs=ptx_code_matches, device_code_archs=device_code_matches) def get_linked_libs_raw(path): From de6d49d8186708b63b3dbae8e0f847e736797fb1 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 20 Feb 2025 02:42:01 +0100 Subject: [PATCH 003/114] That return should not be there, as it will stop the sanity check after the first non-cuda file. that's wrong --- easybuild/framework/easyblock.py | 1 - 1 file changed, 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 80ecd25bbe..0a718fd31f 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3350,7 +3350,6 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " msg += "so skipping CUDA sanity check." self.log.debug(msg) - return fails else: # unpack results derived_ccs = res.device_code_archs From 0e97868b72a3868b824c96ff0f28f26029aaac8a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 20 Feb 2025 02:56:12 +0100 Subject: [PATCH 004/114] Fix some logic in the PTX warning printed --- easybuild/framework/easyblock.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 0a718fd31f..747838f959 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3373,13 +3373,14 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): self.log.debug(msg) # Check whether there is ptx code for the highest CC in cfg_ccs - highest_cc = sorted(cfg_ccs)[-1] + highest_cc = [sorted(cfg_ccs)[-1]] missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) if missing_ptx_ccs: fail_msg = "Configured highest compute capability was '%s', " - fail_msg += "but no PTX code for this compute capability was found in '%s'" - 
self.log.warning(fail_msg, highest_cc, missing_ptx_ccs) + fail_msg += "but no PTX code for this compute capability was found in '%s' " + fail_msg += "PTX architectures supported in that file: %s" + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: msg = (f"Output of 'cuobjdump' checked for {path}; ptx code was present for (at least) the" " highest CUDA compute capability in cuda_compute_capabilities") From 6b6d2c8b77fd7f4a854f56b1d8f69fb4493167c1 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 20 Feb 2025 23:42:04 +0100 Subject: [PATCH 005/114] Add option for ignoring individual files in the CUDA sanity check --- easybuild/framework/easyblock.py | 16 +++++++++++++--- easybuild/framework/easyconfig/default.py | 5 +++++ 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 747838f959..9631da2617 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3322,6 +3322,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fails = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + # Construct the list of files to ignore as full paths (cuda_sanity_ignore_files contains the paths + # to ignore, relative to the installation prefix) + ignore_file_list = [os.path.join(self.installdir, d) for d in self.cfg['cuda_sanity_ignore_files']] + # If there are no CUDA compute capabilities defined, return if cfg_ccs is None or len(cfg_ccs) == 0: self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities where configured") @@ -3364,9 +3368,15 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if additional_ccs: fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) if missing_ccs: - fail_msg += "Missing compute capabilities: %s." 
% ', '.join(sorted(missing_ccs)) - self.log.warning(fail_msg) - fails.append(fail_msg) + fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs)) + # We still log the result, but don't fail: + if path in ignore_file_list: + fail_msg += f"This failure will be ignored as {path} is listed in " + fail_msg += "'ignore_cuda_sanity_failures'." + self.log.warning(fail_msg) + else: + self.log.warning(fail_msg) + fails.append(fail_msg) else: msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " "those in cuda_compute_capabilities") diff --git a/easybuild/framework/easyconfig/default.py b/easybuild/framework/easyconfig/default.py index bca46c3856..80319c6ec9 100644 --- a/easybuild/framework/easyconfig/default.py +++ b/easybuild/framework/easyconfig/default.py @@ -126,6 +126,11 @@ 'after make (for e.g.,"test" for make test)'), BUILD], 'bin_lib_subdirs': [[], "List of subdirectories for binaries and libraries, which is used during sanity check " "to check RPATH linking and banned/required libraries", BUILD], + 'cuda_sanity_ignore_files': [[], "List of files (relative to the installation prefix) for which failurs in " + "the CUDA sanity check step are ignored. Typically used for files where you " + "know the CUDA architectures in those files don't match the " + "--cuda-compute-capabitilities configured for EasyBuild AND where you know " + "that this is ok / reasonable (e.g. binary installations)", BUILD], 'sanity_check_commands': [[], ("format: [(name, options)] e.g. [('gzip','-h')]. " "Using a non-tuple is equivalent to (name, '-h')"), BUILD], 'sanity_check_paths': [{}, ("List of files and directories to check " From 6568909bb43d0108eea310a0e7427be2c7e12295 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 21 Feb 2025 21:44:52 +0100 Subject: [PATCH 006/114] Add strict-cuda-sanity-check option and make sure we only fail the sanity check on surpluss CUDA archs if this option is set. 
Otherwise, print warning --- easybuild/framework/easyblock.py | 15 ++++++++++++--- easybuild/tools/config.py | 1 + easybuild/tools/options.py | 9 ++++++++- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 9631da2617..6291c0fba7 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3321,6 +3321,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fails = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + strict_cc_check = build_option('strict_cuda_sanity_check') # Construct the list of files to ignore as full paths (cuda_sanity_ignore_files contains the paths # to ignore, relative to the installation prefix) @@ -3364,21 +3365,29 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) if additional_ccs or missing_ccs: + # Do we log this as warning or produce a sanity failure? + is_failure = False fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " if additional_ccs: fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) + if strict_cc_check: + is_failure = True if missing_ccs: fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs)) + is_failure = True # We still log the result, but don't fail: if path in ignore_file_list: fail_msg += f"This failure will be ignored as {path} is listed in " fail_msg += "'ignore_cuda_sanity_failures'." 
- self.log.warning(fail_msg) + is_failure = False + + # Log warning or sanity error + if is_failure: + fails.append(fail_msg) else: self.log.warning(fail_msg) - fails.append(fail_msg) else: - msg = (f"Output of 'cuobjdump' checked for {path}; device code architecures match " + msg = (f"Output of 'cuobjdump' checked for {path}; device code architectures match " "those in cuda_compute_capabilities") self.log.debug(msg) diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py index 3503d5c2f5..04e86e8562 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -358,6 +358,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'pre_create_installdir', 'show_progress_bar', 'strict_rpath_sanity_check', + 'strict_cuda_sanity_check', 'trace', ], EMPTY_LIST: [ diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index 70671ce20f..0fb439baf4 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -540,9 +540,16 @@ def override_options(self): "Git commit to use for the target software build (robot capabilities are automatically disabled)", None, 'store', None), 'sticky-bit': ("Set sticky bit on newly created directories", None, 'store_true', False), - 'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involces unsetting " + 'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involves unsetting " "$LD_LIBRARY_PATH before checking whether all required libraries are found", None, 'store_true', False), + 'strict-cuda-sanity-check': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity " + "check will fail if the CUDA binaries don't contain code for (at least) " + "all compute capabilities defined in --cude-compute-capabilities, but will " + "accept if code for additional compute capabilities is present. 
" + "With this setting, the sanity check will also fail if code is present for " + "more compute capabilities than defined in --cuda-compute-capabilities.", + None, 'store_true', False), 'sysroot': ("Location root directory of system, prefix for standard paths like /usr/lib and /usr/include", None, 'store', None), 'trace': ("Provide more information in output to stdout on progress", None, 'store_true', True, 'T'), From 3d07ef6ad3eba20bb6e0c87565ee90eaa9dc552f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 21 Feb 2025 23:55:34 +0100 Subject: [PATCH 007/114] This is a work in progress for creating a set of tests... --- test/framework/toy_build.py | 93 ++++++++++++++++++++++++++++++++++++- 1 file changed, 92 insertions(+), 1 deletion(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index cd9aefb26f..13e121bf71 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -53,7 +53,7 @@ from easybuild.main import main_with_hooks from easybuild.tools.build_log import EasyBuildError from easybuild.tools.config import get_module_syntax, get_repositorypath -from easybuild.tools.environment import modify_env +from easybuild.tools.environment import modify_env, setvar from easybuild.tools.filetools import adjust_permissions, change_dir, copy_file, mkdir, move_file from easybuild.tools.filetools import read_file, remove_dir, remove_file, which, write_file from easybuild.tools.module_generator import ModuleGeneratorTcl @@ -3008,6 +3008,97 @@ def test_toy_filter_rpath_sanity_libs(self): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, name='toy-app', raise_error=True, verbose=False) + def test_toy_cuda_sanity_check(self): + """Test the CUDA sanity check""" + # We need to mock a cuobjdump executable and prepend in on the PATH + # First, make sure we can restore environment at the end of this test + start_env = copy.deepcopy(os.environ) + + # Create mock cuobjdump + # 
First, lets define sections of echo's for cuobjdump for various scenarios + + # Shebang for cuobjdump + cuobjdump_txt_shebang = "#!/bin/bash\n" + + # Section for cuobjdump printing output for sm_80 architecture + cuobjdump_txt_sm80 = '\n'.join([ + "echo 'Fatbin elf code:'" + "echo '================'" + "echo 'arch = sm_80'" + "echo 'code version = [1,7]'" + "echo 'host = linux'" + "echo 'compile_size = 64bit'" + "echo ''" + ]) + + # Section for cuobjdump printing output for sm_90 architecture + cuobjdump_txt_sm90 = '\n'.join([ + "echo 'Fatbin elf code:'" + "echo '================'" + "echo 'arch = sm_90'" + "echo 'code version = [1,7]'" + "echo 'host = linux'" + "echo 'compile_size = 64bit'" + "echo ''" + ]) + + # Section for cuobjdump printing output for sm_80 PTX code + cuobjdump_txt_sm80_ptx = '\n'.join([ + "echo 'Fatbin ptx code:'" + "echo '================'" + "echo 'arch = sm_80'" + "echo 'code version = [8,1]'" + "echo 'host = linux'" + "echo 'compile_size = 64bit'" + "echo 'compressed'" + ]) + + # Section for cuobjdump printing output for sm_90 PTX code + cuobjdump_txt_sm90_ptx = '\n'.join([ + "echo 'Fatbin ptx code:'" + "echo '================'" + "echo 'arch = sm_90'" + "echo 'code version = [8,1]'" + "echo 'host = linux'" + "echo 'compile_size = 64bit'" + "echo 'compressed'" + ]) + + # Create temporary subdir for cuobjdump, so that we don't have to add self.test_prefix itself to the PATH + cuobjdump_dir = os.path.join(self.test_prefix, 'cuobjdump_dir') + mkdir(cuobjdump_dir, parents=True) + + # Add cuobjdump_dir to the path + setvar('PATH', '%s:%s' % (cuobjdump_dir, os.getenv('PATH'))) + + # Filepath to cuobjdump + cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') + + # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 EFL code + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) + adjust_permission(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure 
our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0', '--debug'] # Need debug so we can check output + test_report_fp = os.path.join(self.test_buildpath, 'full_test_report.md') + # We expect this to pass, so no need to check errors + regex = r"DEBUG Output of 'cuobjdump' checked for .*toy; " + regex += "device code architectures match those in cuda_compute_capabilities" + self.test_toy_build(extra_args=args, test_report=test_report_fp, raise_error=True + test_report_regexs=[regex]) + + + + + + # Test single CUDA compute capability with --cuda-compute-capabilities=8.0 + + # Test multiple CUDA compute capabilities with --cuda-compute-capabilities=8.0,9.0 + + # Test stric CUDA check with --cuda-compute-capabilities=8.0 and a binary that also contains also 9.0 code + + # Restore original environment + modify_env(os.environ, start_env, verbose=False) + def test_toy_modaltsoftname(self): """Build two dependent toys as in test_toy_toy but using modaltsoftname""" topdir = os.path.dirname(os.path.abspath(__file__)) From f13fca23e26a1c1851fe1fc6524785d331523f6f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Sat, 22 Feb 2025 02:50:24 +0100 Subject: [PATCH 008/114] First test working.. 
--- test/framework/toy_build.py | 74 ++++++++++++++++++++++--------------- 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 13e121bf71..0eb1c28e78 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3014,6 +3014,10 @@ def test_toy_cuda_sanity_check(self): # First, make sure we can restore environment at the end of this test start_env = copy.deepcopy(os.environ) + # Define the toy_ec file we want to use + topdir = os.path.dirname(os.path.abspath(__file__)) + toy_ec = os.path.join(topdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb') + # Create mock cuobjdump # First, lets define sections of echo's for cuobjdump for various scenarios @@ -3022,48 +3026,54 @@ def test_toy_cuda_sanity_check(self): # Section for cuobjdump printing output for sm_80 architecture cuobjdump_txt_sm80 = '\n'.join([ - "echo 'Fatbin elf code:'" - "echo '================'" - "echo 'arch = sm_80'" - "echo 'code version = [1,7]'" - "echo 'host = linux'" - "echo 'compile_size = 64bit'" + "echo 'Fatbin elf code:'", + "echo '================'", + "echo 'arch = sm_80'", + "echo 'code version = [1,7]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", "echo ''" ]) # Section for cuobjdump printing output for sm_90 architecture cuobjdump_txt_sm90 = '\n'.join([ - "echo 'Fatbin elf code:'" - "echo '================'" - "echo 'arch = sm_90'" - "echo 'code version = [1,7]'" - "echo 'host = linux'" - "echo 'compile_size = 64bit'" + "echo 'Fatbin elf code:'", + "echo '================'", + "echo 'arch = sm_90'", + "echo 'code version = [1,7]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", "echo ''" ]) # Section for cuobjdump printing output for sm_80 PTX code cuobjdump_txt_sm80_ptx = '\n'.join([ - "echo 'Fatbin ptx code:'" - "echo '================'" - "echo 'arch = sm_80'" - "echo 'code version = [8,1]'" - "echo 'host = linux'" - "echo 'compile_size = 64bit'" + "echo 'Fatbin ptx 
code:'", + "echo '================'", + "echo 'arch = sm_80'", + "echo 'code version = [8,1]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", "echo 'compressed'" ]) # Section for cuobjdump printing output for sm_90 PTX code cuobjdump_txt_sm90_ptx = '\n'.join([ - "echo 'Fatbin ptx code:'" - "echo '================'" - "echo 'arch = sm_90'" - "echo 'code version = [8,1]'" - "echo 'host = linux'" - "echo 'compile_size = 64bit'" + "echo 'Fatbin ptx code:'", + "echo '================'", + "echo 'arch = sm_90'", + "echo 'code version = [8,1]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", "echo 'compressed'" ]) + # Created regex for success and failures + device_code_regex_pattern = r"DEBUG Output of 'cuobjdump' checked for .*/bin/toy; device code " + device_code_regex_pattern += "architectures match those in cuda_compute_capabilities" + device_code_regex = re.compile(device_code_regex_pattern, re.M) + # TODO: create regex for failures + # Create temporary subdir for cuobjdump, so that we don't have to add self.test_prefix itself to the PATH cuobjdump_dir = os.path.join(self.test_prefix, 'cuobjdump_dir') mkdir(cuobjdump_dir, parents=True) @@ -3071,20 +3081,24 @@ def test_toy_cuda_sanity_check(self): # Add cuobjdump_dir to the path setvar('PATH', '%s:%s' % (cuobjdump_dir, os.getenv('PATH'))) + # Pretend we have CUDA loaded, or the sanity check won't run + setvar('EBROOTCUDA', '/foo/bar') + # Filepath to cuobjdump cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 EFL code write_file(cuobjdump_file, cuobjdump_txt_shebang), write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) - adjust_permission(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0', '--debug'] # Need debug so we can check output + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock 
cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0'] test_report_fp = os.path.join(self.test_buildpath, 'full_test_report.md') # We expect this to pass, so no need to check errors - regex = r"DEBUG Output of 'cuobjdump' checked for .*toy; " - regex += "device code architectures match those in cuda_compute_capabilities" - self.test_toy_build(extra_args=args, test_report=test_report_fp, raise_error=True - test_report_regexs=[regex]) + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False) + msg = "Patter %s found in full build log:\n%s" % (device_code_regex.pattern, outtxt) + self.assertTrue(device_code_regex.search(outtxt), msg) + From bbe189d130116d3741958681d2b7f2ab3027215c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Mar 2025 21:45:42 +0100 Subject: [PATCH 009/114] Restructure logging messages a bit --- easybuild/framework/easyblock.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 6291c0fba7..0194c8acb7 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3377,7 +3377,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): is_failure = True # We still log the result, but don't fail: if path in ignore_file_list: - fail_msg += f"This failure will be ignored as {path} is listed in " + fail_msg += f"This failure will be ignored as '{path}' is listed in " fail_msg += "'ignore_cuda_sanity_failures'." 
is_failure = False @@ -3387,7 +3387,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): else: self.log.warning(fail_msg) else: - msg = (f"Output of 'cuobjdump' checked for {path}; device code architectures match " + msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " "those in cuda_compute_capabilities") self.log.debug(msg) @@ -3398,7 +3398,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if missing_ptx_ccs: fail_msg = "Configured highest compute capability was '%s', " fail_msg += "but no PTX code for this compute capability was found in '%s' " - fail_msg += "PTX architectures supported in that file: %s" + fail_msg += "(PTX architectures supported in that file: %s)" self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: msg = (f"Output of 'cuobjdump' checked for {path}; ptx code was present for (at least) the" From caff55965011faa6fb0951e432d491631b04fe40 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Mar 2025 22:31:43 +0100 Subject: [PATCH 010/114] quote path in debug log string --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 0194c8acb7..b374c1455f 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3401,7 +3401,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msg += "(PTX architectures supported in that file: %s)" self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: - msg = (f"Output of 'cuobjdump' checked for {path}; ptx code was present for (at least) the" + msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at least) the" " highest CUDA compute capability in cuda_compute_capabilities") self.log.debug(msg) else: From a6408ffe20721bb4981e0ba3a1bb9cee919aa1a2 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 
5 Mar 2025 23:25:05 +0100 Subject: [PATCH 011/114] Added unit tests --- test/framework/toy_build.py | 188 +++++++++++++++++++++++++++++++++--- 1 file changed, 174 insertions(+), 14 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 0eb1c28e78..141147cbf3 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3069,10 +3069,38 @@ def test_toy_cuda_sanity_check(self): ]) # Created regex for success and failures - device_code_regex_pattern = r"DEBUG Output of 'cuobjdump' checked for .*/bin/toy; device code " - device_code_regex_pattern += "architectures match those in cuda_compute_capabilities" - device_code_regex = re.compile(device_code_regex_pattern, re.M) - # TODO: create regex for failures + device_code_regex_success_pattern = r"DEBUG Output of 'cuobjdump' checked for '.*/bin/toy'; device code " + device_code_regex_success_pattern += "architectures match those in cuda_compute_capabilities" + device_code_regex_success = re.compile(device_code_regex_success_pattern, re.M) + + device_missing_80_code_regex_pattern = r"Missing compute capabilities: 8.0." + device_missing_80_code_regex = re.compile(device_missing_80_code_regex_pattern, re.M) + + device_missing_80_code_ignored_regex_pattern = r"Missing compute capabilities: 8.0. This failure will be " + device_missing_80_code_ignored_regex_pattern += "ignored as '.*/bin/toy' is listed in " + device_missing_80_code_ignored_regex_pattern += "'ignore_cuda_sanity_failures'." + device_missing_80_code_ignored_regex = re.compile(device_missing_80_code_ignored_regex_pattern, re.M) + + device_missing_90_code_regex_pattern = r"Missing compute capabilities: 9.0." + device_missing_90_code_regex = re.compile(device_missing_90_code_regex_pattern, re.M) + + device_surplus_90_code_regex_pattern = r"Surplus compute capabilities: 9.0." 
+ device_surplus_90_code_regex = re.compile(device_surplus_90_code_regex_pattern, re.M) + + ptx_code_regex_success_pattern = r"DEBUG Output of 'cuobjdump' checked for '.*/bin/toy'; ptx code was " + ptx_code_regex_success_pattern += "present for \(at least\) the highest CUDA compute capability in " + ptx_code_regex_success_pattern += "cuda_compute_capabilities" + ptx_code_regex_success = re.compile(ptx_code_regex_success_pattern, re.M) + + ptx_code_missing_80_regex_pattern = r"Configured highest compute capability was '8.0', but no PTX code " + ptx_code_missing_80_regex_pattern += "for this compute capability was found in '.*/bin/toy' " + ptx_code_missing_80_regex_pattern += "\(PTX architectures supported in that file: \[\]\)" + ptx_code_missing_80_regex = re.compile(ptx_code_missing_80_regex_pattern, re.M) + + ptx_code_missing_90_regex_pattern = r"Configured highest compute capability was '9.0', but no PTX code " + ptx_code_missing_90_regex_pattern += "for this compute capability was found in '.*/bin/toy' " + ptx_code_missing_90_regex_pattern += "\(PTX architectures supported in that file: \[\]\)" + ptx_code_missing_90_regex = re.compile(ptx_code_missing_90_regex_pattern, re.M) # Create temporary subdir for cuobjdump, so that we don't have to add self.test_prefix itself to the PATH cuobjdump_dir = os.path.join(self.test_prefix, 'cuobjdump_dir') @@ -3087,28 +3115,160 @@ def test_toy_cuda_sanity_check(self): # Filepath to cuobjdump cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') - # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 EFL code + # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 ELF code + # This means the build should succeed, so we can run with raise_error=True and check the output + # for the expected debugging output + # We also check here for the warning that no PTX code for the highest compute capability (8.0) was found write_file(cuobjdump_file, cuobjdump_txt_shebang), 
write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable args = ['--cuda-compute-capabilities=8.0'] - test_report_fp = os.path.join(self.test_buildpath, 'full_test_report.md') # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False) - msg = "Patter %s found in full build log:\n%s" % (device_code_regex.pattern, outtxt) - self.assertTrue(device_code_regex.search(outtxt), msg) + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) + self.assertTrue(device_code_regex_success.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) + self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) + + # Test case 2: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 ELF code and 8.0 PTX code + # This means the build should succeed, so we can run with raise_error=True and check the output + # for the expected debugging output + # It also means we expect output confirming that PTX code was found for the highest compute capability + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm80_ptx, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0'] + # We expect this to pass, so no need to check errors + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) + 
self.assertTrue(device_code_regex_success.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) + self.assertTrue(ptx_code_regex_success.search(outtxt), msg) + + + # Test case 3: --cuda-compute-capabilities=8.0 and mocking a binary that contains only 9.0 ELF code + # This means we expect the build to fail, so we'll do an assertErrorRegex to check that + # Subsequently, we rerun with raise_error=False so we can check the debugging output + # There, we expect EB to tell us that 8.0 code was expected, but only 9.0 code was found + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0'] + # We expect this to fail + error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " + error_pattern += ".*/bin/toy. Surplus compute capabilities: 9.0. Missing compute capabilities: 8.0." + with self.mocked_stdout_stderr(): + self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, + extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) + self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) + self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) + + + # Test case 4: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains both 8.0 and 9.0 ELF code + # This means the build should succeed, so we can run with raise_error=True and check the output + # for the expected debugging output. 
+ # We also check here for the warning that no PTX code for the highest compute capability (9.0) was found + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0,9.0'] + # We expect this to succeed + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) + self.assertTrue(device_code_regex_success.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) + self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) - + # Test case 5: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that only contains 8.0 ELF code + # This means we expect the build to fail, so we'll do an assertErrorRegex to check that + # Subsequently, we rerun with raise_error=False so we can check the debugging output for the debugging + # output which tells us it expected 8.0 and 9.0, but only found 8.0 ELF code + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0,9.0'] + # We expect this to fail + error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " + error_pattern += ".*/bin/toy. Missing compute capabilities: 9.0."
+ with self.mocked_stdout_stderr(): + self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, + extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + msg = "Pattern %s not found in full build log: %s" % (device_missing_90_code_regex.pattern, outtxt) + self.assertTrue(device_missing_90_code_regex.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) + self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) + # Test case 6: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains 8.0 and 9.0 ELF code + # as well as 9.0 PTX code + # This means the build should succeed, so we can run with raise_error=True and check the output + # for the expected debugging output + # It also means we expect output confirming that PTX code was found for the highest compute capability + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm90_ptx, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0,9.0'] + # We expect this to succeed + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) + self.assertTrue(device_code_regex_success.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) + self.assertTrue(ptx_code_regex_success.search(outtxt), msg) + + # Test case 7: --cude-compute-capabilities=8.0 --strict-cuda-sanity-check and mocking a binary that contains + # 8.0 and 9.0 ELF code + # This means 
we expect the build to fail, so we'll do an assertErrorRegex to check that + # Subsequently, we rerun with raise_error=False so we can check the debugging output + # There, we expect EB to tell us that only 8.0 code was expected, but both 8.0 and 9.0 code was found + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0', '--strict-cuda-sanity-check'] + # We expect this to fail + error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " + error_pattern += ".*/bin/toy. Surplus compute capabilities: 9.0." + with self.mocked_stdout_stderr(): + self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, + extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + msg = "Pattern %s not found in full build log: %s" % (device_surplus_90_code_regex.pattern, outtxt) + self.assertTrue(device_surplus_90_code_regex.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) + self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) - # Test single CUDA compute capability with --cuda-compute-capabilities=8.0 - - # Test multiple CUDA compute capabilities with --cuda-compute-capabilities=8.0,9.0 - # Test stric CUDA check with --cuda-compute-capabilities=8.0 and a binary that also contains also 9.0 code + # Test case 8: --cuda-compute-capabilities=8.0 and mocking a binary that contains 9.0 ELF code + # but passing that binary on the ignore_cuda_sanity_failures list + # This means we expect the build to succeed and we'll check the output for the expected debugging output + test_ec = 
os.path.join(self.test_prefix, 'test.eb') + test_ec_txt = read_file(toy_ec) + test_ec_txt += "\ncuda_sanity_ignore_files = ['bin/toy']" + write_file(test_ec, test_ec_txt) + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0'] + # We expect this to succeed + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=test_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_ignored_regex.pattern, outtxt) + self.assertTrue(device_missing_80_code_ignored_regex.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) + self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) # Restore original environment modify_env(os.environ, start_env, verbose=False) From ba960aa966c2e943be13416de35814093fb4b86e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Mar 2025 23:34:36 +0100 Subject: [PATCH 012/114] Fix hound issues --- easybuild/framework/easyblock.py | 7 +++---- test/framework/toy_build.py | 13 ++++--------- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index b374c1455f..05bc15d6c3 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -110,7 +110,6 @@ from easybuild.tools.repository.repository import init_repository from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group -from easybuild.tools.toolchain.toolchain import TOOLCHAIN_CAPABILITY_CUDA from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, 
nub, quote_str from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION @@ -3340,7 +3339,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", cuda_dirs) else: - self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", + self.log.info("Using configured subdirectories for binaries/libraries to verify CUDA device code: %s", cuda_dirs) for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: @@ -3401,8 +3400,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msg += "(PTX architectures supported in that file: %s)" self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: - msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at least) the" - " highest CUDA compute capability in cuda_compute_capabilities") + msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at least)" + " the highest CUDA compute capability in cuda_compute_capabilities") self.log.debug(msg) else: self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 141147cbf3..7b5469d8fb 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3088,18 +3088,18 @@ def test_toy_cuda_sanity_check(self): device_surplus_90_code_regex = re.compile(device_surplus_90_code_regex_pattern, re.M) ptx_code_regex_success_pattern = r"DEBUG Output of 'cuobjdump' checked for '.*/bin/toy'; ptx code was " - ptx_code_regex_success_pattern += "present for \(at least\) the highest CUDA compute capability in " + ptx_code_regex_success_pattern += r"present for \(at least\) the highest CUDA compute capability in " ptx_code_regex_success_pattern += 
"cuda_compute_capabilities" ptx_code_regex_success = re.compile(ptx_code_regex_success_pattern, re.M) ptx_code_missing_80_regex_pattern = r"Configured highest compute capability was '8.0', but no PTX code " ptx_code_missing_80_regex_pattern += "for this compute capability was found in '.*/bin/toy' " - ptx_code_missing_80_regex_pattern += "\(PTX architectures supported in that file: \[\]\)" + ptx_code_missing_80_regex_pattern += r"\(PTX architectures supported in that file: \[\]\)" ptx_code_missing_80_regex = re.compile(ptx_code_missing_80_regex_pattern, re.M) ptx_code_missing_90_regex_pattern = r"Configured highest compute capability was '9.0', but no PTX code " ptx_code_missing_90_regex_pattern += "for this compute capability was found in '.*/bin/toy' " - ptx_code_missing_90_regex_pattern += "\(PTX architectures supported in that file: \[\]\)" + ptx_code_missing_90_regex_pattern += r"\(PTX architectures supported in that file: \[\]\)" ptx_code_missing_90_regex = re.compile(ptx_code_missing_90_regex_pattern, re.M) # Create temporary subdir for cuobjdump, so that we don't have to add self.test_prefix itself to the PATH @@ -3147,7 +3147,6 @@ def test_toy_cuda_sanity_check(self): self.assertTrue(device_code_regex_success.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) self.assertTrue(ptx_code_regex_success.search(outtxt), msg) - # Test case 3: --cuda-compute-capabilities=8.0 and mocking a binary that contains only 9.0 ELF code # This means we expect the build to fail, so we'll do an assertErrorRegex to check that @@ -3169,7 +3168,6 @@ def test_toy_cuda_sanity_check(self): msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) - # Test case 4: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains both 8.0 and 9.0 ELF code # This means the build should succeed, so we can run with 
raise_error=True and check the output # for the expected debugging output. @@ -3187,7 +3185,6 @@ def test_toy_cuda_sanity_check(self): msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) - # Test case 5: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that only contains 8.0 ELF code # This means we expect the build to fail, so we'll do an assertErrorRegex to check that # Subsequently, we rerun with raise_error=False so we can check the debugging output for the debugging @@ -3207,7 +3204,6 @@ def test_toy_cuda_sanity_check(self): self.assertTrue(device_missing_90_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) - # Test case 6: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains 8.0 and 9.0 ELF code # as well as 9.0 PTX code @@ -3228,7 +3224,7 @@ def test_toy_cuda_sanity_check(self): msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) self.assertTrue(ptx_code_regex_success.search(outtxt), msg) - # Test case 7: --cude-compute-capabilities=8.0 --strict-cuda-sanity-check and mocking a binary that contains + # Test case 7: --cude-compute-capabilities=8.0 --strict-cuda-sanity-check and mocking a binary that contains # 8.0 and 9.0 ELF code # This means we expect the build to fail, so we'll do an assertErrorRegex to check that # Subsequently, we rerun with raise_error=False so we can check the debugging output @@ -3250,7 +3246,6 @@ def test_toy_cuda_sanity_check(self): msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) - # Test case 8: --cuda-compute-capabilities=8.0 and mocking a binary that contains 9.0 ELF code # but passing that binary on the 
ignore_cuda_sanity_failures list # This means we expect the build to succeed and we'll check the output for the expected debugging output From f569ba4f9127017effe4ae2b159b47241dc382de Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 5 Mar 2025 23:38:05 +0100 Subject: [PATCH 013/114] flake8 compliance: add extra blank line --- easybuild/tools/systemtools.py | 1 + 1 file changed, 1 insertion(+) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index 5afe1717e8..2680b67916 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -223,6 +223,7 @@ - ptx_archs: a list of CUDA (virtual) device compute capabilities for which ptx code was found """ + class SystemToolsException(Exception): """raised when systemtools fails""" From 17dc7556d0939753ddb2c7948c781f5fa2cde1ae Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 15:30:02 +0200 Subject: [PATCH 014/114] Make sure strict_cuda_sanity_check is in the list for the correct default (false). 
Add accept-ptx-as-cc-support and accept-missing-cuda-ptx options --- easybuild/tools/config.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py index 631106b160..bf728049cf 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -342,6 +342,9 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'use_existing_modules', 'use_f90cache', 'wait_on_lock_limit', + 'strict_cuda_sanity_check', + 'accept_ptx_as_cc_support', + 'accept_missing_cuda_ptx', ], True: [ 'cleanup_builddir', @@ -359,7 +362,6 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'pre_create_installdir', 'show_progress_bar', 'strict_rpath_sanity_check', - 'strict_cuda_sanity_check', 'trace', ], EMPTY_LIST: [ From 354f071cc24e05c697f6c89c39df178b9f45a74e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 15:30:27 +0200 Subject: [PATCH 015/114] Add accept-ptx-as-cc-support and accept-missing-cuda-ptx options --- easybuild/tools/options.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index c3025118f4..33e42c15b8 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -398,7 +398,13 @@ def override_options(self): int, 'store_or_None', None), 'cuda-compute-capabilities': ("List of CUDA compute capabilities to use when building GPU software; " "values should be specified as digits separated by a dot, " - "for example: 3.5,5.0,7.2", 'strlist', 'extend', None), + "for example: 3.5,5.0,7.2. EasyBuild will compile a fat binaries with " + "support for (at least) all requested CUDA compute capabilities, and " + "PTX code for the highest CUDA compute capability (for forwards " + "compatibility). 
The check on this behavior may be relaxed using " + "--accept-ptx-for-cc-support or --accept-missing-ptx, or made more " + "stringent using --strict-cuda-sanity-check.", + 'strlist', 'extend', None), 'debug-lmod': ("Run Lmod modules tool commands in debug module", None, 'store_true', False), 'default-opt-level': ("Specify default optimisation level", 'choice', 'store', DEFAULT_OPT_LEVEL, Compiler.COMPILER_OPT_OPTIONS), @@ -554,6 +560,18 @@ def override_options(self): "With this setting, the sanity check will also fail if code is present for " "more compute capabilities than defined in --cuda-compute-capabilities.", None, 'store_true', False), + 'accept-ptx-as-cc-support': ("CUDA sanity check also passes if requested device code is not present, as " + "long as a PTX code is present that can be JIT-compiled into the requestd " + "device code. E.g. if --cuda-compute-capabilities=8.0 and a binary is found " + "in the installation that does not have device code for 8.0, but does have " + "PTX code for 7.0, the sanity check will pass if, and only if, this option " + "is True. 
Note that JIT-compiling means the binary will work on the " + "requested architecture, but is it not necessarily as well optimized as when " + "actual device code is present for the requested architecture ", + None, 'store_true', False), + 'accept-missing-cuda-ptx': ("CUDA sanity check also passes if PTX code for the highest requested CUDA " + "compute capability is not present (but will print a warning)", + None, 'store_true', False), 'sysroot': ("Location root directory of system, prefix for standard paths like /usr/lib and /usr/include", None, 'store', None), 'trace': ("Provide more information in output to stdout on progress", None, 'store_true', True, 'T'), From 4634cd4405f73d810a3e804df26bd25893389bd8 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 15:54:32 +0200 Subject: [PATCH 016/114] Fix indentation mistake --- easybuild/tools/options.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index 33e42c15b8..d60fa7a42f 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -570,8 +570,8 @@ def override_options(self): "actual device code is present for the requested architecture ", None, 'store_true', False), 'accept-missing-cuda-ptx': ("CUDA sanity check also passes if PTX code for the highest requested CUDA " - "compute capability is not present (but will print a warning)", - None, 'store_true', False), + "compute capability is not present (but will print a warning)", + None, 'store_true', False), 'sysroot': ("Location root directory of system, prefix for standard paths like /usr/lib and /usr/include", None, 'store', None), 'trace': ("Provide more information in output to stdout on progress", None, 'store_true', True, 'T'), From dd5feda9e3d59ce6714c05dad5e6e5a99d6062ce Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 15:58:38 +0200 Subject: [PATCH 017/114] Get rid of early return, which apparently BDFL doesn't like 
:) --- easybuild/framework/easyblock.py | 135 +++++++++++++++---------------- 1 file changed, 67 insertions(+), 68 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index e58a103c5a..61eb7cc9d6 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3367,82 +3367,81 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # If there are no CUDA compute capabilities defined, return if cfg_ccs is None or len(cfg_ccs) == 0: self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities where configured") - return fails - - if cuda_dirs is None: - cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() - - if not cuda_dirs: - cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS - self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", - cuda_dirs) else: - self.log.info("Using configured subdirectories for binaries/libraries to verify CUDA device code: %s", - cuda_dirs) + if cuda_dirs is None: + cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() - for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: - if os.path.exists(dirpath): - self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}") + if not cuda_dirs: + cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS + self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", + cuda_dirs) + else: + self.log.info("Using configured subdirectories for binaries/libraries to verify CUDA device code: %s", + cuda_dirs) - for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: - self.log.debug("Sanity checking for CUDA device code in %s", path) + for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: + if os.path.exists(dirpath): + self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}") - res = get_cuda_device_code_architectures(path) - if res is None: - msg = f"{path} does not 
appear to be a CUDA executable (no CUDA device code found), " - msg += "so skipping CUDA sanity check." - self.log.debug(msg) - else: - # unpack results - derived_ccs = res.device_code_archs - derived_ptx_ccs = res.ptx_archs - - # check whether device code architectures match cuda_compute_capabilities - additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) - missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) - - if additional_ccs or missing_ccs: - # Do we log this as warning or produce a sanity failure? - is_failure = False - fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " - if additional_ccs: - fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) - if strict_cc_check: - is_failure = True - if missing_ccs: - fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs)) - is_failure = True - # We still log the result, but don't fail: - if path in ignore_file_list: - fail_msg += f"This failure will be ignored as '{path}' is listed in " - fail_msg += "'ignore_cuda_sanity_failures'." - is_failure = False + for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: + self.log.debug("Sanity checking for CUDA device code in %s", path) - # Log warning or sanity error - if is_failure: - fails.append(fail_msg) - else: - self.log.warning(fail_msg) - else: - msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " - "those in cuda_compute_capabilities") + res = get_cuda_device_code_architectures(path) + if res is None: + msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " + msg += "so skipping CUDA sanity check." 
self.log.debug(msg) + else: + # unpack results + derived_ccs = res.device_code_archs + derived_ptx_ccs = res.ptx_archs - # Check whether there is ptx code for the highest CC in cfg_ccs - highest_cc = [sorted(cfg_ccs)[-1]] - missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) + # check whether device code architectures match cuda_compute_capabilities + additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) + missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) - if missing_ptx_ccs: - fail_msg = "Configured highest compute capability was '%s', " - fail_msg += "but no PTX code for this compute capability was found in '%s' " - fail_msg += "(PTX architectures supported in that file: %s)" - self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) - else: - msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at least)" - " the highest CUDA compute capability in cuda_compute_capabilities") - self.log.debug(msg) - else: - self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") + if additional_ccs or missing_ccs: + # Do we log this as warning or produce a sanity failure? + is_failure = False + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " + if additional_ccs: + fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) + if strict_cc_check: + is_failure = True + if missing_ccs: + fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs)) + is_failure = True + # We still log the result, but don't fail: + if path in ignore_file_list: + fail_msg += f"This failure will be ignored as '{path}' is listed in " + fail_msg += "'ignore_cuda_sanity_failures'." 
+ is_failure = False + + # Log warning or sanity error + if is_failure: + fails.append(fail_msg) + else: + self.log.warning(fail_msg) + else: + msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " + "those in cuda_compute_capabilities") + self.log.debug(msg) + + # Check whether there is ptx code for the highest CC in cfg_ccs + highest_cc = [sorted(cfg_ccs)[-1]] + missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) + + if missing_ptx_ccs: + fail_msg = "Configured highest compute capability was '%s', " + fail_msg += "but no PTX code for this compute capability was found in '%s' " + fail_msg += "(PTX architectures supported in that file: %s)" + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) + else: + msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at least)" + " the highest CUDA compute capability in cuda_compute_capabilities") + self.log.debug(msg) + else: + self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") return fails From 25fccd13198362bb99186c30f8f90415d79343f7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 16:00:06 +0200 Subject: [PATCH 018/114] Fix too long line --- easybuild/framework/easyblock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 61eb7cc9d6..4fe6e59946 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3437,8 +3437,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msg += "(PTX architectures supported in that file: %s)" self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: - msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at least)" - " the highest CUDA compute capability in cuda_compute_capabilities") + msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " + 
"least) the highest CUDA compute capability in cuda_compute_capabilities") self.log.debug(msg) else: self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") From 563ba3a82e7633a82685147406a52dfe0486b30d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 16:07:14 +0200 Subject: [PATCH 019/114] Implement small review comments --- easybuild/framework/easyblock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 4fe6e59946..94f5111836 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3381,7 +3381,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: if os.path.exists(dirpath): - self.log.debug(f"Sanity checking files for CUDA device code in {dirpath}") + self.log.debug(f"Sanity checking files for CUDA device code under folder {dirpath}") for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: self.log.debug("Sanity checking for CUDA device code in %s", path) @@ -3412,7 +3412,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs)) is_failure = True # We still log the result, but don't fail: - if path in ignore_file_list: + if is_failure and path in ignore_file_list: fail_msg += f"This failure will be ignored as '{path}' is listed in " fail_msg += "'ignore_cuda_sanity_failures'." 
is_failure = False From ed49c46a54be484cd9af7dc2a3ab7b00f192196a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 16:07:43 +0200 Subject: [PATCH 020/114] Fixed typo --- easybuild/framework/easyconfig/default.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyconfig/default.py b/easybuild/framework/easyconfig/default.py index f6ce63c9dc..b291974fe6 100644 --- a/easybuild/framework/easyconfig/default.py +++ b/easybuild/framework/easyconfig/default.py @@ -124,7 +124,7 @@ 'after make (for e.g.,"test" for make test)'), BUILD], 'bin_lib_subdirs': [[], "List of subdirectories for binaries and libraries, which is used during sanity check " "to check RPATH linking and banned/required libraries", BUILD], - 'cuda_sanity_ignore_files': [[], "List of files (relative to the installation prefix) for which failurs in " + 'cuda_sanity_ignore_files': [[], "List of files (relative to the installation prefix) for which failures in " "the CUDA sanity check step are ignored. Typically used for files where you " "know the CUDA architectures in those files don't match the " "--cuda-compute-capabitilities configured for EasyBuild AND where you know " From f2f252d8614461532e04f23fb15100469523ebbe Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 16:23:52 +0200 Subject: [PATCH 021/114] Check for presence of cuobjdump. 
If sanity check isn't run, raise that to a warning message --- easybuild/framework/easyblock.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 94f5111836..a959327802 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -51,6 +51,7 @@ import os import random import re +import shutil import stat import sys import tempfile @@ -4034,12 +4035,15 @@ def xs2str(xs): self.log.debug("Skipping RPATH sanity check") if get_software_root('CUDA'): - cuda_fails = self.sanity_check_cuda() - if cuda_fails: - self.log.warning("CUDA device code sanity check failed!") - self.sanity_check_fail_msgs.extend(cuda_fails) + if shutil.which('cuobjdump'): + cuda_fails = self.sanity_check_cuda() + if cuda_fails: + self.log.warning("CUDA device code sanity check failed!") + self.sanity_check_fail_msgs.extend(cuda_fails) + else: + self.log.warning("Skipping CUDA sanity check: cuobjdump not found") else: - self.log.debug("Skipping CUDA device code sanity check") + self.log.warning("Skipping CUDA sanity check: CUDA module was not loaded") # pass or fail if self.sanity_check_fail_msgs: From 1e97753a47dd796e3c56a239f2e6985cdf5f3ca9 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 16:26:25 +0200 Subject: [PATCH 022/114] Raise missing cuobjdump to error: if CUDA root was defined, then this IS CUDA software. 
Not being able to run the CUDA sanity check should then be considered a failure --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index a959327802..a6f09cb795 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -4041,7 +4041,7 @@ def xs2str(xs): self.log.warning("CUDA device code sanity check failed!") self.sanity_check_fail_msgs.extend(cuda_fails) else: - self.log.warning("Skipping CUDA sanity check: cuobjdump not found") + raise EasyBuildError("Failed to execute CUDA sanity check: cuobjdump not found") else: self.log.warning("Skipping CUDA sanity check: CUDA module was not loaded") From 39e5652980828dfe9fd06c7b3706924a1f402a3f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 16:27:16 +0200 Subject: [PATCH 023/114] downgrade to debug message, as the CUDA root not being defined is pretty 'common': this is the case for all builds of NON-cuda software :) --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index a6f09cb795..a928ef6b1c 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -4043,7 +4043,7 @@ def xs2str(xs): else: raise EasyBuildError("Failed to execute CUDA sanity check: cuobjdump not found") else: - self.log.warning("Skipping CUDA sanity check: CUDA module was not loaded") + self.log.debug("Skipping CUDA sanity check: CUDA module was not loaded") # pass or fail if self.sanity_check_fail_msgs: From 8e70838916817eb125276ce3cacfc977ed46ccab Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 16:37:14 +0200 Subject: [PATCH 024/114] Limit to a single return statement --- easybuild/tools/systemtools.py | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git 
a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index 9ab8681fe2..ba81b30644 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -1021,25 +1021,25 @@ def get_cuda_object_dump_raw(path): _log.warning(fail_msg) # check that the file is an executable or library/object + result = None if any(x in res.output for x in ['executable', 'object', 'library']): cuda_cmd = f"cuobjdump {path}" - else: - return None - res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False) - if res.exit_code == EasyBuildExit.SUCCESS: - return res.output - else: - # Check and report for the common case that this is simply not a CUDA binary, i.e. does not - # contain CUDA device code - no_device_code_match = re.search(r'does not contain device code', res.output) - if no_device_code_match is not None: - msg = "'%s' does not appear to be a CUDA binary: cuobjdump failed to find device code in this file" - _log.debug(msg, path) + res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False) + if res.exit_code == EasyBuildExit.SUCCESS: + result = res.output else: - msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'" - _log.debug(msg, path, cuda_cmd, res.output) - return None + # Check and report for the common case that this is simply not a CUDA binary, i.e. does not + # contain CUDA device code + no_device_code_match = re.search(r'does not contain device code', res.output) + if no_device_code_match is not None: + msg = "'%s' does not appear to be a CUDA binary: cuobjdump failed to find device code in this file" + _log.debug(msg, path) + else: + msg = "Dumping CUDA binary file information for '%s' via '%s' failed! 
Output: '%s'" + _log.debug(msg, path, cuda_cmd, res.output) + + return result def get_cuda_device_code_architectures(path): From dbf7a7e81f774f28d07fdd74e05e2228276aec48 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 16:40:35 +0200 Subject: [PATCH 025/114] No more early returns --- easybuild/tools/systemtools.py | 87 +++++++++++++++++----------------- 1 file changed, 44 insertions(+), 43 deletions(-) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index ba81b30644..5b4880516a 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -1075,53 +1075,54 @@ def get_cuda_device_code_architectures(path): if os.path.islink(path) and os.path.exists(path): path = os.path.realpath(path) + dev_ptx_archs = None cuda_raw = get_cuda_object_dump_raw(path) - if cuda_raw is None: - return None - - # extract unique device code architectures from raw dump - device_code_matches = re.findall(device_code_regex, cuda_raw) - if device_code_matches is not None: - # convert match tuples into unique list of cuda compute capabilities - # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] - device_code_matches = sorted(['.'.join(m) for m in set(device_code_matches)]) - else: - # Try to be clear in the warning... did we not find elf code sections at all? or was the arch missing? - device_section_regex = re.compile('Fatbin elf code') - device_section_matches = re.findall(device_section_regex, cuda_raw) - if device_section_matches is not None: - fail_msg = f"Found Fatbin elf code section(s) in cuobjdump output for {path}, " - fail_msg += "but failed to extract CUDA architecture" + if cuda_raw is not None: + # extract unique device code architectures from raw dump + device_code_matches = re.findall(device_code_regex, cuda_raw) + if device_code_matches is not None: + # convert match tuples into unique list of cuda compute capabilities + # e.g. 
[('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] + device_code_matches = sorted(['.'.join(m) for m in set(device_code_matches)]) else: - # In this case, the cuobjdump command _likely_ already returned a non-zero exit - # This error message would only be displayed if cuobjdump somehow completely successfully - # but still no Fatbin elf code section was found - fail_msg = f"Failed to find Fatbin elf code section(s) in cuobjdump output for {path}, " - fail_msg += "are you sure this is a CUDA binary?" - _log.warning(fail_msg) - - # extract unique ptx code architectures from raw dump - ptx_code_matches = re.findall(ptx_code_regex, cuda_raw) - if ptx_code_matches is not None: - # convert match tuples into unique list of cuda compute capabilities - # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] - ptx_code_matches = sorted(['.'.join(m) for m in set(ptx_code_matches)]) - else: - # Try to be clear in the warning... did we not find ptx code sections at all? or was the arch missing? - ptx_section_regex = re.compile('Fatbin ptx code') - ptx_section_matches = re.findall(ptx_section_regex, cuda_raw) - if ptx_section_matches is not None: - fail_msg = f"Found Fatbin ptx code section(s) in cuobjdump output for {path}, " - fail_msg += "but failed to extract CUDA architecture" + # Try to be clear in the warning... did we not find elf code sections at all? or was the arch missing? 
+ device_section_regex = re.compile('Fatbin elf code') + device_section_matches = re.findall(device_section_regex, cuda_raw) + if device_section_matches is not None: + fail_msg = f"Found Fatbin elf code section(s) in cuobjdump output for {path}, " + fail_msg += "but failed to extract CUDA architecture" + else: + # In this case, the cuobjdump command _likely_ already returned a non-zero exit + # This error message would only be displayed if cuobjdump somehow completely successfully + # but still no Fatbin elf code section was found + fail_msg = f"Failed to find Fatbin elf code section(s) in cuobjdump output for {path}, " + fail_msg += "are you sure this is a CUDA binary?" + _log.warning(fail_msg) + + # extract unique ptx code architectures from raw dump + ptx_code_matches = re.findall(ptx_code_regex, cuda_raw) + if ptx_code_matches is not None: + # convert match tuples into unique list of cuda compute capabilities + # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] + ptx_code_matches = sorted(['.'.join(m) for m in set(ptx_code_matches)]) else: - # In this case, the cuobjdump command _likely_ already returned a non-zero exit - # This error message would only be displayed if cuobjdump somehow completely successfully - # but still no Fatbin ptx code section was found - fail_msg = f"Failed to find Fatbin ptx code section(s) in cuobjdump output for {path}, " - fail_msg += "are you sure this is a CUDA binary?" - _log.warning(fail_msg) + # Try to be clear in the warning... did we not find ptx code sections at all? or was the arch missing? 
+ ptx_section_regex = re.compile('Fatbin ptx code') + ptx_section_matches = re.findall(ptx_section_regex, cuda_raw) + if ptx_section_matches is not None: + fail_msg = f"Found Fatbin ptx code section(s) in cuobjdump output for {path}, " + fail_msg += "but failed to extract CUDA architecture" + else: + # In this case, the cuobjdump command _likely_ already returned a non-zero exit + # This error message would only be displayed if cuobjdump somehow completely successfully + # but still no Fatbin ptx code section was found + fail_msg = f"Failed to find Fatbin ptx code section(s) in cuobjdump output for {path}, " + fail_msg += "are you sure this is a CUDA binary?" + _log.warning(fail_msg) + + dev_ptx_archs = cuda_dev_ptx_archs(ptx_archs=ptx_code_matches, device_code_archs=device_code_matches) - return cuda_dev_ptx_archs(ptx_archs=ptx_code_matches, device_code_archs=device_code_matches) + return dev_ptx_archs def get_linked_libs_raw(path): From 7a919f3735d729238e8aa766f9002feb65e21802 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 3 Apr 2025 17:01:04 +0200 Subject: [PATCH 026/114] Start adding summary data --- easybuild/framework/easyblock.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index a928ef6b1c..9204957455 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3380,6 +3380,20 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): self.log.info("Using configured subdirectories for binaries/libraries to verify CUDA device code: %s", cuda_dirs) + # Tracking some numbers for a summary report: + num_cuda_files = 0 + num_files_missing_cc = 0 + num_files_surplus_cc = 0 + num_files_missing_ptx = 0 + num_files_missing_cc_but_has_ptx = 0 + + # Creating lists of files for summary report: + files_missing_cc = [] + files_surplus_cc = [] + files_missing_ptx = [] + files_missing_cc_but_has_ptx = [] + + 
# Looping through all files to check CUDA device and PTX code for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: if os.path.exists(dirpath): self.log.debug(f"Sanity checking files for CUDA device code under folder {dirpath}") @@ -3393,6 +3407,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): msg += "so skipping CUDA sanity check." self.log.debug(msg) else: + num_cuda_files += 1 # unpack results derived_ccs = res.device_code_archs derived_ptx_ccs = res.ptx_archs @@ -3406,10 +3421,16 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): is_failure = False fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " if additional_ccs: + # Count and log for summary report + files_surplus_cc.append(path) + num_files_surplus_cc += 1 fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) if strict_cc_check: is_failure = True if missing_ccs: + # Count and log for summary report + files_missing_cc.append(path) + num_files_missing_cc += 1 fail_msg += "Missing compute capabilities: %s. " % ', '.join(sorted(missing_ccs)) is_failure = True # We still log the result, but don't fail: @@ -3418,7 +3439,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msg += "'ignore_cuda_sanity_failures'." 
is_failure = False - # Log warning or sanity error + # If considered a failure, append to fails so that a sanity error will be thrown + # Otherwise, log a warning if is_failure: fails.append(fail_msg) else: From dab2042a8eb4911bb9a6cc540a63b1160bd89f56 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 14:51:35 +0200 Subject: [PATCH 027/114] Add logic to deal with accept-ptx-for-cc-support and --accept-missing-cuda-ptx --- easybuild/framework/easyblock.py | 57 +++++++++++++++++++++++++++----- 1 file changed, 48 insertions(+), 9 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 9204957455..c5e19e27ee 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3360,6 +3360,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fails = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) strict_cc_check = build_option('strict_cuda_sanity_check') + accept_ptx_as_cc = build_option('accept_ptx_as_cc_support') + accept_missing_ptx = build_option('accept_missing_cuda_ptx') # Construct the list of files to ignore as full paths (cuda_sanity_ignore_files contains the paths # to ignore, relative to the installation prefix) @@ -3407,6 +3409,17 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): msg += "so skipping CUDA sanity check." 
self.log.debug(msg) else: + # Here, we check if CUDA device code is present for all compute capabilities in + # --cuda-compute-capabilities for the file pointed to by 'path' + # We also check for the presence of ptx code for the highest CUDA compute capability + # The following is considered fail/warning/success: + # - Missing device code is considered a failure (unless there is PTX code for + # a lower CC AND --accept-ptx-for-cc-support is True, in which case it is a warning) + # - Device code for additional compute capabilities is considered a failure if + # --strict-cuda-sanity-check is True (otherwise, it's a warning) + # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities + # is considered a failure, unless --accept-missing-cuda-ptx is True (in which case it is + # a warning) num_cuda_files += 1 # unpack results derived_ccs = res.device_code_archs @@ -3417,23 +3430,41 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) if additional_ccs or missing_ccs: - # Do we log this as warning or produce a sanity failure? - is_failure = False fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " if additional_ccs: # Count and log for summary report files_surplus_cc.append(path) num_files_surplus_cc += 1 - fail_msg += "Surplus compute capabilities: %s. " % ', '.join(sorted(additional_ccs)) - if strict_cc_check: + surplus_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) + fail_msg += "Surplus compute capabilities: %s. " % surplus_ccs + if strict_cc_check: # Surplus compute capabilities not allowed is_failure = True + else: + is_failure = False if missing_ccs: # Count and log for summary report files_missing_cc.append(path) num_files_missing_cc += 1 - fail_msg += "Missing compute capabilities: %s. 
" % ', '.join(sorted(missing_ccs)) - is_failure = True - # We still log the result, but don't fail: + missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) + fail_msg += "Missing compute capabilities: %s. " % missing_cc_str + # If accept_ptx_as_cc, this might not be a failure _if_ there is suitable PTX + # code to JIT compile from that supports the CCs in missing_ccs + if accept_ptx_as_cc: + # Check that for each item in missing_ccs there is PTX code for lower or equal + # CUDA compute capability + comparisons = [] + for cc in missing_ccs: + has_smaller_equal = any(LooseVersion(derived_ptx_ccs) <= LooseVersion(cc)) + comparisons.append(has_smaller_equal) + # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a + # failure + if all(comparisons): + is_failure = False + else: + is_failure = True + else: + is_failure = True + # If we have a failure, demote to a warning if path is on the ignore_file_list if is_failure and path in ignore_file_list: fail_msg += f"This failure will be ignored as '{path}' is listed in " fail_msg += "'ignore_cuda_sanity_failures'." @@ -3451,14 +3482,22 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): self.log.debug(msg) # Check whether there is ptx code for the highest CC in cfg_ccs - highest_cc = [sorted(cfg_ccs)[-1]] + # Make sure to use LooseVersion so that e.g. 
9.0 < 9.0a < 9.2 < 9.10 + highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) if missing_ptx_ccs: + files_missing_ptx.append(path) + num_files_missing_ptx += 1 fail_msg = "Configured highest compute capability was '%s', " fail_msg += "but no PTX code for this compute capability was found in '%s' " fail_msg += "(PTX architectures supported in that file: %s)" - self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) + if path in ignore_file_list: + fail_msg = f"This failure will be ignored as '{path}' is listed in" + fail_msg += "'ignore_cuda_sanity_failures'." + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) + else: + fails.append(fail_msg) else: msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " "least) the highest CUDA compute capability in cuda_compute_capabilities") From be0a99040e9dd131e34eaffdd834520f6f5f9975 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 15:03:59 +0200 Subject: [PATCH 028/114] Further counting and summary collection --- easybuild/framework/easyblock.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index c5e19e27ee..4ae77953bc 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3388,12 +3388,14 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): num_files_surplus_cc = 0 num_files_missing_ptx = 0 num_files_missing_cc_but_has_ptx = 0 + num_files_ignored = 0 # Creating lists of files for summary report: files_missing_cc = [] files_surplus_cc = [] files_missing_ptx = [] files_missing_cc_but_has_ptx = [] + files_ignored = [] # Looping through all files to check CUDA device and PTX code for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: @@ -3443,8 +3445,6 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): 
is_failure = False if missing_ccs: # Count and log for summary report - files_missing_cc.append(path) - num_files_missing_cc += 1 missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) fail_msg += "Missing compute capabilities: %s. " % missing_cc_str # If accept_ptx_as_cc, this might not be a failure _if_ there is suitable PTX @@ -3459,13 +3459,22 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a # failure if all(comparisons): + files_missing_cc_but_has_ptx.append(path) + num_files_missing_cc_but_has_ptx += 1 is_failure = False else: + files_missing_cc.append(path) + num_files_missing_cc += 1 is_failure = True else: + files_missing_cc.append(path) + num_files_missing_cc += 1 is_failure = True + # If we have a failure, demote to a warning if path is on the ignore_file_list if is_failure and path in ignore_file_list: + files_ignored.append(path) + num_files_ignored += 1 fail_msg += f"This failure will be ignored as '{path}' is listed in " fail_msg += "'ignore_cuda_sanity_failures'." 
is_failure = False From 384c17a50474ca08574b2ded6b6a71b5afed90a9 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 15:12:52 +0200 Subject: [PATCH 029/114] Fix some hound issues --- easybuild/framework/easyblock.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 4ae77953bc..7ff53970dd 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3417,7 +3417,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # The following is considered fail/warning/success: # - Missing device code is considered a failure (unless there is PTX code for # a lower CC AND --accept-ptx-for-cc-support is True, in which case it is a warning) - # - Device code for additional compute capabilities is considered a failure if + # - Device code for additional compute capabilities is considered a failure if # --strict-cuda-sanity-check is True (otherwise, it's a warning) # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities # is considered a failure, unless --accept-missing-cuda-ptx is True (in which case it is @@ -3438,7 +3438,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): files_surplus_cc.append(path) num_files_surplus_cc += 1 surplus_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) - fail_msg += "Surplus compute capabilities: %s. " % surplus_ccs + fail_msg += "Surplus compute capabilities: %s. " % surplus_cc_str if strict_cc_check: # Surplus compute capabilities not allowed is_failure = True else: @@ -3505,8 +3505,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msg = f"This failure will be ignored as '{path}' is listed in" fail_msg += "'ignore_cuda_sanity_failures'." 
self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) + elif accept_missing_ptx: + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: - fails.append(fail_msg) + fails.append(fail_msg % (highest_cc[0], path, derived_ptx_ccs)) else: msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " "least) the highest CUDA compute capability in cuda_compute_capabilities") From 5b0dd4311e7e0082e57537e6af9a6295d28a7ac5 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 15:40:25 +0200 Subject: [PATCH 030/114] First attempt at summary report --- easybuild/framework/easyblock.py | 36 +++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 7ff53970dd..ac05a3cb92 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3500,11 +3500,11 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): num_files_missing_ptx += 1 fail_msg = "Configured highest compute capability was '%s', " fail_msg += "but no PTX code for this compute capability was found in '%s' " - fail_msg += "(PTX architectures supported in that file: %s)" + fail_msg += "(PTX architectures supported in that file: %s). " if path in ignore_file_list: - fail_msg = f"This failure will be ignored as '{path}' is listed in" + fail_msg += "This failure will be ignored as '%s' is listed in" fail_msg += "'ignore_cuda_sanity_failures'." 
- self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs, path) elif accept_missing_ptx: self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: @@ -3516,6 +3516,36 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): else: self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") + # Summary + summary_msg = "CUDA sanity check summary report:\n" + summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" + summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {num_files_missing_cc}\n" + if accept_ptx_as_cc: + summary_msg += f"Number of files missing one or more CUDA Compute Capabilities, but has suitable " + summary_msg += f"PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " + summary_msg += f"{num_files_missing_cc_but_has_ptx}\n" + summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " + summary_msg += f"{num_files_surplus_cc}\n" + summary_msg += f"Number of files missing PTX code for the highest configured CUDA Compute Capability: " + summary_msg += f"{num_files_missing_ptx}\n" + summary_msg += f"Number of files ignored in the CUDA Sanity Check: {num_files_ignored}\n" + if num_files_ignored > 0: + summary_msg += "Note: ignored files still count toward the aforementioned summary statistics" + self.log.info(summary_msg) + + summary_msg_debug = "Detailed CUDA sanity check summary report:\n" + summary_msg_debug += f"Files missing one or more CUDA compute capabilities: {files_missing_cc}\n" + if accept_ptx_as_cc: + summary_msg_debug += f"Files missing one or more CUDA Compute Capabilities, but has suitable PTX " + summary_msg_debug += f"code that can be JIT compiled for the requested CUDA Compute Capabilities: " + summary_msg_debug += f"{files_missing_cc_but_has_ptx}\n" + summary_msg_debug += f"Files with device code for more 
CUDA Compute Capabilities than requested: " + summary_msg_debug += f"{files_surplus_cc}\n" + summary_msg_debug += f"Files missing PTX code for the highest configured CUDA Compute Capability: " + summary_msg_debug += f"{files_missing_ptx}\n" + summary_msg_debug += f"Files ignored in the CUDA Saniyt Check: {files_ignored}\n" + self.log.debug(summary_msg_debug) + return fails def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True): From 039542f26364aad4490fac10d7cbec73921fa37d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 16:13:42 +0200 Subject: [PATCH 031/114] Make formatting more readable --- easybuild/framework/easyblock.py | 61 ++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index ac05a3cb92..464965ca4d 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3385,17 +3385,22 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Tracking some numbers for a summary report: num_cuda_files = 0 num_files_missing_cc = 0 + num_files_missing_cc_ignored =0 num_files_surplus_cc = 0 + num_files_surplus_cc_ignored = 0 num_files_missing_ptx = 0 + num_files_missing_ptx_ignored = 0 num_files_missing_cc_but_has_ptx = 0 - num_files_ignored = 0 # Creating lists of files for summary report: files_missing_cc = [] + files_missing_cc_ignored = [] files_surplus_cc = [] + files_surplus_cc_ignored = [] files_missing_ptx = [] + files_missing_ptx_ignored = [] files_missing_cc_but_has_ptx = [] - files_ignored = [] + # Looping through all files to check CUDA device and PTX code for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: @@ -3431,6 +3436,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) + # Message for when file is on the ignore list: 
+ ignore_msg = f"This failure will be ignored as '{path}' is listed in " + ignore_msg += "'ignore_cuda_sanity_failures'." + if additional_ccs or missing_ccs: fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " if additional_ccs: @@ -3443,6 +3452,14 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): is_failure = True else: is_failure = False + + # Turn failure into warning if on ignore list + if is_failure and path in ignore_file_list: + files_surplus_cc_ignored.append(path) + num_files_suprlus_cc_ignored += 1 + fail_msg += ignore_msg + is_failure = False + if missing_ccs: # Count and log for summary report missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) @@ -3471,13 +3488,12 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): num_files_missing_cc += 1 is_failure = True - # If we have a failure, demote to a warning if path is on the ignore_file_list - if is_failure and path in ignore_file_list: - files_ignored.append(path) - num_files_ignored += 1 - fail_msg += f"This failure will be ignored as '{path}' is listed in " - fail_msg += "'ignore_cuda_sanity_failures'." - is_failure = False + # Turn failure into warning if on ignore list + if is_failure and path in ignore_file_list: + files_missing_cc_ignored.append(path) + num_files_missing_cc_ignored += 1 + fail_msg += ignore_msg + is_failure = False # If considered a failure, append to fails so that a sanity error will be thrown # Otherwise, log a warning @@ -3502,9 +3518,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msg += "but no PTX code for this compute capability was found in '%s' " fail_msg += "(PTX architectures supported in that file: %s). " if path in ignore_file_list: - fail_msg += "This failure will be ignored as '%s' is listed in" - fail_msg += "'ignore_cuda_sanity_failures'." 
- self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs, path) + files_missing_ptx_ignored.append(path) + num_files_missing_ptx_ignored += 1 + fail_msg += ignore_msg + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) elif accept_missing_ptx: self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: @@ -3519,31 +3536,31 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Summary summary_msg = "CUDA sanity check summary report:\n" summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" - summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {num_files_missing_cc}\n" + summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {num_files_missing_cc} " + summary_msg += f"(ignored: {num_files_missing_cc_ignored})\n" if accept_ptx_as_cc: summary_msg += f"Number of files missing one or more CUDA Compute Capabilities, but has suitable " summary_msg += f"PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg += f"{num_files_missing_cc_but_has_ptx}\n" summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " - summary_msg += f"{num_files_surplus_cc}\n" + summary_msg += f"{num_files_surplus_cc} (ignored: {num_files_surplus_cc_ignored})\n" summary_msg += f"Number of files missing PTX code for the highest configured CUDA Compute Capability: " - summary_msg += f"{num_files_missing_ptx}\n" - summary_msg += f"Number of files ignored in the CUDA Sanity Check: {num_files_ignored}\n" - if num_files_ignored > 0: - summary_msg += "Note: ignored files still count toward the aforementioned summary statistics" + summary_msg += f"{num_files_missing_ptx} (ignored: {num_files_missing_ptx_ignored})" self.log.info(summary_msg) summary_msg_debug = "Detailed CUDA sanity check summary report:\n" summary_msg_debug += f"Files missing one or more CUDA compute capabilities: {files_missing_cc}\n" + 
summary_msg_debug += f"These failures are ignored for: {files_missing_cc_ignored})\n" if accept_ptx_as_cc: - summary_msg_debug += f"Files missing one or more CUDA Compute Capabilities, but has suitable PTX " + summary_msg_debug += f"Files missing one or more CUDA Compute Capabilities, but has suitable PTX " summary_msg_debug += f"code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg_debug += f"{files_missing_cc_but_has_ptx}\n" summary_msg_debug += f"Files with device code for more CUDA Compute Capabilities than requested: " - summary_msg_debug += f"{files_surplus_cc}\n" + summary_msg_debug += f"{files_surplus_cc}" + summary_msg_debug += f"These failures are ignored for: {files_surplus_cc_ignored})\n" summary_msg_debug += f"Files missing PTX code for the highest configured CUDA Compute Capability: " - summary_msg_debug += f"{files_missing_ptx}\n" - summary_msg_debug += f"Files ignored in the CUDA Saniyt Check: {files_ignored}\n" + summary_msg_debug += f"{files_missing_ptx}" + summary_msg_debug += f"These failures are ignored for: {files_missing_ptx_ignored})" self.log.debug(summary_msg_debug) return fails From ec0683da91621061182af7b5b1712c76cc802c9c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 16:19:17 +0200 Subject: [PATCH 032/114] Fix hound issues --- easybuild/framework/easyblock.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 464965ca4d..4db00b79a5 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3385,7 +3385,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Tracking some numbers for a summary report: num_cuda_files = 0 num_files_missing_cc = 0 - num_files_missing_cc_ignored =0 + num_files_missing_cc_ignored 0 num_files_surplus_cc = 0 num_files_surplus_cc_ignored = 0 num_files_missing_ptx = 0 @@ -3401,7 +3401,6 @@ def 
sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): files_missing_ptx_ignored = [] files_missing_cc_but_has_ptx = [] - # Looping through all files to check CUDA device and PTX code for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: if os.path.exists(dirpath): @@ -3456,7 +3455,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Turn failure into warning if on ignore list if is_failure and path in ignore_file_list: files_surplus_cc_ignored.append(path) - num_files_suprlus_cc_ignored += 1 + num_files_surplus_cc_ignored += 1 fail_msg += ignore_msg is_failure = False From 84a1905ce4767e81c9155e3f70e99e65eb1ff348 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 16:20:29 +0200 Subject: [PATCH 033/114] Fix one more hound issue --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 4db00b79a5..2a5cf92bb7 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3385,7 +3385,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Tracking some numbers for a summary report: num_cuda_files = 0 num_files_missing_cc = 0 - num_files_missing_cc_ignored 0 + num_files_missing_cc_ignored = 0 num_files_surplus_cc = 0 num_files_surplus_cc_ignored = 0 num_files_missing_ptx = 0 From 7508104b9468f28154674f97ffdc2cc625ff11a8 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 17:58:24 +0200 Subject: [PATCH 034/114] Add clear distinction between ignores & failures. Also, clearly indicated when some summary statistic is _not_ considered a failure (because of selected options). 
Also, add suggestions on how to resolve / accept certain sanity failures --- easybuild/framework/easyblock.py | 94 ++++++++++++++++++++++++-------- 1 file changed, 71 insertions(+), 23 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 2a5cf92bb7..b0c4e30eaf 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3385,19 +3385,25 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Tracking some numbers for a summary report: num_cuda_files = 0 num_files_missing_cc = 0 + num_files_missing_cc_fails = 0 num_files_missing_cc_ignored = 0 num_files_surplus_cc = 0 + num_files_surplus_cc_fails = 0 num_files_surplus_cc_ignored = 0 num_files_missing_ptx = 0 + num_files_missing_ptx_fails = 0 num_files_missing_ptx_ignored = 0 num_files_missing_cc_but_has_ptx = 0 # Creating lists of files for summary report: files_missing_cc = [] + files_missing_cc_fails = [] files_missing_cc_ignored = [] files_surplus_cc = [] + files_surplus_cc_fails = [] files_surplus_cc_ignored = [] files_missing_ptx = [] + files_missing_ptx_fails = [] files_missing_ptx_ignored = [] files_missing_cc_but_has_ptx = [] @@ -3448,17 +3454,18 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): surplus_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) fail_msg += "Surplus compute capabilities: %s. 
" % surplus_cc_str if strict_cc_check: # Surplus compute capabilities not allowed - is_failure = True + if path in ignore_file_list: + files_surplus_cc_ignored.append(path) + num_files_surplus_cc_ignored += 1 + fail_msg += ignore_msg + is_failure = False + else: + files_surplus_cc_fails.append(path) + num_files_surplus_cc_fails += 1 + is_failure = True else: is_failure = False - # Turn failure into warning if on ignore list - if is_failure and path in ignore_file_list: - files_surplus_cc_ignored.append(path) - num_files_surplus_cc_ignored += 1 - fail_msg += ignore_msg - is_failure = False - if missing_ccs: # Count and log for summary report missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) @@ -3470,7 +3477,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # CUDA compute capability comparisons = [] for cc in missing_ccs: - has_smaller_equal = any(LooseVersion(derived_ptx_ccs) <= LooseVersion(cc)) + has_smaller_equal = any( + LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in derived_ptx_ccs + ) comparisons.append(has_smaller_equal) # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a # failure @@ -3481,18 +3490,27 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): else: files_missing_cc.append(path) num_files_missing_cc += 1 - is_failure = True + if path in ignore_file_list: + files_missing_cc_ignored.append(path) + num_files_missing_cc_ignored += 1 + fail_msg += ignore_msg + is_failure = False + else: + files_missing_cc_fails.append(path) + num_files_missing_cc_fails += 1 + is_failure = True else: files_missing_cc.append(path) num_files_missing_cc += 1 - is_failure = True - - # Turn failure into warning if on ignore list - if is_failure and path in ignore_file_list: - files_missing_cc_ignored.append(path) - num_files_missing_cc_ignored += 1 - fail_msg += ignore_msg - is_failure = False + if path in ignore_file_list: + files_missing_cc_ignored.append(path) + 
num_files_missing_cc_ignored += 1 + fail_msg += ignore_msg + is_failure = False + else: + files_missing_cc_fails.append(path) + num_files_missing_cc_fails += 1 + is_failure = True # If considered a failure, append to fails so that a sanity error will be thrown # Otherwise, log a warning @@ -3524,6 +3542,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): elif accept_missing_ptx: self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: + files_missing_ptx_fails.append(path) + num_files_missing_ptx_fails += 1 fails.append(fail_msg % (highest_cc[0], path, derived_ptx_ccs)) else: msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " @@ -3536,15 +3556,43 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg = "CUDA sanity check summary report:\n" summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {num_files_missing_cc} " - summary_msg += f"(ignored: {num_files_missing_cc_ignored})\n" + summary_msg += f"(ignored: {num_files_missing_cc_ignored}, fails: {num_files_missing_cc_fails})\n" if accept_ptx_as_cc: summary_msg += f"Number of files missing one or more CUDA Compute Capabilities, but has suitable " summary_msg += f"PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg += f"{num_files_missing_cc_but_has_ptx}\n" summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " - summary_msg += f"{num_files_surplus_cc} (ignored: {num_files_surplus_cc_ignored})\n" + if strict_cc_check: + summary_msg += f"{num_files_surplus_cc} (ignored: {num_files_surplus_cc_ignored}, fails: {num_files_surplus_cc_fails})\n" + else: + summary_msg += f"{num_files_surplus_cc} (not running with --strict-cuda-sanity-check, so not considered failures)\n" summary_msg += f"Number of files missing PTX code for the highest configured CUDA 
Compute Capability: " - summary_msg += f"{num_files_missing_ptx} (ignored: {num_files_missing_ptx_ignored})" + if accept_missing_ptx: + summary_msg += f"{num_files_missing_ptx} (running with --accept-missing-cuda-ptx so not considered failures)\n" + else: + summary_msg += f"{num_files_missing_ptx} (ignored: {num_files_missing_ptx_ignored}, fails: {num_files_missing_ptx_fails})\n" + if not build_option('debug'): + summary_msg += f"Rerun with --debug to see a detailed list of files.\n" + # Give some advice + if num_files_missing_cc > 0 and not accept_ptx_as_cc: + summary_msg += "\nYou may consider rerunning with --accept-ptx-as-cc-support to accept binaries that " + summary_msg += "don't contain the device code for your requested CUDA Compute Capabilities, but that " + summary_msg += "do have PTX code that can be compiled for your requested CUDA Compute " + summary_msg += "Capabilities. Note that this may increase startup delay due to JIT compilation " + summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " + summary_msg += "all features specific to your hardware architecture.\n" + if num_files_surplus_cc > 0 and strict_cc_check: + summary_msg += "\nYou may consider running with --disable-strict-cuda-sanity-check. This means you'll " + summary_msg += "accept that some binaries may have CUDA Device Code for more architectures than you " + summary_msg += "requested, i.e. the binary is 'fatter' than you need. Bigger binaries may generally " + summary_msg += "cause some startup delay, and code path selection could introduce a small overhead, " + summary_msg += "though this is generally negligible.\n" + if num_files_missing_ptx > 0 and not accept_missing_ptx: + summary_msg += "\nYou may consider running with --accept-missing-cuda-ptx to accept binaries that don't " + summary_msg += "contain PTX code for the highest CUDA Compute Capability you requested. 
This breaks " + summary_msg += "forwards compatibility for newer CUDA Compute Capabilities (i.e. your compiled " + summary_msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " + summary_msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you.\n" self.log.info(summary_msg) summary_msg_debug = "Detailed CUDA sanity check summary report:\n" @@ -3555,10 +3603,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_debug += f"code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg_debug += f"{files_missing_cc_but_has_ptx}\n" summary_msg_debug += f"Files with device code for more CUDA Compute Capabilities than requested: " - summary_msg_debug += f"{files_surplus_cc}" + summary_msg_debug += f"{files_surplus_cc}\n" summary_msg_debug += f"These failures are ignored for: {files_surplus_cc_ignored})\n" summary_msg_debug += f"Files missing PTX code for the highest configured CUDA Compute Capability: " - summary_msg_debug += f"{files_missing_ptx}" + summary_msg_debug += f"{files_missing_ptx}\n" summary_msg_debug += f"These failures are ignored for: {files_missing_ptx_ignored})" self.log.debug(summary_msg_debug) From 213aceff8245cca08b1e74e26a057631298cd1dd Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 18:06:36 +0200 Subject: [PATCH 035/114] Truncate too long lines --- easybuild/framework/easyblock.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index b0c4e30eaf..07d619ade0 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3563,14 +3563,18 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"{num_files_missing_cc_but_has_ptx}\n" summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " if 
strict_cc_check: - summary_msg += f"{num_files_surplus_cc} (ignored: {num_files_surplus_cc_ignored}, fails: {num_files_surplus_cc_fails})\n" + summary_msg += f"{num_files_surplus_cc} (ignored: {num_files_surplus_cc_ignored}, fails: " + summary_msg += "{num_files_surplus_cc_fails})\n" else: - summary_msg += f"{num_files_surplus_cc} (not running with --strict-cuda-sanity-check, so not considered failures)\n" + summary_msg += f"{num_files_surplus_cc} (not running with --strict-cuda-sanity-check, so not " + summary_msg += "considered failures)\n" summary_msg += f"Number of files missing PTX code for the highest configured CUDA Compute Capability: " if accept_missing_ptx: - summary_msg += f"{num_files_missing_ptx} (running with --accept-missing-cuda-ptx so not considered failures)\n" + summary_msg += f"{num_files_missing_ptx} (running with --accept-missing-cuda-ptx so not considered " + summary_msg += "failures)\n" else: - summary_msg += f"{num_files_missing_ptx} (ignored: {num_files_missing_ptx_ignored}, fails: {num_files_missing_ptx_fails})\n" + summary_msg += f"{num_files_missing_ptx} (ignored: {num_files_missing_ptx_ignored}, fails: " + summary_msg += "{num_files_missing_ptx_fails})\n" if not build_option('debug'): summary_msg += f"Rerun with --debug to see a detailed list of files.\n" # Give some advice @@ -3588,9 +3592,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "cause some startup delay, and code path selection could introduce a small overhead, " summary_msg += "though this is generally negligible.\n" if num_files_missing_ptx > 0 and not accept_missing_ptx: - summary_msg += "\nYou may consider running with --accept-missing-cuda-ptx to accept binaries that don't " - summary_msg += "contain PTX code for the highest CUDA Compute Capability you requested. This breaks " - summary_msg += "forwards compatibility for newer CUDA Compute Capabilities (i.e. 
your compiled " + summary_msg += "\nYou may consider running with --accept-missing-cuda-ptx to accept binaries that " + summary_msg += "don't contain PTX code for the highest CUDA Compute Capability you requested. This " + summary_msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. your compiled " summary_msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " summary_msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you.\n" self.log.info(summary_msg) From e56cace94c9a4067e1db17ab249296dff99513a8 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 18:12:27 +0200 Subject: [PATCH 036/114] Fix linting errors --- easybuild/framework/easyblock.py | 16 ++++++++-------- easybuild/tools/systemtools.py | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 07d619ade0..33c1bf06f1 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3558,8 +3558,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {num_files_missing_cc} " summary_msg += f"(ignored: {num_files_missing_cc_ignored}, fails: {num_files_missing_cc_fails})\n" if accept_ptx_as_cc: - summary_msg += f"Number of files missing one or more CUDA Compute Capabilities, but has suitable " - summary_msg += f"PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " + summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but has suitable " + summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg += f"{num_files_missing_cc_but_has_ptx}\n" summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " if strict_cc_check: @@ -3568,7 +3568,7 @@ def 
sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): else: summary_msg += f"{num_files_surplus_cc} (not running with --strict-cuda-sanity-check, so not " summary_msg += "considered failures)\n" - summary_msg += f"Number of files missing PTX code for the highest configured CUDA Compute Capability: " + summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " if accept_missing_ptx: summary_msg += f"{num_files_missing_ptx} (running with --accept-missing-cuda-ptx so not considered " summary_msg += "failures)\n" @@ -3576,7 +3576,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"{num_files_missing_ptx} (ignored: {num_files_missing_ptx_ignored}, fails: " summary_msg += "{num_files_missing_ptx_fails})\n" if not build_option('debug'): - summary_msg += f"Rerun with --debug to see a detailed list of files.\n" + summary_msg += "Rerun with --debug to see a detailed list of files.\n" # Give some advice if num_files_missing_cc > 0 and not accept_ptx_as_cc: summary_msg += "\nYou may consider rerunning with --accept-ptx-as-cc-support to accept binaries that " @@ -3603,13 +3603,13 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_debug += f"Files missing one or more CUDA compute capabilities: {files_missing_cc}\n" summary_msg_debug += f"These failures are ignored for: {files_missing_cc_ignored})\n" if accept_ptx_as_cc: - summary_msg_debug += f"Files missing one or more CUDA Compute Capabilities, but has suitable PTX " - summary_msg_debug += f"code that can be JIT compiled for the requested CUDA Compute Capabilities: " + summary_msg_debug += "Files missing one or more CUDA Compute Capabilities, but has suitable PTX " + summary_msg_debug += "code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg_debug += f"{files_missing_cc_but_has_ptx}\n" - summary_msg_debug += f"Files with device code for more CUDA Compute Capabilities than 
requested: " + summary_msg_debug += "Files with device code for more CUDA Compute Capabilities than requested: " summary_msg_debug += f"{files_surplus_cc}\n" summary_msg_debug += f"These failures are ignored for: {files_surplus_cc_ignored})\n" - summary_msg_debug += f"Files missing PTX code for the highest configured CUDA Compute Capability: " + summary_msg_debug += "Files missing PTX code for the highest configured CUDA Compute Capability: " summary_msg_debug += f"{files_missing_ptx}\n" summary_msg_debug += f"These failures are ignored for: {files_missing_ptx_ignored})" self.log.debug(summary_msg_debug) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index 5b4880516a..c896a57a43 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -1038,7 +1038,7 @@ def get_cuda_object_dump_raw(path): else: msg = "Dumping CUDA binary file information for '%s' via '%s' failed! Output: '%s'" _log.debug(msg, path, cuda_cmd, res.output) - + return result @@ -1098,7 +1098,7 @@ def get_cuda_device_code_architectures(path): fail_msg = f"Failed to find Fatbin elf code section(s) in cuobjdump output for {path}, " fail_msg += "are you sure this is a CUDA binary?" 
_log.warning(fail_msg) - + # extract unique ptx code architectures from raw dump ptx_code_matches = re.findall(ptx_code_regex, cuda_raw) if ptx_code_matches is not None: From 1c230d68c761f273d089ea394dca93ff34508f0f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 8 Apr 2025 18:20:08 +0200 Subject: [PATCH 037/114] Fix missing f for f-strings: --- easybuild/framework/easyblock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 33c1bf06f1..fb7b8dfbdd 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3564,7 +3564,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " if strict_cc_check: summary_msg += f"{num_files_surplus_cc} (ignored: {num_files_surplus_cc_ignored}, fails: " - summary_msg += "{num_files_surplus_cc_fails})\n" + summary_msg += f"{num_files_surplus_cc_fails})\n" else: summary_msg += f"{num_files_surplus_cc} (not running with --strict-cuda-sanity-check, so not " summary_msg += "considered failures)\n" @@ -3574,7 +3574,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "failures)\n" else: summary_msg += f"{num_files_missing_ptx} (ignored: {num_files_missing_ptx_ignored}, fails: " - summary_msg += "{num_files_missing_ptx_fails})\n" + summary_msg += f"{num_files_missing_ptx_fails})\n" if not build_option('debug'): summary_msg += "Rerun with --debug to see a detailed list of files.\n" # Give some advice From 24f6b8a90b1e2dcbcdd7fb5a2e54f0a4b3056c87 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Apr 2025 16:57:00 +0200 Subject: [PATCH 038/114] Add option to ignore all CUDA sanity failures to not break current EasyBuild behaviour --- easybuild/tools/config.py | 1 + easybuild/tools/options.py | 3 +++ 2 files changed, 4 insertions(+) diff --git 
a/easybuild/tools/config.py b/easybuild/tools/config.py index bf728049cf..0ddee0fb8a 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -352,6 +352,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'cleanup_tmpdir', 'extended_dry_run_ignore_errors', 'fixed_installdir_naming_scheme', + 'ignore_cuda_sanity_failures', 'lib_lib64_symlink', 'lib64_fallback_sanity_check', 'lib64_lib_symlink', diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index d60fa7a42f..d4cdf503fc 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -553,6 +553,9 @@ def override_options(self): 'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involves unsetting " "$LD_LIBRARY_PATH before checking whether all required libraries are found", None, 'store_true', False), + 'ignore-cuda-sanity-failures': ("The CUDA sanity check will be performed, and a report will be printed, " + "but any failures in the CUDA sanity check will be ignored", + None, 'stroe_true', True) 'strict-cuda-sanity-check': ("Perform strict CUDA sanity check. 
Without this option, the CUDA sanity " "check will fail if the CUDA binaries don't contain code for (at least) " "all compute capabilities defined in --cude-compute-capabilities, but will " From 92073b1c0ee261ee59efa4f38ea1ceb92b836bae Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Apr 2025 17:00:53 +0200 Subject: [PATCH 039/114] Implement an option te report, but ignore _all_ failures --- easybuild/framework/easyblock.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index fb7b8dfbdd..efc7f631b9 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3359,6 +3359,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fails = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) + ignore_failures = build_option('ignore_cuda_sanity_failures') strict_cc_check = build_option('strict_cuda_sanity_check') accept_ptx_as_cc = build_option('accept_ptx_as_cc_support') accept_missing_ptx = build_option('accept_missing_cuda_ptx') @@ -3454,7 +3455,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): surplus_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) fail_msg += "Surplus compute capabilities: %s. 
" % surplus_cc_str if strict_cc_check: # Surplus compute capabilities not allowed - if path in ignore_file_list: + if path in ignore_file_list or ignore_failures: files_surplus_cc_ignored.append(path) num_files_surplus_cc_ignored += 1 fail_msg += ignore_msg @@ -3490,7 +3491,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): else: files_missing_cc.append(path) num_files_missing_cc += 1 - if path in ignore_file_list: + if path in ignore_file_list or ignore_failures: files_missing_cc_ignored.append(path) num_files_missing_cc_ignored += 1 fail_msg += ignore_msg @@ -3502,7 +3503,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): else: files_missing_cc.append(path) num_files_missing_cc += 1 - if path in ignore_file_list: + if path in ignore_file_list or ignore_failures: files_missing_cc_ignored.append(path) num_files_missing_cc_ignored += 1 fail_msg += ignore_msg @@ -3534,7 +3535,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msg = "Configured highest compute capability was '%s', " fail_msg += "but no PTX code for this compute capability was found in '%s' " fail_msg += "(PTX architectures supported in that file: %s). 
" - if path in ignore_file_list: + if path in ignore_file_list or ignore_failures: files_missing_ptx_ignored.append(path) num_files_missing_ptx_ignored += 1 fail_msg += ignore_msg From aecd62d9531538d1dc3426d32de0604d7e34416e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 9 Apr 2025 17:02:54 +0200 Subject: [PATCH 040/114] Fix typo and missing comma --- easybuild/tools/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index d4cdf503fc..19326b41e0 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -555,7 +555,7 @@ def override_options(self): None, 'store_true', False), 'ignore-cuda-sanity-failures': ("The CUDA sanity check will be performed, and a report will be printed, " "but any failures in the CUDA sanity check will be ignored", - None, 'stroe_true', True) + None, 'store_true', True), 'strict-cuda-sanity-check': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity " "check will fail if the CUDA binaries don't contain code for (at least) " "all compute capabilities defined in --cude-compute-capabilities, but will " From 74d73492805b481f14d6d08de4b7fae0438cfba3 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 14:30:31 +0200 Subject: [PATCH 041/114] Replaced all num_X with len(files_x), we don't need separate counters --- easybuild/framework/easyblock.py | 436 +++++++++++++++---------------- 1 file changed, 208 insertions(+), 228 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index efc7f631b9..5e5698d89b 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3357,7 +3357,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): self.log.info("Checking binaries/libraries for CUDA device code...") - fails = [] + fail_msgs = [] cfg_ccs = build_option('cuda_compute_capabilities') or 
self.cfg.get('cuda_compute_capabilities', None) ignore_failures = build_option('ignore_cuda_sanity_failures') strict_cc_check = build_option('strict_cuda_sanity_check') @@ -3371,251 +3371,231 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # If there are no CUDA compute capabilities defined, return if cfg_ccs is None or len(cfg_ccs) == 0: self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities where configured") + return fail_msgs + + if cuda_dirs is None: + cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() + + if not cuda_dirs: + cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS + self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", + cuda_dirs) else: - if cuda_dirs is None: - cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() + self.log.info("Using configured subdirectories for binaries/libraries to verify CUDA device code: %s", + cuda_dirs) + + # Tracking number of CUDA files for a summary report: + num_cuda_files = 0 + + # Creating lists of files for summary report: + files_missing_cc = [] + files_missing_cc_fails = [] + files_missing_cc_ignored = [] + files_surplus_cc = [] + files_surplus_cc_fails = [] + files_surplus_cc_ignored = [] + files_missing_ptx = [] + files_missing_ptx_fails = [] + files_missing_ptx_ignored = [] + files_missing_cc_but_has_ptx = [] + + # Looping through all files to check CUDA device and PTX code + for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: + if os.path.exists(dirpath): + self.log.debug(f"Sanity checking files for CUDA device code under folder {dirpath}") - if not cuda_dirs: - cuda_dirs = DEFAULT_BIN_LIB_SUBDIRS - self.log.info("Using default subdirectories for binaries/libraries to verify CUDA device code: %s", - cuda_dirs) - else: - self.log.info("Using configured subdirectories for binaries/libraries to verify CUDA device code: %s", - cuda_dirs) - - # Tracking some numbers for a summary report: - 
num_cuda_files = 0 - num_files_missing_cc = 0 - num_files_missing_cc_fails = 0 - num_files_missing_cc_ignored = 0 - num_files_surplus_cc = 0 - num_files_surplus_cc_fails = 0 - num_files_surplus_cc_ignored = 0 - num_files_missing_ptx = 0 - num_files_missing_ptx_fails = 0 - num_files_missing_ptx_ignored = 0 - num_files_missing_cc_but_has_ptx = 0 - - # Creating lists of files for summary report: - files_missing_cc = [] - files_missing_cc_fails = [] - files_missing_cc_ignored = [] - files_surplus_cc = [] - files_surplus_cc_fails = [] - files_surplus_cc_ignored = [] - files_missing_ptx = [] - files_missing_ptx_fails = [] - files_missing_ptx_ignored = [] - files_missing_cc_but_has_ptx = [] - - # Looping through all files to check CUDA device and PTX code - for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: - if os.path.exists(dirpath): - self.log.debug(f"Sanity checking files for CUDA device code under folder {dirpath}") - - for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: - self.log.debug("Sanity checking for CUDA device code in %s", path) - - res = get_cuda_device_code_architectures(path) - if res is None: - msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " - msg += "so skipping CUDA sanity check." 
- self.log.debug(msg) - else: - # Here, we check if CUDA device code is present for all compute capabilities in - # --cuda-compute-capabilities for the file pointed to by 'path' - # We also check for the presence of ptx code for the highest CUDA compute capability - # The following is considered fail/warning/success: - # - Missing device code is considered a failure (unless there is PTX code for - # a lower CC AND --accept-ptx-for-cc-support is True, in which case it is a warning) - # - Device code for additional compute capabilities is considered a failure if - # --strict-cuda-sanity-check is True (otherwise, it's a warning) - # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities - # is considered a failure, unless --accept-missing-cuda-ptx is True (in which case it is - # a warning) - num_cuda_files += 1 - # unpack results - derived_ccs = res.device_code_archs - derived_ptx_ccs = res.ptx_archs - - # check whether device code architectures match cuda_compute_capabilities - additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) - missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) - - # Message for when file is on the ignore list: - ignore_msg = f"This failure will be ignored as '{path}' is listed in " - ignore_msg += "'ignore_cuda_sanity_failures'." - - if additional_ccs or missing_ccs: - fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " - if additional_ccs: - # Count and log for summary report - files_surplus_cc.append(path) - num_files_surplus_cc += 1 - surplus_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) - fail_msg += "Surplus compute capabilities: %s. 
" % surplus_cc_str - if strict_cc_check: # Surplus compute capabilities not allowed - if path in ignore_file_list or ignore_failures: - files_surplus_cc_ignored.append(path) - num_files_surplus_cc_ignored += 1 - fail_msg += ignore_msg - is_failure = False - else: - files_surplus_cc_fails.append(path) - num_files_surplus_cc_fails += 1 - is_failure = True + for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: + self.log.debug("Sanity checking for CUDA device code in %s", path) + + res = get_cuda_device_code_architectures(path) + if res is None: + msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " + msg += "so skipping CUDA sanity check." + self.log.debug(msg) + else: + # Here, we check if CUDA device code is present for all compute capabilities in + # --cuda-compute-capabilities for the file pointed to by 'path' + # We also check for the presence of ptx code for the highest CUDA compute capability + # The following is considered fail/warning/success: + # - Missing device code is considered a failure (unless there is PTX code for + # a lower CC AND --accept-ptx-for-cc-support is True, in which case it is a warning) + # - Device code for additional compute capabilities is considered a failure if + # --strict-cuda-sanity-check is True (otherwise, it's a warning) + # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities + # is considered a failure, unless --accept-missing-cuda-ptx is True (in which case it is + # a warning) + num_cuda_files += 1 + # unpack results + derived_ccs = res.device_code_archs + derived_ptx_ccs = res.ptx_archs + + # check whether device code architectures match cuda_compute_capabilities + additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) + missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) + + # Message for when file is on the ignore list: + ignore_msg = f"This failure will be ignored as '{path}' is listed in " + ignore_msg += 
"'ignore_cuda_sanity_failures'." + + if additional_ccs or missing_ccs: + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " + if additional_ccs: + # Count and log for summary report + files_surplus_cc.append(path) + surplus_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) + fail_msg += "Surplus compute capabilities: %s. " % surplus_cc_str + if strict_cc_check: # Surplus compute capabilities not allowed + if path in ignore_file_list or ignore_failures: + files_surplus_cc_ignored.append(path) + fail_msg += ignore_msg + is_failure = False else: + files_surplus_cc_fails.append(path) + is_failure = True + else: + is_failure = False + + if missing_ccs: + # Count and log for summary report + missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) + fail_msg += "Missing compute capabilities: %s. " % missing_cc_str + # If accept_ptx_as_cc, this might not be a failure _if_ there is suitable PTX + # code to JIT compile from that supports the CCs in missing_ccs + if accept_ptx_as_cc: + # Check that for each item in missing_ccs there is PTX code for lower or equal + # CUDA compute capability + comparisons = [] + for cc in missing_ccs: + has_smaller_equal = any( + LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in derived_ptx_ccs + ) + comparisons.append(has_smaller_equal) + # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a + # failure + if all(comparisons): + files_missing_cc_but_has_ptx.append(path) is_failure = False - - if missing_ccs: - # Count and log for summary report - missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) - fail_msg += "Missing compute capabilities: %s. 
" % missing_cc_str - # If accept_ptx_as_cc, this might not be a failure _if_ there is suitable PTX - # code to JIT compile from that supports the CCs in missing_ccs - if accept_ptx_as_cc: - # Check that for each item in missing_ccs there is PTX code for lower or equal - # CUDA compute capability - comparisons = [] - for cc in missing_ccs: - has_smaller_equal = any( - LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in derived_ptx_ccs - ) - comparisons.append(has_smaller_equal) - # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a - # failure - if all(comparisons): - files_missing_cc_but_has_ptx.append(path) - num_files_missing_cc_but_has_ptx += 1 - is_failure = False - else: - files_missing_cc.append(path) - num_files_missing_cc += 1 - if path in ignore_file_list or ignore_failures: - files_missing_cc_ignored.append(path) - num_files_missing_cc_ignored += 1 - fail_msg += ignore_msg - is_failure = False - else: - files_missing_cc_fails.append(path) - num_files_missing_cc_fails += 1 - is_failure = True else: files_missing_cc.append(path) - num_files_missing_cc += 1 if path in ignore_file_list or ignore_failures: files_missing_cc_ignored.append(path) - num_files_missing_cc_ignored += 1 fail_msg += ignore_msg is_failure = False else: files_missing_cc_fails.append(path) - num_files_missing_cc_fails += 1 is_failure = True - - # If considered a failure, append to fails so that a sanity error will be thrown - # Otherwise, log a warning - if is_failure: - fails.append(fail_msg) else: - self.log.warning(fail_msg) + files_missing_cc.append(path) + num_files_missing_cc += 1 + if path in ignore_file_list or ignore_failures: + files_missing_cc_ignored.append(path) + fail_msg += ignore_msg + is_failure = False + else: + files_missing_cc_fails.append(path) + is_failure = True + + # If considered a failure, append to fails so that a sanity error will be thrown + # Otherwise, log a warning + if is_failure: + fail_msgs.append(fail_msg) else: - msg = 
(f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " - "those in cuda_compute_capabilities") - self.log.debug(msg) - - # Check whether there is ptx code for the highest CC in cfg_ccs - # Make sure to use LooseVersion so that e.g. 9.0 < 9.0a < 9.2 < 9.10 - highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] - missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) - - if missing_ptx_ccs: - files_missing_ptx.append(path) - num_files_missing_ptx += 1 - fail_msg = "Configured highest compute capability was '%s', " - fail_msg += "but no PTX code for this compute capability was found in '%s' " - fail_msg += "(PTX architectures supported in that file: %s). " - if path in ignore_file_list or ignore_failures: - files_missing_ptx_ignored.append(path) - num_files_missing_ptx_ignored += 1 - fail_msg += ignore_msg - self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) - elif accept_missing_ptx: - self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) - else: - files_missing_ptx_fails.append(path) - num_files_missing_ptx_fails += 1 - fails.append(fail_msg % (highest_cc[0], path, derived_ptx_ccs)) + self.log.warning(fail_msg) + else: + msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " + "those in cuda_compute_capabilities") + self.log.debug(msg) + + # Check whether there is ptx code for the highest CC in cfg_ccs + # Make sure to use LooseVersion so that e.g. 9.0 < 9.0a < 9.2 < 9.10 + highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] + missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) + + if missing_ptx_ccs: + files_missing_ptx.append(path) + num_files_missing_ptx += 1 + fail_msg = "Configured highest compute capability was '%s', " + fail_msg += "but no PTX code for this compute capability was found in '%s' " + fail_msg += "(PTX architectures supported in that file: %s). 
" + if path in ignore_file_list or ignore_failures: + files_missing_ptx_ignored.append(path) + fail_msg += ignore_msg + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) + elif accept_missing_ptx: + self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: - msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " - "least) the highest CUDA compute capability in cuda_compute_capabilities") - self.log.debug(msg) - else: - self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") - - # Summary - summary_msg = "CUDA sanity check summary report:\n" - summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" - summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {num_files_missing_cc} " - summary_msg += f"(ignored: {num_files_missing_cc_ignored}, fails: {num_files_missing_cc_fails})\n" - if accept_ptx_as_cc: - summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but has suitable " - summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " - summary_msg += f"{num_files_missing_cc_but_has_ptx}\n" - summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " - if strict_cc_check: - summary_msg += f"{num_files_surplus_cc} (ignored: {num_files_surplus_cc_ignored}, fails: " - summary_msg += f"{num_files_surplus_cc_fails})\n" - else: - summary_msg += f"{num_files_surplus_cc} (not running with --strict-cuda-sanity-check, so not " - summary_msg += "considered failures)\n" - summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " - if accept_missing_ptx: - summary_msg += f"{num_files_missing_ptx} (running with --accept-missing-cuda-ptx so not considered " - summary_msg += "failures)\n" + files_missing_ptx_fails.append(path) + fail_msgs.append(fail_msg % (highest_cc[0], path, derived_ptx_ccs)) + else: + msg = 
(f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " + "least) the highest CUDA compute capability in cuda_compute_capabilities") + self.log.debug(msg) else: - summary_msg += f"{num_files_missing_ptx} (ignored: {num_files_missing_ptx_ignored}, fails: " - summary_msg += f"{num_files_missing_ptx_fails})\n" - if not build_option('debug'): - summary_msg += "Rerun with --debug to see a detailed list of files.\n" - # Give some advice - if num_files_missing_cc > 0 and not accept_ptx_as_cc: - summary_msg += "\nYou may consider rerunning with --accept-ptx-as-cc-support to accept binaries that " - summary_msg += "don't contain the device code for your requested CUDA Compute Capabilities, but that " - summary_msg += "do have PTX code that can be compiled for your requested CUDA Compute " - summary_msg += "Capabilities. Note that this may increase startup delay due to JIT compilation " - summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " - summary_msg += "all features specific to your hardware architecture.\n" - if num_files_surplus_cc > 0 and strict_cc_check: - summary_msg += "\nYou may consider running with --disable-strict-cuda-sanity-check. This means you'll " - summary_msg += "accept that some binaries may have CUDA Device Code for more architectures than you " - summary_msg += "requested, i.e. the binary is 'fatter' than you need. Bigger binaries may generally " - summary_msg += "cause some startup delay, and code path selection could introduce a small overhead, " - summary_msg += "though this is generally negligible.\n" - if num_files_missing_ptx > 0 and not accept_missing_ptx: - summary_msg += "\nYou may consider running with --accept-missing-cuda-ptx to accept binaries that " - summary_msg += "don't contain PTX code for the highest CUDA Compute Capability you requested. This " - summary_msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. 
your compiled " - summary_msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " - summary_msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you.\n" - self.log.info(summary_msg) - - summary_msg_debug = "Detailed CUDA sanity check summary report:\n" - summary_msg_debug += f"Files missing one or more CUDA compute capabilities: {files_missing_cc}\n" - summary_msg_debug += f"These failures are ignored for: {files_missing_cc_ignored})\n" - if accept_ptx_as_cc: - summary_msg_debug += "Files missing one or more CUDA Compute Capabilities, but has suitable PTX " - summary_msg_debug += "code that can be JIT compiled for the requested CUDA Compute Capabilities: " - summary_msg_debug += f"{files_missing_cc_but_has_ptx}\n" - summary_msg_debug += "Files with device code for more CUDA Compute Capabilities than requested: " - summary_msg_debug += f"{files_surplus_cc}\n" - summary_msg_debug += f"These failures are ignored for: {files_surplus_cc_ignored})\n" - summary_msg_debug += "Files missing PTX code for the highest configured CUDA Compute Capability: " - summary_msg_debug += f"{files_missing_ptx}\n" - summary_msg_debug += f"These failures are ignored for: {files_missing_ptx_ignored})" - self.log.debug(summary_msg_debug) + self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") - return fails + # Summary + summary_msg = "CUDA sanity check summary report:\n" + summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" + summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_cc)} " + summary_msg += f"(ignored: {len(files_missing_cc_ignored)}, fails: {len(files_missing_cc_fails)})\n" + if accept_ptx_as_cc: + summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but has suitable " + summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " + summary_msg += 
f"{len(files_missing_cc_but_has_ptx)}\n" + summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " + if strict_cc_check: + summary_msg += f"{len(files_surplus_cc)} (ignored: {len(num_files_surplus_cc_ignored)}, fails: " + summary_msg += f"{len(files_surplus_cc_fails)})\n" + else: + summary_msg += f"{len(files_surplus_cc)} (not running with --strict-cuda-sanity-check, so not " + summary_msg += "considered failures)\n" + summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " + if accept_missing_ptx: + summary_msg += f"{len(files_missing_ptx)} (running with --accept-missing-cuda-ptx so not considered " + summary_msg += "failures)\n" + else: + summary_msg += f"{len(files_missing_ptx)} (ignored: {len(files_missing_ptx_ignored)}, fails: " + summary_msg += f"{len(files_missing_ptx_fails)})\n" + if not build_option('debug'): + summary_msg += "Rerun with --debug to see a detailed list of files.\n" + # Give some advice + if num_files_missing_cc > 0 and not accept_ptx_as_cc: + summary_msg += "\nYou may consider rerunning with --accept-ptx-as-cc-support to accept binaries that " + summary_msg += "don't contain the device code for your requested CUDA Compute Capabilities, but that " + summary_msg += "do have PTX code that can be compiled for your requested CUDA Compute " + summary_msg += "Capabilities. Note that this may increase startup delay due to JIT compilation " + summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " + summary_msg += "all features specific to your hardware architecture.\n" + if num_files_surplus_cc > 0 and strict_cc_check: + summary_msg += "\nYou may consider running with --disable-strict-cuda-sanity-check. This means you'll " + summary_msg += "accept that some binaries may have CUDA Device Code for more architectures than you " + summary_msg += "requested, i.e. the binary is 'fatter' than you need. 
Bigger binaries may generally " + summary_msg += "cause some startup delay, and code path selection could introduce a small overhead, " + summary_msg += "though this is generally negligible.\n" + if num_files_missing_ptx > 0 and not accept_missing_ptx: + summary_msg += "\nYou may consider running with --accept-missing-cuda-ptx to accept binaries that " + summary_msg += "don't contain PTX code for the highest CUDA Compute Capability you requested. This " + summary_msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. your compiled " + summary_msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " + summary_msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you.\n" + self.log.info(summary_msg) + + summary_msg_debug = "Detailed CUDA sanity check summary report:\n" + summary_msg_debug += f"Files missing one or more CUDA compute capabilities: {files_missing_cc}\n" + summary_msg_debug += f"These failures are ignored for: {files_missing_cc_ignored})\n" + if accept_ptx_as_cc: + summary_msg_debug += "Files missing one or more CUDA Compute Capabilities, but has suitable PTX " + summary_msg_debug += "code that can be JIT compiled for the requested CUDA Compute Capabilities: " + summary_msg_debug += f"{files_missing_cc_but_has_ptx}\n" + summary_msg_debug += "Files with device code for more CUDA Compute Capabilities than requested: " + summary_msg_debug += f"{files_surplus_cc}\n" + summary_msg_debug += f"These failures are ignored for: {files_surplus_cc_ignored})\n" + summary_msg_debug += "Files missing PTX code for the highest configured CUDA Compute Capability: " + summary_msg_debug += f"{files_missing_ptx}\n" + summary_msg_debug += f"These failures are ignored for: {files_missing_ptx_ignored})" + self.log.debug(summary_msg_debug) + + return fail_msgs def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True): """Sanity check binaries/libraries w.r.t. 
RPATH linking.""" From 31dc541d508597d8f229252913b9f48cc4a8a043 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Thu, 10 Apr 2025 14:32:06 +0200 Subject: [PATCH 042/114] Update easybuild/framework/easyblock.py Fix grammar Co-authored-by: Jasper Grimm <65227842+jfgrimm@users.noreply.github.com> --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 5e5698d89b..2424485c6c 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3370,7 +3370,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # If there are no CUDA compute capabilities defined, return if cfg_ccs is None or len(cfg_ccs) == 0: - self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities where configured") + self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities were configured") return fail_msgs if cuda_dirs is None: From 9266344376c96ca3e0fd429febe7cb8aa08b2e30 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 14:34:00 +0200 Subject: [PATCH 043/114] Removed some forgotten num_files_X and replaced with len(files_X) --- easybuild/framework/easyblock.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 2424485c6c..9f8bb40a5a 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3372,7 +3372,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if cfg_ccs is None or len(cfg_ccs) == 0: self.log.info("Skipping CUDA sanity check, as no CUDA compute capabilities were configured") return fail_msgs - + if cuda_dirs is None: cuda_dirs = self.cfg['bin_lib_subdirs'] or self.bin_lib_subdirs() @@ -3486,7 +3486,6 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): 
is_failure = True else: files_missing_cc.append(path) - num_files_missing_cc += 1 if path in ignore_file_list or ignore_failures: files_missing_cc_ignored.append(path) fail_msg += ignore_msg @@ -3513,7 +3512,6 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if missing_ptx_ccs: files_missing_ptx.append(path) - num_files_missing_ptx += 1 fail_msg = "Configured highest compute capability was '%s', " fail_msg += "but no PTX code for this compute capability was found in '%s' " fail_msg += "(PTX architectures supported in that file: %s). " @@ -3544,7 +3542,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"{len(files_missing_cc_but_has_ptx)}\n" summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " if strict_cc_check: - summary_msg += f"{len(files_surplus_cc)} (ignored: {len(num_files_surplus_cc_ignored)}, fails: " + summary_msg += f"{len(files_surplus_cc)} (ignored: {len(files_surplus_cc_ignored)}, fails: " summary_msg += f"{len(files_surplus_cc_fails)})\n" else: summary_msg += f"{len(files_surplus_cc)} (not running with --strict-cuda-sanity-check, so not " @@ -3559,20 +3557,20 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if not build_option('debug'): summary_msg += "Rerun with --debug to see a detailed list of files.\n" # Give some advice - if num_files_missing_cc > 0 and not accept_ptx_as_cc: + if len(files_missing_cc) > 0 and not accept_ptx_as_cc: summary_msg += "\nYou may consider rerunning with --accept-ptx-as-cc-support to accept binaries that " summary_msg += "don't contain the device code for your requested CUDA Compute Capabilities, but that " summary_msg += "do have PTX code that can be compiled for your requested CUDA Compute " summary_msg += "Capabilities. 
Note that this may increase startup delay due to JIT compilation " summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " summary_msg += "all features specific to your hardware architecture.\n" - if num_files_surplus_cc > 0 and strict_cc_check: + if len(files_surplus_cc) > 0 and strict_cc_check: summary_msg += "\nYou may consider running with --disable-strict-cuda-sanity-check. This means you'll " summary_msg += "accept that some binaries may have CUDA Device Code for more architectures than you " summary_msg += "requested, i.e. the binary is 'fatter' than you need. Bigger binaries may generally " summary_msg += "cause some startup delay, and code path selection could introduce a small overhead, " summary_msg += "though this is generally negligible.\n" - if num_files_missing_ptx > 0 and not accept_missing_ptx: + if len(files_missing_ptx) > 0 and not accept_missing_ptx: summary_msg += "\nYou may consider running with --accept-missing-cuda-ptx to accept binaries that " summary_msg += "don't contain PTX code for the highest CUDA Compute Capability you requested. This " summary_msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. 
your compiled " From 2a03e2eb01152a5301f9896c99a66da73a564f1d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 15:03:17 +0200 Subject: [PATCH 044/114] Change option names --- easybuild/framework/easyblock.py | 36 +++++++++++------------ easybuild/tools/options.py | 49 ++++++++++++++++++-------------- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 9f8bb40a5a..6bb54cf3aa 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3359,10 +3359,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): fail_msgs = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) - ignore_failures = build_option('ignore_cuda_sanity_failures') - strict_cc_check = build_option('strict_cuda_sanity_check') - accept_ptx_as_cc = build_option('accept_ptx_as_cc_support') - accept_missing_ptx = build_option('accept_missing_cuda_ptx') + ignore_failures = not build_option('cuda_sanity_check_error_on_fail') + strict_cc_check = build_option('cuda_sanity_check_strict') + accept_ptx_as_devcode = build_option('cuda_sanity_check_accept_ptx_as_devcode') + accept_missing_ptx = build_option('cuda_sanity_check_accept_missing_ptx') # Construct the list of files to ignore as full paths (cuda_sanity_ignore_files contains the paths # to ignore, relative to the installation prefix) @@ -3420,9 +3420,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # - Missing device code is considered a failure (unless there is PTX code for # a lower CC AND --accept-ptx-for-cc-support is True, in which case it is a warning) # - Device code for additional compute capabilities is considered a failure if - # --strict-cuda-sanity-check is True (otherwise, it's a warning) + # --cuda-sanity-check-strict is True (otherwise, it's a warning) # - Missing PTX code for the highest CUDA compute capability in 
--cuda-compute-capabilities - # is considered a failure, unless --accept-missing-cuda-ptx is True (in which case it is + # is considered a failure, unless --cuda-sanity-check-accept-missing-ptx is True (in which case it is # a warning) num_cuda_files += 1 # unpack results @@ -3435,7 +3435,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Message for when file is on the ignore list: ignore_msg = f"This failure will be ignored as '{path}' is listed in " - ignore_msg += "'ignore_cuda_sanity_failures'." + ignore_msg += "'cuda_sanity_ignore_files'." if additional_ccs or missing_ccs: fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " @@ -3445,7 +3445,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): surplus_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) fail_msg += "Surplus compute capabilities: %s. " % surplus_cc_str if strict_cc_check: # Surplus compute capabilities not allowed - if path in ignore_file_list or ignore_failures: + if path in ignore_file_list or ignore_failrues: files_surplus_cc_ignored.append(path) fail_msg += ignore_msg is_failure = False @@ -3459,9 +3459,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Count and log for summary report missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) fail_msg += "Missing compute capabilities: %s. 
" % missing_cc_str - # If accept_ptx_as_cc, this might not be a failure _if_ there is suitable PTX + # If accept_ptx_as_devcode, this might not be a failure _if_ there is suitable PTX # code to JIT compile from that supports the CCs in missing_ccs - if accept_ptx_as_cc: + if accept_ptx_as_devcode: # Check that for each item in missing_ccs there is PTX code for lower or equal # CUDA compute capability comparisons = [] @@ -3536,7 +3536,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_cc)} " summary_msg += f"(ignored: {len(files_missing_cc_ignored)}, fails: {len(files_missing_cc_fails)})\n" - if accept_ptx_as_cc: + if accept_ptx_as_devcode: summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but has suitable " summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg += f"{len(files_missing_cc_but_has_ptx)}\n" @@ -3545,11 +3545,11 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"{len(files_surplus_cc)} (ignored: {len(files_surplus_cc_ignored)}, fails: " summary_msg += f"{len(files_surplus_cc_fails)})\n" else: - summary_msg += f"{len(files_surplus_cc)} (not running with --strict-cuda-sanity-check, so not " + summary_msg += f"{len(files_surplus_cc)} (not running with --cuda-sanity-check-strict, so not " summary_msg += "considered failures)\n" summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " if accept_missing_ptx: - summary_msg += f"{len(files_missing_ptx)} (running with --accept-missing-cuda-ptx so not considered " + summary_msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx so not considered " summary_msg += "failures)\n" else: summary_msg += f"{len(files_missing_ptx)} (ignored: 
{len(files_missing_ptx_ignored)}, fails: " @@ -3557,21 +3557,21 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if not build_option('debug'): summary_msg += "Rerun with --debug to see a detailed list of files.\n" # Give some advice - if len(files_missing_cc) > 0 and not accept_ptx_as_cc: - summary_msg += "\nYou may consider rerunning with --accept-ptx-as-cc-support to accept binaries that " + if len(files_missing_cc) > 0 and not accept_ptx_as_devcode: + summary_msg += "\nYou may consider rerunning with --cuda-sanity-check-accept-ptx-as-devcode to accept binaries that " summary_msg += "don't contain the device code for your requested CUDA Compute Capabilities, but that " summary_msg += "do have PTX code that can be compiled for your requested CUDA Compute " summary_msg += "Capabilities. Note that this may increase startup delay due to JIT compilation " summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " summary_msg += "all features specific to your hardware architecture.\n" if len(files_surplus_cc) > 0 and strict_cc_check: - summary_msg += "\nYou may consider running with --disable-strict-cuda-sanity-check. This means you'll " + summary_msg += "\nYou may consider running with --disable-cuda-sanity-check-strict. This means you'll " summary_msg += "accept that some binaries may have CUDA Device Code for more architectures than you " summary_msg += "requested, i.e. the binary is 'fatter' than you need. 
Bigger binaries may generally " summary_msg += "cause some startup delay, and code path selection could introduce a small overhead, " summary_msg += "though this is generally negligible.\n" if len(files_missing_ptx) > 0 and not accept_missing_ptx: - summary_msg += "\nYou may consider running with --accept-missing-cuda-ptx to accept binaries that " + summary_msg += "\nYou may consider running with --cuda-sanity-check-accept-missing-ptx to accept binaries that " summary_msg += "don't contain PTX code for the highest CUDA Compute Capability you requested. This " summary_msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. your compiled " summary_msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " @@ -3581,7 +3581,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_debug = "Detailed CUDA sanity check summary report:\n" summary_msg_debug += f"Files missing one or more CUDA compute capabilities: {files_missing_cc}\n" summary_msg_debug += f"These failures are ignored for: {files_missing_cc_ignored})\n" - if accept_ptx_as_cc: + if accept_ptx_as_devcode: summary_msg_debug += "Files missing one or more CUDA Compute Capabilities, but has suitable PTX " summary_msg_debug += "code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg_debug += f"{files_missing_cc_but_has_ptx}\n" diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index 19326b41e0..0c98216585 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -405,6 +405,33 @@ def override_options(self): "--accept-ptx-for-cc-support or --accept-missing-ptx, or made more " "stringent using --strict-cuda-sanity-check.", 'strlist', 'extend', None), + 'cuda-sanity-check-accept-missing-ptx': ("CUDA sanity check also passes if PTX code for the highest " + "requested CUDA compute capability is not present (but will " + "print a warning)", + None, 'store_true', False), + 
'cuda-sanity-check-accept-ptx-as-devcode': ("CUDA sanity check also passes if requested device code is " + "not present, as long as PTX code is present that can be " + "JIT-compiled for each target in --cuda-compute-capabilities " + "E.g. if --cuda-compute-capabilities=8.0 and a binary is " + "found in the installation that does not have device code for " + "8.0, but it does have PTX code for 7.0, the sanity check " + "will pass if, and only if, this option is True. " + "Note that JIT-compiling means the binary will work on the " + "requested architecture, but is it not necessarily as well " + "optimized as when actual device code is present for the " + "requested architecture ", + None, 'store_true', False), + 'cuda-sanity-check-error-on-fail': ("If True, failures in the CUDA sanity check will produce an error. " + "If False, the CUDA sanity check will be performed, and failures will " + "be reported, but they will not result in an error", + None, 'store_true', False), + 'cuda-sanity-check-strict': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity " + "check will fail if the CUDA binaries don't contain code for (at least) " + "all compute capabilities defined in --cude-compute-capabilities, but will " + "accept if code for additional compute capabilities is present. 
" + "With this setting, the sanity check will also fail if code is present for " + "more compute capabilities than defined in --cuda-compute-capabilities.", + None, 'store_true', False), 'debug-lmod': ("Run Lmod modules tool commands in debug module", None, 'store_true', False), 'default-opt-level': ("Specify default optimisation level", 'choice', 'store', DEFAULT_OPT_LEVEL, Compiler.COMPILER_OPT_OPTIONS), @@ -553,28 +580,6 @@ def override_options(self): 'strict-rpath-sanity-check': ("Perform strict RPATH sanity check, which involves unsetting " "$LD_LIBRARY_PATH before checking whether all required libraries are found", None, 'store_true', False), - 'ignore-cuda-sanity-failures': ("The CUDA sanity check will be performed, and a report will be printed, " - "but any failures in the CUDA sanity check will be ignored", - None, 'store_true', True), - 'strict-cuda-sanity-check': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity " - "check will fail if the CUDA binaries don't contain code for (at least) " - "all compute capabilities defined in --cude-compute-capabilities, but will " - "accept if code for additional compute capabilities is present. " - "With this setting, the sanity check will also fail if code is present for " - "more compute capabilities than defined in --cuda-compute-capabilities.", - None, 'store_true', False), - 'accept-ptx-as-cc-support': ("CUDA sanity check also passes if requested device code is not present, as " - "long as a PTX code is present that can be JIT-compiled into the requestd " - "device code. E.g. if --cuda-compute-capabilities=8.0 and a binary is found " - "in the installation that does not have device code for 8.0, but does have " - "PTX code for 7.0, the sanity check will pass if, and only if, this option " - "is True. 
Note that JIT-compiling means the binary will work on the " - "requested architecture, but is it not necessarily as well optimized as when " - "actual device code is present for the requested architecture ", - None, 'store_true', False), - 'accept-missing-cuda-ptx': ("CUDA sanity check also passes if PTX code for the highest requested CUDA " - "compute capability is not present (but will print a warning)", - None, 'store_true', False), 'sysroot': ("Location root directory of system, prefix for standard paths like /usr/lib and /usr/include", None, 'store', None), 'trace': ("Provide more information in output to stdout on progress", None, 'store_true', True, 'T'), From 931cd8ca908306a5e12d496060f69bd3ec572449 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 15:07:02 +0200 Subject: [PATCH 045/114] Fix cuda-compute-capabilities description to be more specific that fat binaries aren't always possible --- easybuild/tools/options.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index 0c98216585..dc5c39a950 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -398,12 +398,13 @@ def override_options(self): int, 'store_or_None', None), 'cuda-compute-capabilities': ("List of CUDA compute capabilities to use when building GPU software; " "values should be specified as digits separated by a dot, " - "for example: 3.5,5.0,7.2. EasyBuild will compile a fat binaries with " - "support for (at least) all requested CUDA compute capabilities, and " - "PTX code for the highest CUDA compute capability (for forwards " - "compatibility). The check on this behavior may be relaxed using " - "--accept-ptx-for-cc-support or --accept-missing-ptx, or made more " - "stringent using --strict-cuda-sanity-check.", + "for example: 3.5,5.0,7.2. 
EasyBuild will (where possible) compile fat " + "binaries with support for (at least) all requested CUDA compute " + "capabilities, and PTX code for the highest CUDA compute capability (for " + "forwards compatibility). The check on this behavior may be relaxed using " + "--cuda-sanity-check-accept-missing-ptx, " + "--cuda-sanity-check-accept-ptx-as-devcode, " + "or made more stringent using --cuda-sanity-check-strict.", 'strlist', 'extend', None), 'cuda-sanity-check-accept-missing-ptx': ("CUDA sanity check also passes if PTX code for the highest " "requested CUDA compute capability is not present (but will " @@ -424,7 +425,7 @@ def override_options(self): 'cuda-sanity-check-error-on-fail': ("If True, failures in the CUDA sanity check will produce an error. " "If False, the CUDA sanity check will be performed, and failures will " "be reported, but they will not result in an error", - None, 'store_true', False), + None, 'store_true', False), 'cuda-sanity-check-strict': ("Perform strict CUDA sanity check. 
Without this option, the CUDA sanity " "check will fail if the CUDA binaries don't contain code for (at least) " "all compute capabilities defined in --cude-compute-capabilities, but will " From a466f369c4c520850b7a64ee7de38402f8a119b7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 15:32:56 +0200 Subject: [PATCH 046/114] Various changes from code review --- easybuild/framework/easyblock.py | 168 ++++++++++++++++--------------- 1 file changed, 85 insertions(+), 83 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 6bb54cf3aa..18424d4f2c 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3388,16 +3388,16 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): num_cuda_files = 0 # Creating lists of files for summary report: - files_missing_cc = [] - files_missing_cc_fails = [] - files_missing_cc_ignored = [] - files_surplus_cc = [] - files_surplus_cc_fails = [] - files_surplus_cc_ignored = [] + files_missing_devcode = [] + files_missing_devcode_fails = [] + files_missing_devcode_ignored = [] + files_additional_cc = [] + files_additional_cc_fails = [] + files_additional_cc_ignored = [] files_missing_ptx = [] files_missing_ptx_fails = [] files_missing_ptx_ignored = [] - files_missing_cc_but_has_ptx = [] + files_missing_devcode_but_has_ptx = [] # Looping through all files to check CUDA device and PTX code for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: @@ -3437,74 +3437,73 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): ignore_msg = f"This failure will be ignored as '{path}' is listed in " ignore_msg += "'cuda_sanity_ignore_files'." - if additional_ccs or missing_ccs: + if additional_ccs: fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. 
" - if additional_ccs: - # Count and log for summary report - files_surplus_cc.append(path) - surplus_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) - fail_msg += "Surplus compute capabilities: %s. " % surplus_cc_str - if strict_cc_check: # Surplus compute capabilities not allowed - if path in ignore_file_list or ignore_failrues: - files_surplus_cc_ignored.append(path) - fail_msg += ignore_msg - is_failure = False - else: - files_surplus_cc_fails.append(path) - is_failure = True + # Count and log for summary report + files_additional_cc.append(path) + additional_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) + fail_msg += "Surplus compute capabilities: %s. " % additional_cc_str + if strict_cc_check: # Surplus compute capabilities not allowed + if path in ignore_file_list or ignore_failrues: + files_additional_cc_ignored.append(path) + fail_msg += ignore_msg + is_failure = False else: + files_additional_cc_fails.append(path) + is_failure = True + else: + is_failure = False + elif missing_ccs: + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " + # Count and log for summary report + missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) + fail_msg += "Missing compute capabilities: %s. 
" % missing_cc_str + # If accept_ptx_as_devcode, this might not be a failure _if_ there is suitable PTX + # code to JIT compile from that supports the CCs in missing_ccs + if accept_ptx_as_devcode: + # Check that for each item in missing_ccs there is PTX code for lower or equal + # CUDA compute capability + comparisons = [] + for cc in missing_ccs: + has_smaller_equal_ptx = any( + LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in derived_ptx_ccs + ) + comparisons.append(has_smaller_equal) + # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a + # failure + if all(comparisons): + files_missing_devcode_but_has_ptx.append(path) is_failure = False - - if missing_ccs: - # Count and log for summary report - missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) - fail_msg += "Missing compute capabilities: %s. " % missing_cc_str - # If accept_ptx_as_devcode, this might not be a failure _if_ there is suitable PTX - # code to JIT compile from that supports the CCs in missing_ccs - if accept_ptx_as_devcode: - # Check that for each item in missing_ccs there is PTX code for lower or equal - # CUDA compute capability - comparisons = [] - for cc in missing_ccs: - has_smaller_equal = any( - LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in derived_ptx_ccs - ) - comparisons.append(has_smaller_equal) - # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a - # failure - if all(comparisons): - files_missing_cc_but_has_ptx.append(path) - is_failure = False - else: - files_missing_cc.append(path) - if path in ignore_file_list or ignore_failures: - files_missing_cc_ignored.append(path) - fail_msg += ignore_msg - is_failure = False - else: - files_missing_cc_fails.append(path) - is_failure = True else: - files_missing_cc.append(path) + files_missing_devcode.append(path) if path in ignore_file_list or ignore_failures: - files_missing_cc_ignored.append(path) + files_missing_devcode_ignored.append(path) fail_msg += 
ignore_msg is_failure = False else: - files_missing_cc_fails.append(path) + files_missing_devcode_fails.append(path) is_failure = True - - # If considered a failure, append to fails so that a sanity error will be thrown - # Otherwise, log a warning - if is_failure: - fail_msgs.append(fail_msg) else: - self.log.warning(fail_msg) + files_missing_devcode.append(path) + if path in ignore_file_list or ignore_failures: + files_missing_devcode_ignored.append(path) + fail_msg += ignore_msg + is_failure = False + else: + files_missing_devcode_fails.append(path) + is_failure = True else: msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " "those in cuda_compute_capabilities") self.log.debug(msg) + # If considered a failure, append to fails so that a sanity error will be thrown + # Otherwise, log a warning + if is_failure: + fail_msgs.append(fail_msg) + else: + self.log.warning(fail_msg) + # Check whether there is ptx code for the highest CC in cfg_ccs # Make sure to use LooseVersion so that e.g. 
9.0 < 9.0a < 9.2 < 9.10 highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] @@ -3531,21 +3530,39 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): else: self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") + summary_msg_files = f"{len(files_missing_devcode}) files missing one or more CUDA compute capabilities: " + summary_msg_files += f"{files_missing_devcode}\n" + summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files: " + summary_msg_files += "{files_missing_devcode_ignored})\n" + if accept_ptx_as_devcode: + summary_msg_files += f"{len(files_missing_devcode_but_has_ptx)} files missing one or more CUDA Compute " + summary_msg_files += "Capabilities, but has suitable PTX code that can be JIT compiled for the requested " + summary_msg_files += f"CUDA Compute Capabilities: {files_missing_devcode_but_has_ptx}\n" + summary_msg_files += "{len(files_additional_cc)} files with device code for more CUDA Compute Capabilities " + summary_msg_files += f"than requested: {files_additional_cc}\n" + summary_msg_files += f"These failures are ignored for {len(files_additional_cc_ignored)} files: " + summary_msg_files += f"{files_additional_cc_ignored})\n" + summary_msg_files += f"{len(files_missing_ptx} files missing PTX code for the highest configured CUDA Compute " + summary_msg_files += f"Capability: {files_missing_ptx}\n" + summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files: " + summary_msg_files += f"{files_missing_ptx_ignored})" + self.log.info(summary_msg_files) + # Summary summary_msg = "CUDA sanity check summary report:\n" summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" - summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_cc)} " - summary_msg += f"(ignored: {len(files_missing_cc_ignored)}, fails: {len(files_missing_cc_fails)})\n" + summary_msg += f"Number of files missing one 
or more CUDA Compute Capabilities: {len(files_missing_devcode)} " + summary_msg += f"(ignored: {len(files_missing_devcode_ignored)}, fails: {len(files_missing_devcode_fails)})\n" if accept_ptx_as_devcode: summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but has suitable " summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " - summary_msg += f"{len(files_missing_cc_but_has_ptx)}\n" + summary_msg += f"{len(files_missing_devcode_but_has_ptx)}\n" summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " if strict_cc_check: - summary_msg += f"{len(files_surplus_cc)} (ignored: {len(files_surplus_cc_ignored)}, fails: " - summary_msg += f"{len(files_surplus_cc_fails)})\n" + summary_msg += f"{len(files_additional_cc)} (ignored: {len(files_surplus_cc_ignored)}, fails: " + summary_msg += f"{len(files_additional_cc_fails)})\n" else: - summary_msg += f"{len(files_surplus_cc)} (not running with --cuda-sanity-check-strict, so not " + summary_msg += f"{len(files_additional_cc)} (not running with --cuda-sanity-check-strict, so not " summary_msg += "considered failures)\n" summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " if accept_missing_ptx: @@ -3557,14 +3574,14 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if not build_option('debug'): summary_msg += "Rerun with --debug to see a detailed list of files.\n" # Give some advice - if len(files_missing_cc) > 0 and not accept_ptx_as_devcode: + if len(files_missing_devcode) > 0 and not accept_ptx_as_devcode: summary_msg += "\nYou may consider rerunning with --cuda-sanity-check-accept-ptx-as-devcode to accept binaries that " summary_msg += "don't contain the device code for your requested CUDA Compute Capabilities, but that " summary_msg += "do have PTX code that can be compiled for your requested CUDA Compute " summary_msg += "Capabilities. 
Note that this may increase startup delay due to JIT compilation " summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " summary_msg += "all features specific to your hardware architecture.\n" - if len(files_surplus_cc) > 0 and strict_cc_check: + if len(files_additional_cc) > 0 and strict_cc_check: summary_msg += "\nYou may consider running with --disable-cuda-sanity-check-strict. This means you'll " summary_msg += "accept that some binaries may have CUDA Device Code for more architectures than you " summary_msg += "requested, i.e. the binary is 'fatter' than you need. Bigger binaries may generally " @@ -3578,21 +3595,6 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you.\n" self.log.info(summary_msg) - summary_msg_debug = "Detailed CUDA sanity check summary report:\n" - summary_msg_debug += f"Files missing one or more CUDA compute capabilities: {files_missing_cc}\n" - summary_msg_debug += f"These failures are ignored for: {files_missing_cc_ignored})\n" - if accept_ptx_as_devcode: - summary_msg_debug += "Files missing one or more CUDA Compute Capabilities, but has suitable PTX " - summary_msg_debug += "code that can be JIT compiled for the requested CUDA Compute Capabilities: " - summary_msg_debug += f"{files_missing_cc_but_has_ptx}\n" - summary_msg_debug += "Files with device code for more CUDA Compute Capabilities than requested: " - summary_msg_debug += f"{files_surplus_cc}\n" - summary_msg_debug += f"These failures are ignored for: {files_surplus_cc_ignored})\n" - summary_msg_debug += "Files missing PTX code for the highest configured CUDA Compute Capability: " - summary_msg_debug += f"{files_missing_ptx}\n" - summary_msg_debug += f"These failures are ignored for: {files_missing_ptx_ignored})" - self.log.debug(summary_msg_debug) - return fail_msgs def sanity_check_rpath(self, rpath_dirs=None, 
check_readelf_rpath=True): From 1bbff1bd7c63220d8bb5b975389b8e14710fa8ec Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 15:33:51 +0200 Subject: [PATCH 047/114] Replaced more occurrences of cc by devcode --- easybuild/framework/easyblock.py | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 18424d4f2c..dd6fdf32a3 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3391,9 +3391,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): files_missing_devcode = [] files_missing_devcode_fails = [] files_missing_devcode_ignored = [] - files_additional_cc = [] - files_additional_cc_fails = [] - files_additional_cc_ignored = [] + files_additional_devcode = [] + files_additional_devcode_fails = [] + files_additional_devcode_ignored = [] files_missing_ptx = [] files_missing_ptx_fails = [] files_missing_ptx_ignored = [] @@ -3430,26 +3430,26 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): derived_ptx_ccs = res.ptx_archs # check whether device code architectures match cuda_compute_capabilities - additional_ccs = list(set(derived_ccs) - set(cfg_ccs)) + additional_devcodes = list(set(derived_ccs) - set(cfg_ccs)) missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) # Message for when file is on the ignore list: ignore_msg = f"This failure will be ignored as '{path}' is listed in " ignore_msg += "'cuda_sanity_ignore_files'." - if additional_ccs: + if additional_devcodes: fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " # Count and log for summary report - files_additional_cc.append(path) - additional_cc_str = ', '.join(sorted(additional_ccs, key=LooseVersion)) - fail_msg += "Surplus compute capabilities: %s. 
" % additional_cc_str + files_additional_devcode.append(path) + additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) + fail_msg += "Surplus compute capabilities: %s. " % additional_devcode_str if strict_cc_check: # Surplus compute capabilities not allowed if path in ignore_file_list or ignore_failrues: - files_additional_cc_ignored.append(path) + files_additional_devcode_ignored.append(path) fail_msg += ignore_msg is_failure = False else: - files_additional_cc_fails.append(path) + files_additional_devcode_fails.append(path) is_failure = True else: is_failure = False @@ -3538,10 +3538,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_files += f"{len(files_missing_devcode_but_has_ptx)} files missing one or more CUDA Compute " summary_msg_files += "Capabilities, but has suitable PTX code that can be JIT compiled for the requested " summary_msg_files += f"CUDA Compute Capabilities: {files_missing_devcode_but_has_ptx}\n" - summary_msg_files += "{len(files_additional_cc)} files with device code for more CUDA Compute Capabilities " - summary_msg_files += f"than requested: {files_additional_cc}\n" - summary_msg_files += f"These failures are ignored for {len(files_additional_cc_ignored)} files: " - summary_msg_files += f"{files_additional_cc_ignored})\n" + summary_msg_files += "{len(files_additional_devcode)} files with device code for more CUDA Compute Capabilities " + summary_msg_files += f"than requested: {files_additional_devcode}\n" + summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files: " + summary_msg_files += f"{files_additional_devcode_ignored})\n" summary_msg_files += f"{len(files_missing_ptx} files missing PTX code for the highest configured CUDA Compute " summary_msg_files += f"Capability: {files_missing_ptx}\n" summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files: " @@ -3559,10 +3559,10 @@ def 
sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"{len(files_missing_devcode_but_has_ptx)}\n" summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " if strict_cc_check: - summary_msg += f"{len(files_additional_cc)} (ignored: {len(files_surplus_cc_ignored)}, fails: " - summary_msg += f"{len(files_additional_cc_fails)})\n" + summary_msg += f"{len(files_additional_devcode)} (ignored: {len(files_surplus_cc_ignored)}, fails: " + summary_msg += f"{len(files_additional_devcode_fails)})\n" else: - summary_msg += f"{len(files_additional_cc)} (not running with --cuda-sanity-check-strict, so not " + summary_msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-strict, so not " summary_msg += "considered failures)\n" summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " if accept_missing_ptx: @@ -3581,7 +3581,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "Capabilities. Note that this may increase startup delay due to JIT compilation " summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " summary_msg += "all features specific to your hardware architecture.\n" - if len(files_additional_cc) > 0 and strict_cc_check: + if len(files_additional_devcode) > 0 and strict_cc_check: summary_msg += "\nYou may consider running with --disable-cuda-sanity-check-strict. This means you'll " summary_msg += "accept that some binaries may have CUDA Device Code for more architectures than you " summary_msg += "requested, i.e. the binary is 'fatter' than you need. 
Bigger binaries may generally " From 22c3c23919db4c2f4d5aa87d52bdea540a026eeb Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 16:11:22 +0200 Subject: [PATCH 048/114] only store relative paths in the files_X variables --- easybuild/framework/easyblock.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index dd6fdf32a3..1efdcfd300 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3440,16 +3440,16 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if additional_devcodes: fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " # Count and log for summary report - files_additional_devcode.append(path) + files_additional_devcode.append(os.path.relpath(path, self.installdir)) additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) fail_msg += "Surplus compute capabilities: %s. 
" % additional_devcode_str if strict_cc_check: # Surplus compute capabilities not allowed if path in ignore_file_list or ignore_failrues: - files_additional_devcode_ignored.append(path) + files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg is_failure = False else: - files_additional_devcode_fails.append(path) + files_additional_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True else: is_failure = False @@ -3472,25 +3472,25 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a # failure if all(comparisons): - files_missing_devcode_but_has_ptx.append(path) + files_missing_devcode_but_has_ptx.append(os.path.relpath(path, self.installdir)) is_failure = False else: - files_missing_devcode.append(path) + files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: - files_missing_devcode_ignored.append(path) + files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg is_failure = False else: - files_missing_devcode_fails.append(path) + files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True else: - files_missing_devcode.append(path) + files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: - files_missing_devcode_ignored.append(path) + files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg is_failure = False else: - files_missing_devcode_fails.append(path) + files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True else: msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " @@ -3510,18 +3510,18 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): missing_ptx_ccs = 
list(set(highest_cc) - set(derived_ptx_ccs)) if missing_ptx_ccs: - files_missing_ptx.append(path) + files_missing_ptx.append(os.path.relpath(path, self.installdir)) fail_msg = "Configured highest compute capability was '%s', " fail_msg += "but no PTX code for this compute capability was found in '%s' " fail_msg += "(PTX architectures supported in that file: %s). " if path in ignore_file_list or ignore_failures: - files_missing_ptx_ignored.append(path) + files_missing_ptx_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) elif accept_missing_ptx: self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: - files_missing_ptx_fails.append(path) + files_missing_ptx_fails.append(os.path.relpath(path, self.installdir)) fail_msgs.append(fail_msg % (highest_cc[0], path, derived_ptx_ccs)) else: msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " From 4166d3413d065658baea7d9098fa5ebfeb3b5269 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 16:52:26 +0200 Subject: [PATCH 049/114] Processed various review comments... --- easybuild/framework/easyblock.py | 32 ++++++++++++++++++++++++++++---- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 1efdcfd300..6d8d788e44 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3438,27 +3438,34 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): ignore_msg += "'cuda_sanity_ignore_files'." if additional_devcodes: + # Device code found for more architectures than requested in cuda-compute-capabilities fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. 
" # Count and log for summary report files_additional_devcode.append(os.path.relpath(path, self.installdir)) additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) fail_msg += "Surplus compute capabilities: %s. " % additional_devcode_str - if strict_cc_check: # Surplus compute capabilities not allowed - if path in ignore_file_list or ignore_failrues: + if strict_cc_check: + # cuda-sanity-check-strict, so no additional compute capabilities allowed + if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in the + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg is_failure = False else: + # Sanity error files_additional_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True else: is_failure = False elif missing_ccs: + # One or more device code architectures requested in cuda-compute-capabilities was + # not found in the binary fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " # Count and log for summary report missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) fail_msg += "Missing compute capabilities: %s. 
" % missing_cc_str - # If accept_ptx_as_devcode, this might not be a failure _if_ there is suitable PTX + # If accept_ptx_as_devcode, this might not be a failure IF there is suitable PTX # code to JIT compile from that supports the CCs in missing_ccs if accept_ptx_as_devcode: # Check that for each item in missing_ccs there is PTX code for lower or equal @@ -3475,24 +3482,35 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): files_missing_devcode_but_has_ptx.append(os.path.relpath(path, self.installdir)) is_failure = False else: + # If there are CCs for which there is no suiteable PTX that can be JIT-compiled + # from, this is considerd a failure files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in the + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg is_failure = False else: + # Sanity error files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True else: + # Device code was missing, and we're not accepting PTX code as alternative + # This is considered a failure files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in the + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg is_failure = False else: + # Sanity error files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True else: + # Device code for all architectures requested in --cuda-compute-capabilities was found msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " 
"those in cuda_compute_capabilities") self.log.debug(msg) @@ -3510,17 +3528,22 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) if missing_ptx_ccs: + # There is no PTX code for the highest compute capability in --cuda-compute-capabilities files_missing_ptx.append(os.path.relpath(path, self.installdir)) fail_msg = "Configured highest compute capability was '%s', " fail_msg += "but no PTX code for this compute capability was found in '%s' " fail_msg += "(PTX architectures supported in that file: %s). " if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in the + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail files_missing_ptx_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) elif accept_missing_ptx: + # No error, because we are running with --cuda-sanity-check-accept-missing-ptx self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) else: + # Sanity error files_missing_ptx_fails.append(os.path.relpath(path, self.installdir)) fail_msgs.append(fail_msg % (highest_cc[0], path, derived_ptx_ccs)) else: @@ -3530,6 +3553,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): else: self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") + # Long report, which prints the files that have potential issues summary_msg_files = f"{len(files_missing_devcode}) files missing one or more CUDA compute capabilities: " summary_msg_files += f"{files_missing_devcode}\n" summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files: " @@ -3548,7 +3572,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_files += f"{files_missing_ptx_ignored})" self.log.info(summary_msg_files) - # Summary + # Short 
summary summary_msg = "CUDA sanity check summary report:\n" summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)} " From 45bfcda4af3b6b91cb165efcaab55c5a534e7b09 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 16:56:12 +0200 Subject: [PATCH 050/114] Fix hound issues --- easybuild/framework/easyblock.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 6d8d788e44..b693dc4c34 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3422,8 +3422,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # - Device code for additional compute capabilities is considered a failure if # --cuda-sanity-check-strict is True (otherwise, it's a warning) # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities - # is considered a failure, unless --cuda-sanity-check-accept-missing-ptx is True (in which case it is - # a warning) + # is considered a failure, unless --cuda-sanity-check-accept-missing-ptx is True (in which + # case it is a warning) num_cuda_files += 1 # unpack results derived_ccs = res.device_code_archs @@ -3445,7 +3445,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) fail_msg += "Surplus compute capabilities: %s. 
" % additional_devcode_str if strict_cc_check: - # cuda-sanity-check-strict, so no additional compute capabilities allowed + # cuda-sanity-check-strict, so no additional compute capabilities allowed if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in the # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail @@ -3554,7 +3554,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") # Long report, which prints the files that have potential issues - summary_msg_files = f"{len(files_missing_devcode}) files missing one or more CUDA compute capabilities: " + summary_msg_files = f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities: " summary_msg_files += f"{files_missing_devcode}\n" summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files: " summary_msg_files += "{files_missing_devcode_ignored})\n" @@ -3562,8 +3562,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_files += f"{len(files_missing_devcode_but_has_ptx)} files missing one or more CUDA Compute " summary_msg_files += "Capabilities, but has suitable PTX code that can be JIT compiled for the requested " summary_msg_files += f"CUDA Compute Capabilities: {files_missing_devcode_but_has_ptx}\n" - summary_msg_files += "{len(files_additional_devcode)} files with device code for more CUDA Compute Capabilities " - summary_msg_files += f"than requested: {files_additional_devcode}\n" + summary_msg_files += "{len(files_additional_devcode)} files with device code for more CUDA Compute " + summary_msg_files += f"Capabilities than requested: {files_additional_devcode}\n" summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files: " summary_msg_files += f"{files_additional_devcode_ignored})\n" 
summary_msg_files += f"{len(files_missing_ptx} files missing PTX code for the highest configured CUDA Compute " @@ -3590,8 +3590,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "considered failures)\n" summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " if accept_missing_ptx: - summary_msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx so not considered " - summary_msg += "failures)\n" + summary_msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx so not " + summary_msg += "considered failures)\n" else: summary_msg += f"{len(files_missing_ptx)} (ignored: {len(files_missing_ptx_ignored)}, fails: " summary_msg += f"{len(files_missing_ptx_fails)})\n" @@ -3599,9 +3599,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "Rerun with --debug to see a detailed list of files.\n" # Give some advice if len(files_missing_devcode) > 0 and not accept_ptx_as_devcode: - summary_msg += "\nYou may consider rerunning with --cuda-sanity-check-accept-ptx-as-devcode to accept binaries that " - summary_msg += "don't contain the device code for your requested CUDA Compute Capabilities, but that " - summary_msg += "do have PTX code that can be compiled for your requested CUDA Compute " + summary_msg += "\nYou may consider rerunning with --cuda-sanity-check-accept-ptx-as-devcode to accept " + summary_msg += "binaries that don't contain the device code for your requested CUDA Compute Capabilities, " + summary_msg += "but that do have PTX code that can be compiled for your requested CUDA Compute " summary_msg += "Capabilities. 
Note that this may increase startup delay due to JIT compilation " summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " summary_msg += "all features specific to your hardware architecture.\n" @@ -3612,8 +3612,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "cause some startup delay, and code path selection could introduce a small overhead, " summary_msg += "though this is generally negligible.\n" if len(files_missing_ptx) > 0 and not accept_missing_ptx: - summary_msg += "\nYou may consider running with --cuda-sanity-check-accept-missing-ptx to accept binaries that " - summary_msg += "don't contain PTX code for the highest CUDA Compute Capability you requested. This " + summary_msg += "\nYou may consider running with --cuda-sanity-check-accept-missing-ptx to accept binaries " + summary_msg += "that don't contain PTX code for the highest CUDA Compute Capability you requested. This " summary_msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. 
your compiled " summary_msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " summary_msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you.\n" From 3b9b3869f24941b47d97f5691f6496554ee0f1b8 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 16:58:05 +0200 Subject: [PATCH 051/114] Renamed function --- easybuild/framework/easyblock.py | 4 ++-- easybuild/tools/systemtools.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index b693dc4c34..bbac8d70a5 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -112,7 +112,7 @@ from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar from easybuild.tools.package.utilities import package from easybuild.tools.repository.repository import init_repository -from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_architectures +from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_and_ptx_architectures from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg @@ -3407,7 +3407,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: self.log.debug("Sanity checking for CUDA device code in %s", path) - res = get_cuda_device_code_architectures(path) + res = get_cuda_device_code_and_ptx_architectures(path) if res is None: msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " msg += "so skipping CUDA sanity check." 
diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index c896a57a43..903c33efbe 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -218,10 +218,10 @@ } -# A named tuple, to be returned by e.g. `get_cuda_device_code_architectures` +# A named tuple, to be returned by e.g. `get_cuda_device_code_and_ptx_architectures` cuda_dev_ptx_archs = namedtuple('cuda_dev_ptx_archs', ('device_code_archs', 'ptx_archs')) -cuda_dev_ptx_archs.__doc__ = """A namedtuple that represents the result of a call to get_cuda_device_code_architectures, -with the following fields: +cuda_dev_ptx_archs.__doc__ = """A namedtuple that represents the result of a call to +get_cuda_device_code_and_ptx_architectures, with the following fields: - device_code_archs: a list of CUDA device compute capabilities for which device code was found - ptx_archs: a list of CUDA (virtual) device compute capabilities for which ptx code was found """ @@ -1042,7 +1042,7 @@ def get_cuda_object_dump_raw(path): return result -def get_cuda_device_code_architectures(path): +def get_cuda_device_code_and_ptx_architectures(path): """ Get list of supported CUDA architectures, by inspecting the device code of an executable/library. The format is the same as cuda_compute_capabilities (e.g. ['8.6', '9.0'] for sm_86 sm_90). 
From 068811795542c5ecda827a1f12c168e00f7f7c80 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 17:06:34 +0200 Subject: [PATCH 052/114] Various review comments processed --- easybuild/tools/systemtools.py | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index 903c33efbe..cb15a4515a 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -65,6 +65,7 @@ pass from easybuild.base import fancylogger +from easybuild.tools import LooseVersion from easybuild.tools.build_log import EasyBuildError, EasyBuildExit, print_warning from easybuild.tools.config import IGNORE from easybuild.tools.filetools import is_readable, read_file, which @@ -1023,8 +1024,10 @@ def get_cuda_object_dump_raw(path): # check that the file is an executable or library/object result = None if any(x in res.output for x in ['executable', 'object', 'library']): + # Make sure we have a cuobjdump command + if not shutil.which('cuobjdump'): + raise EasyBuildError("Failed to get object dump from CUDA file: cuobjdump command not found") cuda_cmd = f"cuobjdump {path}" - res = run_shell_cmd(cuda_cmd, fail_on_error=False, hidden=True, output_file=False, stream_output=False) if res.exit_code == EasyBuildExit.SUCCESS: result = res.output @@ -1068,8 +1071,8 @@ def get_cuda_device_code_and_ptx_architectures(path): # compile_size = 64bit # Pattern to extract elf code architectures and ptx code architectures respectively - device_code_regex = re.compile('Fatbin elf code:\n=+\narch = sm_([0-9])([0-9]+a{0,1})') - ptx_code_regex = re.compile('Fatbin ptx code:\n=+\narch = sm_([0-9])([0-9]+a{0,1})') + device_code_regex = re.compile('Fatbin elf code:\n=+\narch = sm_([0-9]+)([0-9]a?)') + ptx_code_regex = re.compile('Fatbin ptx code:\n=+\narch = sm_([0-9]+)([0-9]a?)') # resolve symlinks if os.path.islink(path) and os.path.exists(path): @@ -1080,10 +1083,10 @@ def 
get_cuda_device_code_and_ptx_architectures(path): if cuda_raw is not None: # extract unique device code architectures from raw dump device_code_matches = re.findall(device_code_regex, cuda_raw) - if device_code_matches is not None: + if device_code_matches: # convert match tuples into unique list of cuda compute capabilities # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] - device_code_matches = sorted(['.'.join(m) for m in set(device_code_matches)]) + device_code_matches = sorted(['.'.join(m) for m in set(device_code_matches)], key=LooseVersion) else: # Try to be clear in the warning... did we not find elf code sections at all? or was the arch missing? device_section_regex = re.compile('Fatbin elf code') @@ -1104,12 +1107,12 @@ def get_cuda_device_code_and_ptx_architectures(path): if ptx_code_matches is not None: # convert match tuples into unique list of cuda compute capabilities # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] - ptx_code_matches = sorted(['.'.join(m) for m in set(ptx_code_matches)]) + ptx_code_matches = sorted(['.'.join(m) for m in set(ptx_code_matches)], key=LooseVersion) else: # Try to be clear in the warning... did we not find ptx code sections at all? or was the arch missing? 
ptx_section_regex = re.compile('Fatbin ptx code') ptx_section_matches = re.findall(ptx_section_regex, cuda_raw) - if ptx_section_matches is not None: + if ptx_section_matches: fail_msg = f"Found Fatbin ptx code section(s) in cuobjdump output for {path}, " fail_msg += "but failed to extract CUDA architecture" else: From 050226fe77810c94a5aaf75fff12920588a52008 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 17:08:44 +0200 Subject: [PATCH 053/114] Fixed hound issues: --- easybuild/framework/easyblock.py | 5 +++-- easybuild/tools/systemtools.py | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index bbac8d70a5..4ea378a803 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -112,8 +112,9 @@ from easybuild.tools.output import show_progress_bars, start_progress_bar, stop_progress_bar, update_progress_bar from easybuild.tools.package.utilities import package from easybuild.tools.repository.repository import init_repository -from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism, get_cuda_device_code_and_ptx_architectures +from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group +from easybuild.tools.systemtools import get_cuda_device_code_and_ptx_architectures from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION @@ -3566,7 +3567,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_files += f"Capabilities than requested: {files_additional_devcode}\n" summary_msg_files += f"These failures are ignored for 
{len(files_additional_devcode_ignored)} files: " summary_msg_files += f"{files_additional_devcode_ignored})\n" - summary_msg_files += f"{len(files_missing_ptx} files missing PTX code for the highest configured CUDA Compute " + summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute " summary_msg_files += f"Capability: {files_missing_ptx}\n" summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files: " summary_msg_files += f"{files_missing_ptx_ignored})" diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index cb15a4515a..7bbc470829 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -42,6 +42,7 @@ import platform import pwd import re +import shutil import struct import sys import termios @@ -221,7 +222,7 @@ # A named tuple, to be returned by e.g. `get_cuda_device_code_and_ptx_architectures` cuda_dev_ptx_archs = namedtuple('cuda_dev_ptx_archs', ('device_code_archs', 'ptx_archs')) -cuda_dev_ptx_archs.__doc__ = """A namedtuple that represents the result of a call to +cuda_dev_ptx_archs.__doc__ = """A namedtuple that represents the result of a call to get_cuda_device_code_and_ptx_architectures, with the following fields: - device_code_archs: a list of CUDA device compute capabilities for which device code was found - ptx_archs: a list of CUDA (virtual) device compute capabilities for which ptx code was found @@ -1041,7 +1042,7 @@ def get_cuda_object_dump_raw(path): else: msg = "Dumping CUDA binary file information for '%s' via '%s' failed! 
Output: '%s'" _log.debug(msg, path, cuda_cmd, res.output) - + return result From c8a448a211e4185296fcd894f1aa0ee0a215894d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 17:13:01 +0200 Subject: [PATCH 054/114] Make sure to raise an error if cuobjdump doesnt exist, or if it returns something very unexpected --- easybuild/tools/systemtools.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index 7bbc470829..9aa679d8bf 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -1037,11 +1037,15 @@ def get_cuda_object_dump_raw(path): # contain CUDA device code no_device_code_match = re.search(r'does not contain device code', res.output) if no_device_code_match is not None: + # File is a regular executable, object or library, but not a CUDA file msg = "'%s' does not appear to be a CUDA binary: cuobjdump failed to find device code in this file" _log.debug(msg, path) else: + # This should not happen: there was no string saying this was NOT a CUDA file, yet no device code + # was found at all + msg = "Dumping CUDA binary file information for '%s' via '%s' failed! 
Output: '%s'" - _log.debug(msg, path, cuda_cmd, res.output) + raise EasyBuildError(msg, path, cuda_cmd, res.output) return result From dd2be947a48ee81a13d76b2b6099e8ba7bc20681 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 10 Apr 2025 17:17:05 +0200 Subject: [PATCH 055/114] Raise info to warning when we're not erroring on failure --- easybuild/framework/easyblock.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 4ea378a803..bff28fa112 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3618,7 +3618,11 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. your compiled " summary_msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " summary_msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you.\n" - self.log.info(summary_msg) + # Give this some extra visibility if we're NOT erroring out on failures + if ignore_failures: + self.log.warning(summary_msg) + else: + self.log.info(summary_msg) return fail_msgs From b0d5d5f5b502652b43a86b54da5e8bca56662011 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Apr 2025 14:31:41 +0200 Subject: [PATCH 056/114] Fix linting issues --- easybuild/framework/easyblock.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index bff28fa112..4e99e7fce6 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3476,7 +3476,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): has_smaller_equal_ptx = any( LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in derived_ptx_ccs ) - comparisons.append(has_smaller_equal) + comparisons.append(has_smaller_equal_ptx) # Only if that's 
the case for ALL cc's in missing_ccs, this is a warning, not a # failure if all(comparisons): @@ -3582,9 +3582,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but has suitable " summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg += f"{len(files_missing_devcode_but_has_ptx)}\n" - summary_msg += f"Number of files with device code for more CUDA Compute Capabilities than requested: " + summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " if strict_cc_check: - summary_msg += f"{len(files_additional_devcode)} (ignored: {len(files_surplus_cc_ignored)}, fails: " + summary_msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_cc_ignored)}, fails: " summary_msg += f"{len(files_additional_devcode_fails)})\n" else: summary_msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-strict, so not " From 5c0adced83c43a9d8e777977e9f6e1b131899d19 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Apr 2025 17:13:13 +0200 Subject: [PATCH 057/114] Deduplicate code by replacing get_cuda_device_code_and_ptx_architectures by get_cuda_architectures. 
This can then be called once to retrieve contents from the elf code sections (device code archs) and once from the ptx code sections (ptx code archs) --- easybuild/framework/easyblock.py | 66 +++++++++++++++++--------- easybuild/tools/systemtools.py | 80 ++++++++++---------------------- 2 files changed, 68 insertions(+), 78 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 4e99e7fce6..728a3ad328 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -114,7 +114,7 @@ from easybuild.tools.repository.repository import init_repository from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group -from easybuild.tools.systemtools import get_cuda_device_code_and_ptx_architectures +from easybuild.tools.systemtools import get_cuda_architectures from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION @@ -3408,8 +3408,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: self.log.debug("Sanity checking for CUDA device code in %s", path) - res = get_cuda_device_code_and_ptx_architectures(path) - if res is None: + found_dev_code_ccs = get_cuda_architectures(path, 'elf') + found_ptx_ccs = get_cuda_architectures(path, 'ptx') + if found_dev_code_ccs is None and found_ptx_ccs is None: msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " msg += "so skipping CUDA sanity check." 
self.log.debug(msg) @@ -3425,19 +3426,36 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities # is considered a failure, unless --cuda-sanity-check-accept-missing-ptx is True (in which # case it is a warning) + + # If found_dev_code_ccs is None, but found_ptx_ccs isn't, or vice versa, it IS a CUDA file + # but there was simply no device/ptx code, respectively. So, make that an empty list + # then continue + if found_dev_code_ccs is None: + found_dev_code_ccs = [] + elif found_ptx_ccs is None: + found_ptx_ccs = [] + num_cuda_files += 1 - # unpack results - derived_ccs = res.device_code_archs - derived_ptx_ccs = res.ptx_archs # check whether device code architectures match cuda_compute_capabilities - additional_devcodes = list(set(derived_ccs) - set(cfg_ccs)) - missing_ccs = list(set(cfg_ccs) - set(derived_ccs)) - - # Message for when file is on the ignore list: - ignore_msg = f"This failure will be ignored as '{path}' is listed in " - ignore_msg += "'cuda_sanity_ignore_files'." + additional_devcodes = list(set(found_dev_code_ccs) - set(cfg_ccs)) + missing_ccs = list(set(cfg_ccs) - set(found_dev_code_ccs)) + + # There are two reasons for ignoring failures: + # - We are running with --disable-cuda-sanity-check-error-on-fail + # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig + # In case we run with both, we'll just report that we're running with + # --disable-cuda-sanity-check-error-on-fail + if ignore_failures: + ignore_msg = "Failure for {path} will be ignored since we are running with " + ignore_msg += "--disable-cuda-sanity-check-error-on-fail" + else: + ignore_msg = f"This failure will be ignored as '{path}' is listed in " + ignore_msg += "'cuda_sanity_ignore_files'." 
+ # Set default failure status and empty message + is_failure = False + fail_msg = "" if additional_devcodes: # Device code found for more architectures than requested in cuda-compute-capabilities fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " @@ -3474,7 +3492,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): comparisons = [] for cc in missing_ccs: has_smaller_equal_ptx = any( - LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in derived_ptx_ccs + LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in found_ptx_ccs ) comparisons.append(has_smaller_equal_ptx) # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a @@ -3516,17 +3534,19 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): "those in cuda_compute_capabilities") self.log.debug(msg) - # If considered a failure, append to fails so that a sanity error will be thrown - # Otherwise, log a warning - if is_failure: - fail_msgs.append(fail_msg) - else: - self.log.warning(fail_msg) + # If there's no failure message, device code architectures match, so don't warn or fail + if fail_msg: + # If considered a failure, append to fails so that a sanity error will be thrown + # Otherwise, log a warning + if is_failure: + fail_msgs.append(fail_msg) + else: + self.log.warning(fail_msg) # Check whether there is ptx code for the highest CC in cfg_ccs # Make sure to use LooseVersion so that e.g. 
9.0 < 9.0a < 9.2 < 9.10 highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] - missing_ptx_ccs = list(set(highest_cc) - set(derived_ptx_ccs)) + missing_ptx_ccs = list(set(highest_cc) - set(found_ptx_ccs)) if missing_ptx_ccs: # There is no PTX code for the highest compute capability in --cuda-compute-capabilities @@ -3539,14 +3559,14 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail files_missing_ptx_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg - self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) + self.log.warning(fail_msg, highest_cc[0], path, found_ptx_ccs) elif accept_missing_ptx: # No error, because we are running with --cuda-sanity-check-accept-missing-ptx - self.log.warning(fail_msg, highest_cc[0], path, derived_ptx_ccs) + self.log.warning(fail_msg, highest_cc[0], path, found_ptx_ccs) else: # Sanity error files_missing_ptx_fails.append(os.path.relpath(path, self.installdir)) - fail_msgs.append(fail_msg % (highest_cc[0], path, derived_ptx_ccs)) + fail_msgs.append(fail_msg % (highest_cc[0], path, found_ptx_ccs)) else: msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " "least) the highest CUDA compute capability in cuda_compute_capabilities") diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index 9aa679d8bf..cef3449c63 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -47,7 +47,7 @@ import sys import termios import warnings -from collections import OrderedDict, namedtuple +from collections import OrderedDict from ctypes.util import find_library from socket import gethostname @@ -220,15 +220,6 @@ } -# A named tuple, to be returned by e.g. 
`get_cuda_device_code_and_ptx_architectures` -cuda_dev_ptx_archs = namedtuple('cuda_dev_ptx_archs', ('device_code_archs', 'ptx_archs')) -cuda_dev_ptx_archs.__doc__ = """A namedtuple that represents the result of a call to -get_cuda_device_code_and_ptx_architectures, with the following fields: -- device_code_archs: a list of CUDA device compute capabilities for which device code was found -- ptx_archs: a list of CUDA (virtual) device compute capabilities for which ptx code was found -""" - - class SystemToolsException(Exception): """raised when systemtools fails""" @@ -1050,11 +1041,12 @@ def get_cuda_object_dump_raw(path): return result -def get_cuda_device_code_and_ptx_architectures(path): +def get_cuda_architectures(path, section_type): """ - Get list of supported CUDA architectures, by inspecting the device code of an executable/library. The format is the - same as cuda_compute_capabilities (e.g. ['8.6', '9.0'] for sm_86 sm_90). - Returns None if no CUDA device code is present in the file. + Get a sorted list of CUDA architectures supported in the file in 'path'. 
+ path: full path to a CUDA file + section_type: the type of section in the cuobjdump output to check for architectures ('elf' or 'ptx') + Returns None if no CUDA device code is present in the file """ # Note that typical output for a cuobjdump call will look like this for device code: @@ -1076,61 +1068,39 @@ def get_cuda_device_code_and_ptx_architectures(path): # compile_size = 64bit # Pattern to extract elf code architectures and ptx code architectures respectively - device_code_regex = re.compile('Fatbin elf code:\n=+\narch = sm_([0-9]+)([0-9]a?)') - ptx_code_regex = re.compile('Fatbin ptx code:\n=+\narch = sm_([0-9]+)([0-9]a?)') - + code_regex = re.compile(f'Fatbin {section_type} code:\n=+\narch = sm_([0-9]+)([0-9]a?)') + # resolve symlinks if os.path.islink(path) and os.path.exists(path): path = os.path.realpath(path) - dev_ptx_archs = None + cc_archs = None cuda_raw = get_cuda_object_dump_raw(path) if cuda_raw is not None: # extract unique device code architectures from raw dump - device_code_matches = re.findall(device_code_regex, cuda_raw) - if device_code_matches: - # convert match tuples into unique list of cuda compute capabilities - # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] - device_code_matches = sorted(['.'.join(m) for m in set(device_code_matches)], key=LooseVersion) - else: - # Try to be clear in the warning... did we not find elf code sections at all? or was the arch missing? 
- device_section_regex = re.compile('Fatbin elf code') - device_section_matches = re.findall(device_section_regex, cuda_raw) - if device_section_matches is not None: - fail_msg = f"Found Fatbin elf code section(s) in cuobjdump output for {path}, " - fail_msg += "but failed to extract CUDA architecture" - else: - # In this case, the cuobjdump command _likely_ already returned a non-zero exit - # This error message would only be displayed if cuobjdump somehow completely successfully - # but still no Fatbin elf code section was found - fail_msg = f"Failed to find Fatbin elf code section(s) in cuobjdump output for {path}, " - fail_msg += "are you sure this is a CUDA binary?" - _log.warning(fail_msg) - - # extract unique ptx code architectures from raw dump - ptx_code_matches = re.findall(ptx_code_regex, cuda_raw) - if ptx_code_matches is not None: + code_matches = re.findall(code_regex, cuda_raw) + if code_matches: # convert match tuples into unique list of cuda compute capabilities # e.g. [('8', '6'), ('8', '6'), ('9', '0')] -> ['8.6', '9.0'] - ptx_code_matches = sorted(['.'.join(m) for m in set(ptx_code_matches)], key=LooseVersion) + cc_archs = sorted(['.'.join(m) for m in set(code_matches)], key=LooseVersion) else: - # Try to be clear in the warning... did we not find ptx code sections at all? or was the arch missing? - ptx_section_regex = re.compile('Fatbin ptx code') - ptx_section_matches = re.findall(ptx_section_regex, cuda_raw) - if ptx_section_matches: - fail_msg = f"Found Fatbin ptx code section(s) in cuobjdump output for {path}, " + # Try to be clear in the warning... did we not find elf/ptx code sections at all? or was the arch missing? 
+ section_regex = re.compile(f'Fatbin {section_type} code') + print(f"Section_regex: {section_regex.pattern}") + section_matches = re.findall(section_regex, cuda_raw) + if section_matches: + fail_msg = f"Found Fatbin {section_type} code section(s) in cuobjdump output for {path}, " fail_msg += "but failed to extract CUDA architecture" else: - # In this case, the cuobjdump command _likely_ already returned a non-zero exit - # This error message would only be displayed if cuobjdump somehow completely successfully - # but still no Fatbin ptx code section was found - fail_msg = f"Failed to find Fatbin ptx code section(s) in cuobjdump output for {path}, " - fail_msg += "are you sure this is a CUDA binary?" + # In this case, the "Fatbin {section_type} code" section is simply missing from the binary + # It is entirely possible for a CUDA binary to have only device code or only ptx code (and thus the + # other section could be missing). However, considering --cuda-compute-capabilities is supposed to + # generate both PTX and device code (at least for the highest CC in that list), it is unexpected + # in an EasyBuild context and thus we print a warning + fail_msg = f"Failed to find Fatbin {section_type} code section(s) in cuobjdump output for {path}." 
_log.warning(fail_msg) - dev_ptx_archs = cuda_dev_ptx_archs(ptx_archs=ptx_code_matches, device_code_archs=device_code_matches) - - return dev_ptx_archs + return cc_archs def get_linked_libs_raw(path): From 9c2167fbf5c559d59290e2d47957678e7617b75f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Apr 2025 17:14:59 +0200 Subject: [PATCH 058/114] Grammar fix --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 728a3ad328..2d721e4492 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3599,7 +3599,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)} " summary_msg += f"(ignored: {len(files_missing_devcode_ignored)}, fails: {len(files_missing_devcode_fails)})\n" if accept_ptx_as_devcode: - summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but has suitable " + summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but having suitable " summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg += f"{len(files_missing_devcode_but_has_ptx)}\n" summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " From 4ba7942318336413767b5a1acfbb65026dcc2fc3 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Apr 2025 17:27:05 +0200 Subject: [PATCH 059/114] Fix whitespace --- easybuild/tools/systemtools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index cef3449c63..27880980c1 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -1069,7 +1069,7 @@ def get_cuda_architectures(path, section_type): # Pattern to 
extract elf code architectures and ptx code architectures respectively code_regex = re.compile(f'Fatbin {section_type} code:\n=+\narch = sm_([0-9]+)([0-9]a?)') - + # resolve symlinks if os.path.islink(path) and os.path.exists(path): path = os.path.realpath(path) @@ -1093,9 +1093,9 @@ def get_cuda_architectures(path, section_type): fail_msg += "but failed to extract CUDA architecture" else: # In this case, the "Fatbin {section_type} code" section is simply missing from the binary - # It is entirely possible for a CUDA binary to have only device code or only ptx code (and thus the - # other section could be missing). However, considering --cuda-compute-capabilities is supposed to - # generate both PTX and device code (at least for the highest CC in that list), it is unexpected + # It is entirely possible for a CUDA binary to have only device code or only ptx code (and thus the + # other section could be missing). However, considering --cuda-compute-capabilities is supposed to + # generate both PTX and device code (at least for the highest CC in that list), it is unexpected # in an EasyBuild context and thus we print a warning fail_msg = f"Failed to find Fatbin {section_type} code section(s) in cuobjdump output for {path}." 
_log.warning(fail_msg) From 8d94d87733edc483d18737b589e88bea468f02fd Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 14 Apr 2025 17:28:52 +0200 Subject: [PATCH 060/114] Fix undefined name --- easybuild/framework/easyblock.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 2d721e4492..9ee441318b 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3603,14 +3603,20 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg += f"{len(files_missing_devcode_but_has_ptx)}\n" summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " - if strict_cc_check: - summary_msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_cc_ignored)}, fails: " - summary_msg += f"{len(files_additional_devcode_fails)})\n" + if ignore_failures: + summary_msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-fail-on-error, " + summary_msg += "so not considered failures)\n" + elif strict_cc_check: + summary_msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_devcode_ignored)}, " + summary_msg += f"fails: {len(files_additional_devcode_fails)})\n" else: summary_msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-strict, so not " summary_msg += "considered failures)\n" summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " - if accept_missing_ptx: + if ignore_failures: + summary_msg += f"{len(files_missing_ptx)} (not running with --cuda-sanity-check-fail-on-error so not " + summary_msg += "considered failures)\n" + elif accept_missing_ptx: summary_msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx so not " 
summary_msg += "considered failures)\n" else: From 0b615e198fbd184b99f9778d1a32ce1b4539ccde Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 10:34:56 +0200 Subject: [PATCH 061/114] Create mock setup for get_cuda_object_dump_raw and get_cuda_architecture --- test/framework/systemtools.py | 174 ++++++++++++++++++++++++++++++++++ 1 file changed, 174 insertions(+) diff --git a/test/framework/systemtools.py b/test/framework/systemtools.py index 7529166d72..e44e3b50ed 100644 --- a/test/framework/systemtools.py +++ b/test/framework/systemtools.py @@ -299,6 +299,164 @@ DirectMap1G: 65011712 kB """ +FILE_BIN=""" +ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, +for GNU/Linux 3.2.0, not stripped, too many notes (256) +""" + +FILE_SHAREDLIB=""" +ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, +BuildID[sha1]=5535086d3380568f8eaecfa2e73f456f1edd94ec, stripped +""" + +CUOBJDUMP_FAT=""" + +Fatbin elf code: +================ +arch = sm_50 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_60 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_61 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_70 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_75 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_80 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_86 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_89 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin ptx code: 
+================ +arch = sm_90 +code version = [8,1] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_90 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_90a +code version = [1,7] +host = linux +compile_size = 64bit + +Fatbin ptx code: +================ +arch = sm_90a +code version = [8,4] +host = linux +compile_size = 64bit +compressed +ptxasOptions = +""" + +CUOBJDUMP_PTX_ONLY=""" + +Fatbin ptx code: +================ +arch = sm_90 +code version = [8,4] +host = linux +compile_size = 64bit +compressed +ptxasOptions = + +Fatbin ptx code: +================ +arch = sm_90a +code version = [8,4] +host = linux +compile_size = 64bit +compressed +ptxasOptions = +""" + +CUOBJDUMP_DEVICE_CODE_ONLY=""" + +Fatbin elf code: +================ +arch = sm_90 +code version = [1,7] +host = linux +compile_size = 64bit +compressed + +Fatbin elf code: +================ +arch = sm_90a +code version = [1,7] +host = linux +compile_size = 64bit +""" + + +CUOBJDUMP_NON_CUDA_SHAREDLIB=""" +cuobjdump info : File '/path/to/my/mock.so' does not contain device code +""" + +CUOBJDUMP_NON_CUDA_UNEXPECTED===""" +cuobjdump info : Some unexpected output +""" + + MACHINE_NAME = None @@ -338,6 +496,17 @@ def mocked_run_shell_cmd(cmd, **kwargs): "sysctl -n machdep.cpu.leaf7_features": "SMEP ERMS RDWRFSGS TSC_THREAD_OFFSET BMI1 AVX2 BMI2 INVPCID FPU_CSDS", "sysctl -n machdep.cpu.vendor": 'GenuineIntel', "ulimit -u": '40', + "file mock_cuda_bin": FILE_BIN, + "file mock_cuda_sharedlib": FILE_SHAREDLIB, + "file mock_non_cuda_sharedlib": FILE_SHAREDLIB, + "file mock_non_cuda_sharedlib_unexpected": FILE_SHAREDLIB, + "file mock_cuda_staticlib": "current ar archive", + "file mock_noncuda_file": "ASCII text", + "cuobjdump mock_cuda_bin": CUOBJDUMP_FAT, + "cuobjdump mock_cuda_sharedlib": CUOBJDUMP_PTX_ONLY, + "cuobjdump mock_non_cuda_sharedlib": CUOBJDUMP_NON_CUDA_SHAREDLIB, + "cuobjdump 
mock_non_cuda_sharedlib_unexpected": CUOBJDUMP_NON_CUDA_UNEXPECTED, + "cuobjdump mock_cuda_staticlib": CUOBJDUMP_DEVICE_CODE_ONLY, } if cmd in known_cmds: return RunShellCmdResult(cmd=cmd, exit_code=0, output=known_cmds[cmd], stderr=None, work_dir=os.getcwd(), @@ -1141,6 +1310,11 @@ def test_find_library_path(self): if os_type != DARWIN: self.assertExists(lib_path) + def test_get_cuda_object_dump_raw(self): + """Test get_cuda_object_dump_raw function""" + st.run_shell_cmd = mocked_run_shell_cmd + st.get_cuda_object_dump_raw('') + def suite(): """ returns all the testcases in this module """ From f9e99a2da55d41523a75109e29f151cc811096a6 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 10:56:11 +0200 Subject: [PATCH 062/114] Fix naming in config and put in the correct (alphabetical) place in the list --- easybuild/tools/config.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py index 0ddee0fb8a..b9ddaff7ba 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -297,6 +297,10 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'backup_patched_files', 'consider_archived_easyconfigs', 'container_build_image', + 'cuda_sanity_check_accept_ptx_as_devcode', + 'cuda_sanity_check_accept_missing_ptx', + 'cuda_sanity_check_error_on_fail', + 'cuda_sanity_check_strict', 'debug', 'debug_lmod', 'dump_autopep8', @@ -342,9 +346,6 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'use_existing_modules', 'use_f90cache', 'wait_on_lock_limit', - 'strict_cuda_sanity_check', - 'accept_ptx_as_cc_support', - 'accept_missing_cuda_ptx', ], True: [ 'cleanup_builddir', @@ -352,7 +353,6 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'cleanup_tmpdir', 'extended_dry_run_ignore_errors', 'fixed_installdir_naming_scheme', - 'ignore_cuda_sanity_failures', 'lib_lib64_symlink', 'lib64_fallback_sanity_check', 'lib64_lib_symlink', From 
aca934df21f2603f4210ddb07633ed3a716592e3 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 10:56:52 +0200 Subject: [PATCH 063/114] Make sure archives are also checked. Library does _not_ seem to be an official possibility for the output string, so we can omit that --- easybuild/tools/systemtools.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index 27880980c1..b2b4e57661 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -1013,9 +1013,9 @@ def get_cuda_object_dump_raw(path): fail_msg = "Failed to run 'file %s': %s" % (path, res.output) _log.warning(fail_msg) - # check that the file is an executable or library/object + # check that the file is an executable or object (shared library) or archive (static library) result = None - if any(x in res.output for x in ['executable', 'object', 'library']): + if any(x in res.output for x in ['executable', 'object', 'archive']): # Make sure we have a cuobjdump command if not shutil.which('cuobjdump'): raise EasyBuildError("Failed to get object dump from CUDA file: cuobjdump command not found") From 7fde91bfedcb27a79f9225f56b97d8f127dcee3c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 10:57:13 +0200 Subject: [PATCH 064/114] Initial (working) version of a unit test for get_cuda_object_dump_raw --- test/framework/systemtools.py | 33 +++++++++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/test/framework/systemtools.py b/test/framework/systemtools.py index e44e3b50ed..2b2656f369 100644 --- a/test/framework/systemtools.py +++ b/test/framework/systemtools.py @@ -28,6 +28,7 @@ @author: Kenneth hoste (Ghent University) @author: Ward Poelmans (Ghent University) """ +import copy import ctypes import re import os @@ -39,14 +40,16 @@ import easybuild.tools.systemtools as st from easybuild.tools.build_log import EasyBuildError -from
easybuild.tools.filetools import adjust_permissions, read_file, symlink, which, write_file +from easybuild.tools.environment import modify_env, setvar +from easybuild.tools.filetools import adjust_permissions, mkdir, read_file, symlink, which, write_file from easybuild.tools.run import RunShellCmdResult, run_shell_cmd from easybuild.tools.systemtools import CPU_ARCHITECTURES, AARCH32, AARCH64, POWER, X86_64 from easybuild.tools.systemtools import CPU_FAMILIES, POWER_LE, DARWIN, LINUX, UNKNOWN from easybuild.tools.systemtools import CPU_VENDORS, AMD, APM, ARM, CAVIUM, IBM, INTEL from easybuild.tools.systemtools import MAX_FREQ_FP, PROC_CPUINFO_FP, PROC_MEMINFO_FP from easybuild.tools.systemtools import check_linked_shared_libs, check_os_dependency, check_python_version -from easybuild.tools.systemtools import det_parallelism, get_avail_core_count, get_cpu_arch_name, get_cpu_architecture +from easybuild.tools.systemtools import det_parallelism, get_avail_core_count, get_cuda_object_dump_raw +from easybuild.tools.systemtools import get_cuda_architectures, get_cpu_arch_name, get_cpu_architecture from easybuild.tools.systemtools import get_cpu_family, get_cpu_features, get_cpu_model, get_cpu_speed, get_cpu_vendor from easybuild.tools.systemtools import get_gcc_version, get_glibc_version, get_os_type, get_os_name, get_os_version from easybuild.tools.systemtools import get_platform_name, get_shared_lib_ext, get_system_info, get_total_memory @@ -452,7 +455,7 @@ cuobjdump info : File '/path/to/my/mock.so' does not contain device code """ -CUOBJDUMP_NON_CUDA_UNEXPECTED===""" +CUOBJDUMP_NON_CUDA_UNEXPECTED=""" cuobjdump info : Some unexpected output """ @@ -1312,8 +1315,30 @@ def test_find_library_path(self): def test_get_cuda_object_dump_raw(self): """Test get_cuda_object_dump_raw function""" + # This test modifies environment, make sure we can revert the changes: + start_env = copy.deepcopy(os.environ) + st.run_shell_cmd = mocked_run_shell_cmd - 
st.get_cuda_object_dump_raw('') + + # Test case 1: there's no cuobjdump on the path yet + error_pattern=r"cuobjdump command not found" + self.assertErrorRegex(EasyBuildError, error_pattern, get_cuda_object_dump_raw, path='mock_cuda_bin') + + # Put a cuobjdump on the path, doesn't matter what. It will be mocked anyway + cuobjdump_dir = os.path.join(self.test_prefix, 'cuobjdump_dir') + mkdir(cuobjdump_dir, parents=True) + setvar('PATH', '%s:%s' % (cuobjdump_dir, os.getenv('PATH'))) + cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') + write_file(cuobjdump_file, "#!/bin/bash\n") + write_file(cuobjdump_file, "echo 'Mock script, this should never actually be called\n'") + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + + # Test case 2: get raw output from mock_cuda_bin, a 'fat' binary + # TODO: check output + get_cuda_object_dump_raw('mock_cuda_bin') + + # Restore original environment + modify_env(os.environ, start_env, verbose=False) def suite(): From 2258d91823eb125154e37fe0968a6193be9e4ef7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 12:53:43 +0200 Subject: [PATCH 065/114] Fix hound issues --- test/framework/systemtools.py | 45 +++++++++++------------------------ 1 file changed, 14 insertions(+), 31 deletions(-) diff --git a/test/framework/systemtools.py b/test/framework/systemtools.py index 2b2656f369..61d6d21b60 100644 --- a/test/framework/systemtools.py +++ b/test/framework/systemtools.py @@ -302,19 +302,14 @@ DirectMap1G: 65011712 kB """ -FILE_BIN=""" -ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, -for GNU/Linux 3.2.0, not stripped, too many notes (256) -""" +FILE_BIN = """ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter +/lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, not stripped, too many notes (256)""" -FILE_SHAREDLIB=""" -ELF 64-bit LSB shared object, x86-64, 
version 1 (SYSV), dynamically linked, -BuildID[sha1]=5535086d3380568f8eaecfa2e73f456f1edd94ec, stripped -""" +FILE_SHAREDLIB="""ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, +BuildID[sha1]=5535086d3380568f8eaecfa2e73f456f1edd94ec, stripped""" CUOBJDUMP_FAT=""" - -Fatbin elf code: +Fatbin elf code: ================ arch = sm_50 code version = [1,7] @@ -408,11 +403,9 @@ host = linux compile_size = 64bit compressed -ptxasOptions = -""" - -CUOBJDUMP_PTX_ONLY=""" +ptxasOptions =""" +CUOBJDUMP_PTX_ONLY = """ Fatbin ptx code: ================ arch = sm_90 @@ -429,11 +422,9 @@ host = linux compile_size = 64bit compressed -ptxasOptions = -""" - -CUOBJDUMP_DEVICE_CODE_ONLY=""" +ptxasOptions =""" +CUOBJDUMP_DEVICE_CODE_ONLY = """ Fatbin elf code: ================ arch = sm_90 @@ -447,18 +438,10 @@ arch = sm_90a code version = [1,7] host = linux -compile_size = 64bit -""" - +compile_size = 64bit""" -CUOBJDUMP_NON_CUDA_SHAREDLIB=""" -cuobjdump info : File '/path/to/my/mock.so' does not contain device code -""" - -CUOBJDUMP_NON_CUDA_UNEXPECTED=""" -cuobjdump info : Some unexpected output -""" +CUOBJDUMP_NON_CUDA_SHAREDLIB = "cuobjdump info : File '/path/to/my/mock.so' does not contain device code" MACHINE_NAME = None @@ -507,8 +490,8 @@ def mocked_run_shell_cmd(cmd, **kwargs): "file mock_noncuda_file": "ASCII text", "cuobjdump mock_cuda_bin": CUOBJDUMP_FAT, "cuobjdump mock_cuda_sharedlib": CUOBJDUMP_PTX_ONLY, - "cuobjdump mock_non_cuda_sharedlib": CUOBJDUMP_NON_CUDA_SHAREDLIB, - "cuobjdump mock_non_cuda_sharedlib_unexpected": CUOBJDUMP_NON_CUDA_UNEXPECTED, + "cuobjdump mock_non_cuda_sharedlib": "cuobjdump info : File '/path/to/mock.so' does not contain device code", + "cuobjdump mock_non_cuda_sharedlib_unexpected": "cuobjdump info : Some unexpected output", "cuobjdump mock_cuda_staticlib": CUOBJDUMP_DEVICE_CODE_ONLY, } if cmd in known_cmds: @@ -1335,7 +1318,7 @@ def test_get_cuda_object_dump_raw(self): # Test case 2: get raw output from 
mock_cuda_bin, a 'fat' binary # TODO: check output - get_cuda_object_dump_raw('mock_cuda_bin') + print(get_cuda_object_dump_raw('mock_cuda_bin')) # Restore original environment modify_env(os.environ, start_env, verbose=False) From f782df8bc34a7c2d1bb72d7262f3b8e2028ce271 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 16:31:45 +0200 Subject: [PATCH 066/114] More test cases --- test/framework/systemtools.py | 43 +++++++++++++++++++++++++++-------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/test/framework/systemtools.py b/test/framework/systemtools.py index 61d6d21b60..812794f97d 100644 --- a/test/framework/systemtools.py +++ b/test/framework/systemtools.py @@ -30,8 +30,9 @@ """ import copy import ctypes -import re +import logging import os +import re import sys import stat @@ -305,10 +306,10 @@ FILE_BIN = """ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter /lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, not stripped, too many notes (256)""" -FILE_SHAREDLIB="""ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, +FILE_SHAREDLIB = """ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, BuildID[sha1]=5535086d3380568f8eaecfa2e73f456f1edd94ec, stripped""" -CUOBJDUMP_FAT=""" +CUOBJDUMP_FAT = """ Fatbin elf code: ================ arch = sm_50 @@ -441,8 +442,6 @@ compile_size = 64bit""" -CUOBJDUMP_NON_CUDA_SHAREDLIB = "cuobjdump info : File '/path/to/my/mock.so' does not contain device code" - MACHINE_NAME = None @@ -490,13 +489,19 @@ def mocked_run_shell_cmd(cmd, **kwargs): "file mock_noncuda_file": "ASCII text", "cuobjdump mock_cuda_bin": CUOBJDUMP_FAT, "cuobjdump mock_cuda_sharedlib": CUOBJDUMP_PTX_ONLY, - "cuobjdump mock_non_cuda_sharedlib": "cuobjdump info : File '/path/to/mock.so' does not contain device code", - "cuobjdump mock_non_cuda_sharedlib_unexpected": "cuobjdump info : Some unexpected output", "cuobjdump mock_cuda_staticlib": 
CUOBJDUMP_DEVICE_CODE_ONLY, } + known_fail_cmds = { + "cuobjdump mock_non_cuda_sharedlib": ("cuobjdump info : File '/path/to/mock.so' does not contain device code", 255), + "cuobjdump mock_non_cuda_sharedlib_unexpected": ("cuobjdump info : Some unexpected output", 255), + } if cmd in known_cmds: return RunShellCmdResult(cmd=cmd, exit_code=0, output=known_cmds[cmd], stderr=None, work_dir=os.getcwd(), out_file=None, err_file=None, cmd_sh=None, thread_id=None, task_id=None) + elif cmd in known_fail_cmds: + return RunShellCmdResult(cmd=cmd, exit_code=known_fail_cmds[cmd][1], output=known_fail_cmds[cmd][0], + stderr=None, work_dir=os.getcwd(), out_file=None, err_file=None, cmd_sh=None, + thread_id=None, task_id=None) else: return run_shell_cmd(cmd, **kwargs) @@ -1304,7 +1309,7 @@ def test_get_cuda_object_dump_raw(self): st.run_shell_cmd = mocked_run_shell_cmd # Test case 1: there's no cuobjdump on the path yet - error_pattern=r"cuobjdump command not found" + error_pattern = r"cuobjdump command not found" self.assertErrorRegex(EasyBuildError, error_pattern, get_cuda_object_dump_raw, path='mock_cuda_bin') # Put a cuobjdump on the path, doesn't matter what. 
It will be mocked anyway @@ -1318,7 +1323,27 @@ def test_get_cuda_object_dump_raw(self): # Test case 2: get raw output from mock_cuda_bin, a 'fat' binary # TODO: check output - print(get_cuda_object_dump_raw('mock_cuda_bin')) + self.assertEqual(get_cuda_object_dump_raw('mock_cuda_bin'), CUOBJDUMP_FAT) + + # Test case 3: call on a file that is NOT an executable, object or archive: + self.assertIsNone(get_cuda_object_dump_raw('mock_noncuda_file')) + + # Test case 4: call on a file that is an shared lib, but not a CUDA shared lib + # Check debug message in this case + warning_regex = re.compile(r"does not appear to be a CUDA binary: cuobjdump failed to find device code in this file", re.M) + old_log_level = st._log.getEffectiveLevel() + st._log.setLevel(logging.DEBUG) + with self.log_to_testlogfile(): + res = get_cuda_object_dump_raw('mock_non_cuda_sharedlib') + st._log.setLevel(old_log_level) + logtxt = read_file(self.logfile) + self.assertIsNone(res) + fail_msg = "Pattern '%s' should be found in: %s" % (warning_regex.pattern, logtxt) + self.assertTrue(warning_regex.search(logtxt), fail_msg) + + # Test case 5: call on a file where cuobjdump produces really unexpected output + error_pattern = r"Dumping CUDA binary file information for .* via .* failed!" 
+ self.assertErrorRegex(EasyBuildError, error_pattern, get_cuda_object_dump_raw, path='mock_non_cuda_sharedlib_unexpected') # Restore original environment modify_env(os.environ, start_env, verbose=False) From 3ba1d7bf1063e76e6c5d6e077c85e1a6ecac719b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 17:27:27 +0200 Subject: [PATCH 067/114] Remove a stray print statement --- easybuild/tools/systemtools.py | 1 - 1 file changed, 1 deletion(-) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index b2b4e57661..f74ad7631f 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -1086,7 +1086,6 @@ def get_cuda_architectures(path, section_type): else: # Try to be clear in the warning... did we not find elf/ptx code sections at all? or was the arch missing? section_regex = re.compile(f'Fatbin {section_type} code') - print(f"Section_regex: {section_regex.pattern}") section_matches = re.findall(section_regex, cuda_raw) if section_matches: fail_msg = f"Found Fatbin {section_type} code section(s) in cuobjdump output for {path}, " From 79d7084c5b224226bcc9d057ddf678be52233520 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 17:28:02 +0200 Subject: [PATCH 068/114] Add remaining test cases for get_cuda_architecture and get_cuda_object_dump_raw --- test/framework/systemtools.py | 98 +++++++++++++++++++++++++++++++++-- 1 file changed, 94 insertions(+), 4 deletions(-) diff --git a/test/framework/systemtools.py b/test/framework/systemtools.py index 812794f97d..f86d584fbe 100644 --- a/test/framework/systemtools.py +++ b/test/framework/systemtools.py @@ -441,6 +441,14 @@ host = linux compile_size = 64bit""" +# Invalid, because it doesn't contain an arch = sm_XX entry +CUOBJDUMP_INVALID = """ +Fatbin elf code: +================ +code version = [1,7] +host = linux +compile_size = 64bit +compressed""" MACHINE_NAME = None @@ -483,12 +491,14 @@ def mocked_run_shell_cmd(cmd, **kwargs): "ulimit 
-u": '40', "file mock_cuda_bin": FILE_BIN, "file mock_cuda_sharedlib": FILE_SHAREDLIB, + "file mock_invalid_cuda_sharedlib": FILE_SHAREDLIB, "file mock_non_cuda_sharedlib": FILE_SHAREDLIB, "file mock_non_cuda_sharedlib_unexpected": FILE_SHAREDLIB, "file mock_cuda_staticlib": "current ar archive", "file mock_noncuda_file": "ASCII text", "cuobjdump mock_cuda_bin": CUOBJDUMP_FAT, "cuobjdump mock_cuda_sharedlib": CUOBJDUMP_PTX_ONLY, + "cuobjdump mock_invalid_cuda_sharedlib": CUOBJDUMP_INVALID, "cuobjdump mock_cuda_staticlib": CUOBJDUMP_DEVICE_CODE_ONLY, } known_fail_cmds = { @@ -1306,6 +1316,7 @@ def test_get_cuda_object_dump_raw(self): # This test modifies environment, make sure we can revert the changes: start_env = copy.deepcopy(os.environ) + # Mock the shell command for certain known commands st.run_shell_cmd = mocked_run_shell_cmd # Test case 1: there's no cuobjdump on the path yet @@ -1322,7 +1333,6 @@ def test_get_cuda_object_dump_raw(self): adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable # Test case 2: get raw output from mock_cuda_bin, a 'fat' binary - # TODO: check output self.assertEqual(get_cuda_object_dump_raw('mock_cuda_bin'), CUOBJDUMP_FAT) # Test case 3: call on a file that is NOT an executable, object or archive: @@ -1330,7 +1340,7 @@ def test_get_cuda_object_dump_raw(self): # Test case 4: call on a file that is an shared lib, but not a CUDA shared lib # Check debug message in this case - warning_regex = re.compile(r"does not appear to be a CUDA binary: cuobjdump failed to find device code in this file", re.M) + debug_regex = re.compile(r"DEBUG .* does not appear to be a CUDA binary: cuobjdump failed to find device code in this file", re.M) old_log_level = st._log.getEffectiveLevel() st._log.setLevel(logging.DEBUG) with self.log_to_testlogfile(): @@ -1338,12 +1348,92 @@ def test_get_cuda_object_dump_raw(self): st._log.setLevel(old_log_level) logtxt = read_file(self.logfile) self.assertIsNone(res) 
- fail_msg = "Pattern '%s' should be found in: %s" % (warning_regex.pattern, logtxt) - self.assertTrue(warning_regex.search(logtxt), fail_msg) + fail_msg = "Pattern '%s' should be found in: %s" % (debug_regex.pattern, logtxt) + self.assertTrue(debug_regex.search(logtxt), fail_msg) # Test case 5: call on a file where cuobjdump produces really unexpected output error_pattern = r"Dumping CUDA binary file information for .* via .* failed!" self.assertErrorRegex(EasyBuildError, error_pattern, get_cuda_object_dump_raw, path='mock_non_cuda_sharedlib_unexpected') + + # Test case 6: call on CUDA shared lib, which only contains PTX code + self.assertEqual(get_cuda_object_dump_raw('mock_cuda_sharedlib'), CUOBJDUMP_PTX_ONLY) + + # Test case 7: call on CUDA static lib, which only contains device code + self.assertEqual(get_cuda_object_dump_raw('mock_cuda_staticlib'), CUOBJDUMP_DEVICE_CODE_ONLY) + + # Restore original environment + modify_env(os.environ, start_env, verbose=False) + + def test_get_cuda_architectures(self): + """Test get_cuda_architectures function""" + # This test modifies environment, make sure we can revert the changes: + start_env = copy.deepcopy(os.environ) + + # Mock the shell command for certain known commands + st.run_shell_cmd = mocked_run_shell_cmd + + # Put a cuobjdump on the path, doesn't matter what. 
It will be mocked anyway + cuobjdump_dir = os.path.join(self.test_prefix, 'cuobjdump_dir') + mkdir(cuobjdump_dir, parents=True) + setvar('PATH', '%s:%s' % (cuobjdump_dir, os.getenv('PATH'))) + cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') + write_file(cuobjdump_file, "#!/bin/bash\n") + write_file(cuobjdump_file, "echo 'Mock script, this should never actually be called\n'") + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + + # Test case 1: get raw output from mock_cuda_bin, a 'fat' binary + mock_cuda_bin_device_codes = ['6.0', '6.1', '7.0', '7.5', '8.0', '8.6', '8.9', '9.0', '9.0a'] + mock_cuda_bin_ptx = ['9.0', '9.0a'] + self.assertEqual(get_cuda_architectures('mock_cuda_bin', 'elf'), mock_cuda_bin_device_codes) + self.assertEqual(get_cuda_architectures('mock_cuda_bin', 'ptx'), mock_cuda_bin_ptx) + + # Test case 2: call on a file that is NOT an executable, object or archive: + self.assertIsNone(get_cuda_architectures('mock_noncuda_file', 'elf')) + self.assertIsNone(get_cuda_architectures('mock_noncuda_file', 'ptx')) + + # Test case 3: call on a file that is an shared lib, but not a CUDA shared lib + self.assertIsNone(get_cuda_architectures('mock_non_cuda_sharedlib', 'elf')) + self.assertIsNone(get_cuda_architectures('mock_non_cuda_sharedlib', 'ptx')) + + # Test case 4: call on CUDA shared lib, which only contains PTX code + warning_regex_elf = re.compile(r"WARNING Failed to find Fatbin elf code section\(s\) in cuobjdump output for mock_cuda_sharedlib", re.M) + old_log_level = st._log.getEffectiveLevel() + st._log.setLevel(logging.DEBUG) + with self.log_to_testlogfile(): + res_elf = get_cuda_architectures('mock_cuda_sharedlib', 'elf') + res_ptx = get_cuda_architectures('mock_cuda_sharedlib', 'ptx') + st._log.setLevel(old_log_level) + logtxt = read_file(self.logfile) + self.assertIsNone(res_elf) + fail_msg = "Pattern '%s' should be found in: %s" % (warning_regex_elf.pattern, logtxt) + 
self.assertTrue(warning_regex_elf.search(logtxt), fail_msg) + self.assertEqual(res_ptx, ['9.0', '9.0a']) + + # Test case 5: call on CUDA static lib, which only contains device code + warning_regex_ptx = re.compile(r"WARNING Failed to find Fatbin ptx code section\(s\) in cuobjdump output for mock_cuda_staticlib", re.M) + old_log_level = st._log.getEffectiveLevel() + st._log.setLevel(logging.DEBUG) + with self.log_to_testlogfile(): + res_elf = get_cuda_architectures('mock_cuda_staticlib', 'elf') + res_ptx = get_cuda_architectures('mock_cuda_staticlib', 'ptx') + st._log.setLevel(old_log_level) + logtxt = read_file(self.logfile) + self.assertIsNone(res_ptx) + fail_msg = "Pattern '%s' should be found in: %s" % (warning_regex_ptx.pattern, logtxt) + self.assertTrue(warning_regex_ptx.search(logtxt), fail_msg) + self.assertEqual(res_elf, ['9.0', '9.0a']) + + # Test case 6: call on CUDA shared lib which lacks an arch = sm_XX entry (should never happen) + warning_regex_elf = re.compile(r"WARNING Found Fatbin elf code section\(s\) in cuobjdump output for mock_invalid_cuda_sharedlib, but failed to extract CUDA architecture", re.M) + old_log_level = st._log.getEffectiveLevel() + st._log.setLevel(logging.DEBUG) + with self.log_to_testlogfile(): + res_elf = get_cuda_architectures('mock_invalid_cuda_sharedlib', 'elf') + st._log.setLevel(old_log_level) + logtxt = read_file(self.logfile) + fail_msg = "Pattern '%s' should be found in: %s" % (warning_regex_elf.pattern, logtxt) + self.assertTrue(warning_regex_elf.search(logtxt), fail_msg) + self.assertIsNone(res_elf) # Restore original environment modify_env(os.environ, start_env, verbose=False) From 316e71f6cb957d0a878bb06b3256551a571e81df Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 18:13:18 +0200 Subject: [PATCH 069/114] Change if-elif-else into a nested if-else, with an if-if. This is since _BOTH_ additional_devcodes and missing_devcodes may be true, it's not either or. 
And we want the sanity check to report on _both_ issues --- easybuild/framework/easyblock.py | 188 ++++++++++++++++--------------- 1 file changed, 100 insertions(+), 88 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 9ee441318b..ca8a462eef 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3439,70 +3439,103 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # check whether device code architectures match cuda_compute_capabilities additional_devcodes = list(set(found_dev_code_ccs) - set(cfg_ccs)) - missing_ccs = list(set(cfg_ccs) - set(found_dev_code_ccs)) - - # There are two reasons for ignoring failures: - # - We are running with --disable-cuda-sanity-check-error-on-fail - # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig - # In case we run with both, we'll just report that we're running with - # --disable-cuda-sanity-check-error-on-fail - if ignore_failures: - ignore_msg = "Failure for {path} will be ignored since we are running with " - ignore_msg += "--disable-cuda-sanity-check-error-on-fail" + missing_devcodes = list(set(cfg_ccs) - set(found_dev_code_ccs)) + + + if not missing_devcodes and not additional_devcodes: + # Device code for all architectures requested in --cuda-compute-capabilities was found + msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " + "those in cuda_compute_capabilities") + self.log.debug(msg) else: - ignore_msg = f"This failure will be ignored as '{path}' is listed in " - ignore_msg += "'cuda_sanity_ignore_files'." - - # Set default failure status and empty message - is_failure = False - fail_msg = "" - if additional_devcodes: - # Device code found for more architectures than requested in cuda-compute-capabilities - fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. 
" - # Count and log for summary report - files_additional_devcode.append(os.path.relpath(path, self.installdir)) - additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) - fail_msg += "Surplus compute capabilities: %s. " % additional_devcode_str - if strict_cc_check: - # cuda-sanity-check-strict, so no additional compute capabilities allowed - if path in ignore_file_list or ignore_failures: - # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail - files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) - fail_msg += ignore_msg - is_failure = False - else: - # Sanity error - files_additional_devcode_fails.append(os.path.relpath(path, self.installdir)) - is_failure = True + # There are two reasons for ignoring failures: + # - We are running with --disable-cuda-sanity-check-error-on-fail + # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig + # In case we run with both, we'll just report that we're running with + # --disable-cuda-sanity-check-error-on-fail + if ignore_failures: + ignore_msg = "Failure for {path} will be ignored since we are running with " + ignore_msg += "--disable-cuda-sanity-check-error-on-fail" else: - is_failure = False - elif missing_ccs: - # One or more device code architectures requested in cuda-compute-capabilities was - # not found in the binary - fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " - # Count and log for summary report - missing_cc_str = ', '.join(sorted(missing_ccs, key=LooseVersion)) - fail_msg += "Missing compute capabilities: %s. 
" % missing_cc_str - # If accept_ptx_as_devcode, this might not be a failure IF there is suitable PTX - # code to JIT compile from that supports the CCs in missing_ccs - if accept_ptx_as_devcode: - # Check that for each item in missing_ccs there is PTX code for lower or equal - # CUDA compute capability - comparisons = [] - for cc in missing_ccs: - has_smaller_equal_ptx = any( - LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in found_ptx_ccs - ) - comparisons.append(has_smaller_equal_ptx) - # Only if that's the case for ALL cc's in missing_ccs, this is a warning, not a - # failure - if all(comparisons): - files_missing_devcode_but_has_ptx.append(os.path.relpath(path, self.installdir)) + ignore_msg = f"This failure will be ignored as '{path}' is listed in " + ignore_msg += "'cuda_sanity_ignore_files'." + + # Set default failure status and empty message + is_failure = False + fail_msg = "" + + if additional_devcodes: + # Device code found for more architectures than requested in cuda-compute-capabilities + fail_msg += f"Mismatch between cuda_compute_capabilities and device code in {path}. " + # Count and log for summary report + files_additional_devcode.append(os.path.relpath(path, self.installdir)) + additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) + fail_msg += "Surplus compute capabilities: %s. 
" % additional_devcode_str + if strict_cc_check: + # cuda-sanity-check-strict, so no additional compute capabilities allowed + if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in the + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) + fail_msg += ignore_msg + is_failure = False + else: + # Sanity error + files_additional_devcode_fails.append(os.path.relpath(path, self.installdir)) + is_failure = True + else: is_failure = False + # Do reporting for the additional_devcodes case + # If considered a failure, append to fails so that a sanity error will be thrown + # Otherwise, log a warning + # Note that we report on the additional_devcodes and missing_devices cases separately + # Because one could be a failure, while the other isn't + if is_failure: + fail_msgs.append(fail_msg) else: - # If there are CCs for which there is no suiteable PTX that can be JIT-compiled - # from, this is considerd a failure + self.log.warning(fail_msg) + + # Both additional_devcodes and missing_devcodes could be try, so use if, not elif + if missing_devcodes: + # One or more device code architectures requested in cuda-compute-capabilities was + # not found in the binary + fail_msg += f"Mismatch between cuda_compute_capabilities and device code in {path}. " + # Count and log for summary report + missing_devcodes_str = ', '.join(sorted(missing_devcodes, key=LooseVersion)) + fail_msg += "Missing compute capabilities: %s. 
" % missing_devcodes_str + # If accept_ptx_as_devcode, this might not be a failure IF there is suitable PTX + # code to JIT compile from that supports the CCs in missing_devcodes + if accept_ptx_as_devcode: + # Check that for each item in missing_devcodes there is PTX code for lower or equal + # CUDA compute capability + comparisons = [] + for cc in missing_devcodes: + has_smaller_equal_ptx = any( + LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in found_ptx_ccs + ) + comparisons.append(has_smaller_equal_ptx) + # Only if that's the case for ALL cc's in missing_devcodes, this is a warning, not a + # failure + if all(comparisons): + files_missing_devcode_but_has_ptx.append(os.path.relpath(path, self.installdir)) + is_failure = False + else: + # If there are CCs for which there is no suiteable PTX that can be JIT-compiled + # from, this is considerd a failure + files_missing_devcode.append(os.path.relpath(path, self.installdir)) + if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in the + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) + fail_msg += ignore_msg + is_failure = False + else: + # Sanity error + files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) + is_failure = True + else: + # Device code was missing, and we're not accepting PTX code as alternative + # This is considered a failure files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in the @@ -3514,35 +3547,14 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Sanity error files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True - else: - # Device code was missing, and we're not accepting PTX code as 
alternative - # This is considered a failure - files_missing_devcode.append(os.path.relpath(path, self.installdir)) - if path in ignore_file_list or ignore_failures: - # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail - files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) - fail_msg += ignore_msg - is_failure = False + # Do reporting for the missing_devcodes case + # If considered a failure, append to fails so that a sanity error will be thrown + # Otherwise, log a warning + if is_failure: + fail_msgs.append(fail_msg) else: - # Sanity error - files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) - is_failure = True - else: - # Device code for all architectures requested in --cuda-compute-capabilities was found - msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " - "those in cuda_compute_capabilities") - self.log.debug(msg) - - # If there's no failure message, device code architectures match, so don't warn or fail - if fail_msg: - # If considered a failure, append to fails so that a sanity error will be thrown - # Otherwise, log a warning - if is_failure: - fail_msgs.append(fail_msg) - else: - self.log.warning(fail_msg) - + self.log.warning(fail_msg) + # Check whether there is ptx code for the highest CC in cfg_ccs # Make sure to use LooseVersion so that e.g. 
9.0 < 9.0a < 9.2 < 9.10 highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] From 494bd95f9ca0ca979c0c6994be534494da9a8e9e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 18:15:01 +0200 Subject: [PATCH 070/114] Don't keep accumulating the fail_msg after we have logged it --- easybuild/framework/easyblock.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index ca8a462eef..56ff39b178 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3462,11 +3462,10 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Set default failure status and empty message is_failure = False - fail_msg = "" if additional_devcodes: # Device code found for more architectures than requested in cuda-compute-capabilities - fail_msg += f"Mismatch between cuda_compute_capabilities and device code in {path}. " + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " # Count and log for summary report files_additional_devcode.append(os.path.relpath(path, self.installdir)) additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) @@ -3499,7 +3498,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if missing_devcodes: # One or more device code architectures requested in cuda-compute-capabilities was # not found in the binary - fail_msg += f"Mismatch between cuda_compute_capabilities and device code in {path}. " + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " # Count and log for summary report missing_devcodes_str = ', '.join(sorted(missing_devcodes, key=LooseVersion)) fail_msg += "Missing compute capabilities: %s. " % missing_devcodes_str From 8a9ea6d1db0e4c073280650bdf837e2467a75e75 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 21:04:05 +0200 Subject: [PATCH 071/114] added f to fstring... 
--- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 56ff39b178..2b940df547 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3454,7 +3454,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # In case we run with both, we'll just report that we're running with # --disable-cuda-sanity-check-error-on-fail if ignore_failures: - ignore_msg = "Failure for {path} will be ignored since we are running with " + ignore_msg = f"Failure for {path} will be ignored since we are running with " ignore_msg += "--disable-cuda-sanity-check-error-on-fail" else: ignore_msg = f"This failure will be ignored as '{path}' is listed in " From a2960c27bc251cc156580ede05f91c98b60c59ad Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 21:05:38 +0200 Subject: [PATCH 072/114] Replace Surplus with Additional in warning --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 2b940df547..9e729c81ea 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3469,7 +3469,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Count and log for summary report files_additional_devcode.append(os.path.relpath(path, self.installdir)) additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) - fail_msg += "Surplus compute capabilities: %s. " % additional_devcode_str + fail_msg += "Additional compute capabilities: %s. 
" % additional_devcode_str if strict_cc_check: # cuda-sanity-check-strict, so no additional compute capabilities allowed if path in ignore_file_list or ignore_failures: From a62cdaaa68709101b1e80de711dba5973b74ceda Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 15 Apr 2025 22:27:42 +0200 Subject: [PATCH 073/114] Updating toy builds. 10 test cases defined, of which 3 are implemented so far --- test/framework/toy_build.py | 388 +++++++++++++++++++++++------------- 1 file changed, 250 insertions(+), 138 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 3c30423919..fea713b354 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3127,6 +3127,17 @@ def test_toy_cuda_sanity_check(self): # Shebang for cuobjdump cuobjdump_txt_shebang = "#!/bin/bash\n" + # Section for cuobjdump printing output for sm_70 architecture + cuobjdump_txt_sm70 = '\n'.join([ + "echo 'Fatbin elf code:'", + "echo '================'", + "echo 'arch = sm_70'", + "echo 'code version = [1,7]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", + "echo ''\n" + ]) + # Section for cuobjdump printing output for sm_80 architecture cuobjdump_txt_sm80 = '\n'.join([ "echo 'Fatbin elf code:'", @@ -3135,7 +3146,7 @@ def test_toy_cuda_sanity_check(self): "echo 'code version = [1,7]'", "echo 'host = linux'", "echo 'compile_size = 64bit'", - "echo ''" + "echo ''\n" ]) # Section for cuobjdump printing output for sm_90 architecture @@ -3146,7 +3157,18 @@ def test_toy_cuda_sanity_check(self): "echo 'code version = [1,7]'", "echo 'host = linux'", "echo 'compile_size = 64bit'", - "echo ''" + "echo ''\n" ]) + + # Section for cuobjdump printing output for sm_90a architecture + cuobjdump_txt_sm90a = '\n'.join([ + "echo 'Fatbin elf code:'", + "echo '================'", + "echo 'arch = sm_90a'", + "echo 'code version = [1,7]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", + "echo ''\n" ]) # Section for cuobjdump printing 
output for sm_80 PTX code @@ -3157,7 +3179,8 @@ def test_toy_cuda_sanity_check(self): "echo 'code version = [8,1]'", "echo 'host = linux'", "echo 'compile_size = 64bit'", - "echo 'compressed'" + "echo 'compressed'", + "echo ''\n" ]) # Section for cuobjdump printing output for sm_90 PTX code @@ -3168,7 +3191,20 @@ def test_toy_cuda_sanity_check(self): "echo 'code version = [8,1]'", "echo 'host = linux'", "echo 'compile_size = 64bit'", - "echo 'compressed'" + "echo 'compressed'", + "echo ''\n" + ]) + + # Section for cuobjdump printing output for sm_90a PTX code + cuobjdump_txt_sm90a_ptx = '\n'.join([ + "echo 'Fatbin ptx code:'", + "echo '================'", + "echo 'arch = sm_90a'", + "echo 'code version = [8,1]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", + "echo 'compressed'", + "echo ''\n" ]) # Created regex for success and failures @@ -3187,8 +3223,8 @@ def test_toy_cuda_sanity_check(self): device_missing_90_code_regex_pattern = r"Missing compute capabilities: 9.0." device_missing_90_code_regex = re.compile(device_missing_90_code_regex_pattern, re.M) - device_surplus_90_code_regex_pattern = r"Surplus compute capabilities: 9.0." - device_surplus_90_code_regex = re.compile(device_surplus_90_code_regex_pattern, re.M) + device_additional_70_90_code_regex_pattern = r"Additional compute capabilities: 7.0, 9.0." 
+ device_additional_70_90_code_regex = re.compile(device_additional_70_90_code_regex_pattern, re.M) ptx_code_regex_success_pattern = r"DEBUG Output of 'cuobjdump' checked for '.*/bin/toy'; ptx code was " ptx_code_regex_success_pattern += r"present for \(at least\) the highest CUDA compute capability in " @@ -3218,155 +3254,231 @@ def test_toy_cuda_sanity_check(self): # Filepath to cuobjdump cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') - # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 ELF code - # This means the build should succeed, so we can run with raise_error=True and check the output - # for the expected debugging output - # We also check here for the warning that no PTX code for the highest compute capability (8.0) was found - write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) - adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0'] - # We expect this to pass, so no need to check errors - with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) - msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) - self.assertTrue(device_code_regex_success.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) - self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) - - # Test case 2: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 ELF code and 8.0 PTX code - # This means the build should succeed, so we can run with raise_error=True and check the output - # for the expected debugging output - # It also means we expect output confirming that PTX code was found for the highest compute capability + # Test case 1: test with default options, --cuda-compute-capabilities=8.0 and a 
binary that contains + # 7.0 and 9.0 device code and 8.0 PTX code + # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) + # as to not break backwards compatibility write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) write_file(cuobjdump_file, cuobjdump_txt_sm80_ptx, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm70, append=True) adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable args = ['--cuda-compute-capabilities=8.0'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) - msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) - self.assertTrue(device_code_regex_success.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) - self.assertTrue(ptx_code_regex_success.search(outtxt), msg) - - # Test case 3: --cuda-compute-capabilities=8.0 and mocking a binary that contains only 9.0 ELF code - # This means we expect the build to fail, so we'll do an assertErrorRegex to check that - # Subsequently, we rerun with raise_error=False so we can check the debugging output - # There, we expect EB to tell us that 8.0 code was expected, but only 9.0 code was found - write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) - adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0'] - # We expect this to fail - error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " - error_pattern += ".*/bin/toy. Surplus compute capabilities: 9.0. 
Missing compute capabilities: 8.0." - with self.mocked_stdout_stderr(): - self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, - extra_args=args, raise_error=True) - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) + self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) - self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) - # Test case 4: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains both 8.0 and 9.0 ELF code - # This means the build should succeed, so we can run with raise_error=True and check the output - # for the expected debugging output. 
- # We also check here for the warning that no PTX code for the highest compute capability (9.0) was found - write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) - write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) - adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0,9.0'] - # We expect this to succeed - with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) - msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) - self.assertTrue(device_code_regex_success.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) - self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) - - # Test case 5: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that only contains 8.0 ELF code - # This means we expect the build to fail, so we'll do an assertErrorRegex to check that - # Subsequently, we rerun with raise_error=False so we can check the debugging output for the debugging - # output which tells us it expected 8.0 and 9.0, but only found 9.0 ELF code - write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) - adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0,9.0'] - # We expect this to fail + # Test case 2: same as Test case 1, but add --cuda-sanity-check-error-on-fail + # This is expected to fail since there is missing device code for CC80 + args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-fail'] + # We expect this to fail, so first check error, then run again to check output error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and 
device code in " - error_pattern += ".*/bin/toy. Missing compute capabilities: 9.0." + error_pattern += ".*/bin/toy. Missing compute capabilities: 8.0." with self.mocked_stdout_stderr(): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) - msg = "Pattern %s not found in full build log: %s" % (device_missing_90_code_regex.pattern, outtxt) - self.assertTrue(device_missing_90_code_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) - self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) - - # Test case 6: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains 8.0 and 9.0 ELF code - # as well as 9.0 PTX code - # This means the build should succeed, so we can run with raise_error=True and check the output - # for the expected debugging output - # It also means we expect output confirming that PTX code was found for the highest compute capability - write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) - write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) - write_file(cuobjdump_file, cuobjdump_txt_sm90_ptx, append=True) - adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0,9.0'] - # We expect this to succeed + msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) + self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) + self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) + + # Test case 3: same as Test case 2, but add --cuda-sanity-check-accept-ptx-as-devcode + # 
This is expected to succeed, since now the PTX code for CC80 will be accepted as + # device code. Note that also PTX code for the highest requested compute architecture (also CC80) + # is present, so also this part of the sanity check passes + args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-fail', + '--cuda-sanity-check-accept-ptx-as-devcode'] + # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) - msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) - self.assertTrue(device_code_regex_success.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) - self.assertTrue(ptx_code_regex_success.search(outtxt), msg) - - # Test case 7: --cude-compute-capabilities=8.0 --strict-cuda-sanity-check and mocking a binary that contains - # 8.0 and 9.0 ELF code - # This means we expect the build to fail, so we'll do an assertErrorRegex to check that - # Subsequently, we rerun with raise_error=False so we can check the debugging output - # There, we expect EB to tell us that only 8.0 code was expected, but both 8.0 and 9.0 code was found - write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) - write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) - adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0', '--strict-cuda-sanity-check'] - # We expect this to fail - error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " - error_pattern += ".*/bin/toy. Surplus compute capabilities: 9.0." 
- with self.mocked_stdout_stderr(): - self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, - extra_args=args, raise_error=True) - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) - msg = "Pattern %s not found in full build log: %s" % (device_surplus_90_code_regex.pattern, outtxt) - self.assertTrue(device_surplus_90_code_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) - self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) - - # Test case 8: --cuda-compute-capabilities=8.0 and mocking a binary that contains 9.0 ELF code - # but passing that binary on the ignore_cuda_sanity_failures list - # This means we expect the build to succeed and we'll check the output for the expected debugging output - test_ec = os.path.join(self.test_prefix, 'test.eb') - test_ec_txt = read_file(toy_ec) - test_ec_txt += "\ncuda_sanity_ignore_files = ['bin/toy']" - write_file(test_ec, test_ec_txt) - write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) - adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=8.0'] - # We expect this to succeed - with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=test_ec, extra_args=args, raise_error=True) - msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_ignored_regex.pattern, outtxt) - self.assertTrue(device_missing_80_code_ignored_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) - self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) + self.assertTrue(device_additional_70_90_code_regex.search(outtxt), 
msg) + msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) + self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) + expected_summary = "Number of files missing one or more CUDA Compute Capabilities, but having suitable PTX " + expected_summary += "code that can be JIT compiled for the requested CUDA Compute Capabilities: 1" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) + + # Test case 4: same as Test case 2, but run with --cuda-compute-capabilities=9.0 + # This is expected to fail: device code is present, but PTX code for the highest CC (9.0) is missing + + # Test case 5: same as Test case 4, but add --cuda-sanity-check-accept-missing-ptx + # This is expected to succeed: device code is present, PTX code is missing, but that's accepted + + # Test case 6: same as Test case 5, but add --cuda-sanity-check-strict + # This is expected to fail: device code is present, PTX code is missing (but accepted due to option) + # but additional device code is present, which is not allowed by --cuda-sanity-check-strict + + # Test case 7: same as Test case 7, but add the failing file to the cuda_sanity_ignore_files + # This is expected to succeed: the individual file which _would_ cause the sanity check to fail is + # now on the ignore list + + # Test case 8: running with default options and a binary that does not contain ANY CUDA device code + # This is expected to succeed, since the default is --disable-cuda-sanity-check-error-on-fail + + # Test case 9: same as Test case 8, but add --cuda-sanity-check-error-on-fail + + # Test case 10: try with --cuda-sanity-check-error-on-fail --cuda-compute-capabilities=9.0,9.0a + # on a binary that contains 9.0 and 9.0a device code, and 9.0a ptx code. This tests the correct + # ordering (i.e. 9.0a > 9.0). 
It should pass, since device code is present for both CCs and PTX + # code is present for the highest CC. It also tests a case with multiple compute capabilities. + +# # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 ELF code +# # This means the build should succeed, so we can run with raise_error=True and check the output +# # for the expected debugging output +# # We also check here for the warning that no PTX code for the highest compute capability (8.0) was found +# write_file(cuobjdump_file, cuobjdump_txt_shebang), +# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) +# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable +# args = ['--cuda-compute-capabilities=8.0'] +# # We expect this to pass, so no need to check errors +# with self.mocked_stdout_stderr(): +# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) +# msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) +# self.assertTrue(device_code_regex_success.search(outtxt), msg) +# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) +# self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) +# +# # Test case 2: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 ELF code and 8.0 PTX code +# # This means the build should succeed, so we can run with raise_error=True and check the output +# # for the expected debugging output +# # It also means we expect output confirming that PTX code was found for the highest compute capability +# write_file(cuobjdump_file, cuobjdump_txt_shebang), +# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) +# write_file(cuobjdump_file, cuobjdump_txt_sm80_ptx, append=True) +# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable +# args = ['--cuda-compute-capabilities=8.0'] +# # We 
expect this to pass, so no need to check errors +# with self.mocked_stdout_stderr(): +# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) +# msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) +# self.assertTrue(device_code_regex_success.search(outtxt), msg) +# msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) +# self.assertTrue(ptx_code_regex_success.search(outtxt), msg) +# +# # Test case 3: --cuda-compute-capabilities=8.0 and mocking a binary that contains only 9.0 ELF code +# # This means we expect the build to fail, so we'll do an assertErrorRegex to check that +# # Subsequently, we rerun with raise_error=False so we can check the debugging output +# # There, we expect EB to tell us that 8.0 code was expected, but only 9.0 code was found +# write_file(cuobjdump_file, cuobjdump_txt_shebang), +# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) +# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable +# args = ['--cuda-compute-capabilities=8.0'] +# # We expect this to fail +# error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " +# error_pattern += ".*/bin/toy. Surplus compute capabilities: 9.0. Missing compute capabilities: 8.0." 
+# with self.mocked_stdout_stderr(): +# self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, +# extra_args=args, raise_error=True) +# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) +# msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) +# self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) +# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) +# self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) +# +# # Test case 4: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains both 8.0 and 9.0 ELF code +# # This means the build should succeed, so we can run with raise_error=True and check the output +# # for the expected debugging output. +# # We also check here for the warning that no PTX code for the highest compute capability (9.0) was found +# write_file(cuobjdump_file, cuobjdump_txt_shebang), +# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) +# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) +# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable +# args = ['--cuda-compute-capabilities=8.0,9.0'] +# # We expect this to succeed +# with self.mocked_stdout_stderr(): +# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) +# msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) +# self.assertTrue(device_code_regex_success.search(outtxt), msg) +# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) +# self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) +# +# # Test case 5: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that only contains 8.0 ELF code +# # This means we expect the build to fail, so we'll do an assertErrorRegex to check that +# # 
Subsequently, we rerun with raise_error=False so we can check the debugging output for the debugging +# # output which tells us it expected 8.0 and 9.0, but only found 9.0 ELF code +# write_file(cuobjdump_file, cuobjdump_txt_shebang), +# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) +# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable +# args = ['--cuda-compute-capabilities=8.0,9.0'] +# # We expect this to fail +# error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " +# error_pattern += ".*/bin/toy. Missing compute capabilities: 9.0." +# with self.mocked_stdout_stderr(): +# self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, +# extra_args=args, raise_error=True) +# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) +# msg = "Pattern %s not found in full build log: %s" % (device_missing_90_code_regex.pattern, outtxt) +# self.assertTrue(device_missing_90_code_regex.search(outtxt), msg) +# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) +# self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) +# +# # Test case 6: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains 8.0 and 9.0 ELF code +# # as well as 9.0 PTX code +# # This means the build should succeed, so we can run with raise_error=True and check the output +# # for the expected debugging output +# # It also means we expect output confirming that PTX code was found for the highest compute capability +# write_file(cuobjdump_file, cuobjdump_txt_shebang), +# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) +# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) +# write_file(cuobjdump_file, cuobjdump_txt_sm90_ptx, append=True) +# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is 
executable +# args = ['--cuda-compute-capabilities=8.0,9.0'] +# # We expect this to succeed +# with self.mocked_stdout_stderr(): +# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) +# msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) +# self.assertTrue(device_code_regex_success.search(outtxt), msg) +# msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) +# self.assertTrue(ptx_code_regex_success.search(outtxt), msg) +# +# # Test case 7: --cude-compute-capabilities=8.0 --strict-cuda-sanity-check and mocking a binary that contains +# # 8.0 and 9.0 ELF code +# # This means we expect the build to fail, so we'll do an assertErrorRegex to check that +# # Subsequently, we rerun with raise_error=False so we can check the debugging output +# # There, we expect EB to tell us that only 8.0 code was expected, but both 8.0 and 9.0 code was found +# write_file(cuobjdump_file, cuobjdump_txt_shebang), +# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) +# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) +# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable +# args = ['--cuda-compute-capabilities=8.0', '--strict-cuda-sanity-check'] +# # We expect this to fail +# error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " +# error_pattern += ".*/bin/toy. Surplus compute capabilities: 9.0." 
+# with self.mocked_stdout_stderr(): +# self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, +# extra_args=args, raise_error=True) +# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) +# msg = "Pattern %s not found in full build log: %s" % (device_surplus_90_code_regex.pattern, outtxt) +# self.assertTrue(device_surplus_90_code_regex.search(outtxt), msg) +# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) +# self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) +# +# # Test case 8: --cuda-compute-capabilities=8.0 and mocking a binary that contains 9.0 ELF code +# # but passing that binary on the ignore_cuda_sanity_failures list +# # This means we expect the build to succeed and we'll check the output for the expected debugging output +# test_ec = os.path.join(self.test_prefix, 'test.eb') +# test_ec_txt = read_file(toy_ec) +# test_ec_txt += "\ncuda_sanity_ignore_files = ['bin/toy']" +# write_file(test_ec, test_ec_txt) +# write_file(cuobjdump_file, cuobjdump_txt_shebang), +# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) +# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable +# args = ['--cuda-compute-capabilities=8.0'] +# # We expect this to succeed +# with self.mocked_stdout_stderr(): +# outtxt = self._test_toy_build(ec_file=test_ec, extra_args=args, raise_error=True) +# msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_ignored_regex.pattern, outtxt) +# self.assertTrue(device_missing_80_code_ignored_regex.search(outtxt), msg) +# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) +# self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) # Restore original environment modify_env(os.environ, start_env, verbose=False) From 11cf157a3184663ff47e161e172f077972b76935 Mon Sep 17 00:00:00 2001 
From: Caspar van Leeuwen Date: Wed, 16 Apr 2025 13:26:18 +0200 Subject: [PATCH 074/114] Architectures can be 9.0a or 10.0a now, i.e. sm_90a is a valid optimization target. See https://docs.nvidia.com/cuda/cuda-compiler-driver-nvcc/#gpu-feature-list --- easybuild/tools/options.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index dc5c39a950..6be7ed3029 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -978,7 +978,7 @@ def validate(self): # values passed to --cuda-compute-capabilities must be of form X.Y (with both X and Y integers), # see https://developer.nvidia.com/cuda-gpus if self.options.cuda_compute_capabilities: - cuda_cc_regex = re.compile(r'^[0-9]+\.[0-9]+$') + cuda_cc_regex = re.compile(r'^[0-9]+\.[0-9]+a?$') faulty_cuda_ccs = [x for x in self.options.cuda_compute_capabilities if not cuda_cc_regex.match(x)] if faulty_cuda_ccs: error_msg = "Incorrect values in --cuda-compute-capabilities (expected pattern: '%s'): %s" From 8d9720ee4f5db5c13485fd9792819e194e5fce72 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Apr 2025 13:27:01 +0200 Subject: [PATCH 075/114] Some missing f-strings and small refinements in the summary reporting to make things more clear --- easybuild/framework/easyblock.py | 34 ++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 9e729c81ea..ecf5f2ae80 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3589,12 +3589,12 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_files = f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities: " summary_msg_files += f"{files_missing_devcode}\n" summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files: " - summary_msg_files += 
"{files_missing_devcode_ignored})\n" + summary_msg_files += f"{files_missing_devcode_ignored})\n" if accept_ptx_as_devcode: summary_msg_files += f"{len(files_missing_devcode_but_has_ptx)} files missing one or more CUDA Compute " summary_msg_files += "Capabilities, but has suitable PTX code that can be JIT compiled for the requested " summary_msg_files += f"CUDA Compute Capabilities: {files_missing_devcode_but_has_ptx}\n" - summary_msg_files += "{len(files_additional_devcode)} files with device code for more CUDA Compute " + summary_msg_files += f"{len(files_additional_devcode)} files with device code for more CUDA Compute " summary_msg_files += f"Capabilities than requested: {files_additional_devcode}\n" summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files: " summary_msg_files += f"{files_additional_devcode_ignored})\n" @@ -3607,30 +3607,44 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # Short summary summary_msg = "CUDA sanity check summary report:\n" summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" - summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)} " - summary_msg += f"(ignored: {len(files_missing_devcode_ignored)}, fails: {len(files_missing_devcode_fails)})\n" + if len(files_missing_devcode) == 0: + summary_msg += "Number of files missing one or more CUDA Compute Capabilities: 0\n" + elif ignore_failures: + summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)} " + summary_msg += f"(not running with --cuda-sanity-check-fail-on-error, so not considered failures)\n" + else: + summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)} " + summary_msg += f"(ignored: {len(files_missing_devcode_ignored)}, fails: {len(files_missing_devcode_fails)})\n" if accept_ptx_as_devcode: summary_msg += "Number of files missing one 
or more CUDA Compute Capabilities, but having suitable " summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " summary_msg += f"{len(files_missing_devcode_but_has_ptx)}\n" - summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " - if ignore_failures: + if len(files_additional_devcode) == 0: + summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: 0\n" + elif ignore_failures: + summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " summary_msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-fail-on-error, " summary_msg += "so not considered failures)\n" elif strict_cc_check: + summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " summary_msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_devcode_ignored)}, " summary_msg += f"fails: {len(files_additional_devcode_fails)})\n" else: + summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " summary_msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-strict, so not " summary_msg += "considered failures)\n" - summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " - if ignore_failures: - summary_msg += f"{len(files_missing_ptx)} (not running with --cuda-sanity-check-fail-on-error so not " + if len(files_missing_ptx) == 0: + summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: 0\n" + elif ignore_failures: + summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " + summary_msg += f"{len(files_missing_ptx)} (not running with --cuda-sanity-check-fail-on-error, so not " summary_msg += "considered failures)\n" elif accept_missing_ptx: - 
summary_msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx so not " + summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " + summary_msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx, so not " summary_msg += "considered failures)\n" else: + summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " summary_msg += f"{len(files_missing_ptx)} (ignored: {len(files_missing_ptx_ignored)}, fails: " summary_msg += f"{len(files_missing_ptx_fails)})\n" if not build_option('debug'): From b42622699deb76283a070ee93dbc6f181d51144a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Apr 2025 13:28:44 +0200 Subject: [PATCH 076/114] Removed old tests and replaced them with new ones. New tests check all newly added options, as well as running the sanity check when multiple CUDA compute capabilities are defined, running with no CUDA compute capabilities defined, and running on a binary without cuda code --- test/framework/toy_build.py | 357 ++++++++++++++++++++---------------- 1 file changed, 201 insertions(+), 156 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index fea713b354..59a43a4ec5 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3207,6 +3207,9 @@ def test_toy_cuda_sanity_check(self): "echo ''\n" ]) + # Section for cuobjdump printing output that toy doesn't contain device code + cuobjdump_txt_no_cuda = "echo 'cuobjdump info : File '/mock/path/to/toy' does not contain device code'" + # Created regex for success and failures device_code_regex_success_pattern = r"DEBUG Output of 'cuobjdump' checked for '.*/bin/toy'; device code " device_code_regex_success_pattern += "architectures match those in cuda_compute_capabilities" @@ -3223,6 +3226,8 @@ def test_toy_cuda_sanity_check(self): device_missing_90_code_regex_pattern = r"Missing compute 
capabilities: 9.0." device_missing_90_code_regex = re.compile(device_missing_90_code_regex_pattern, re.M) + device_additional_70_code_regex_pattern = r"Additional compute capabilities: 7.0." + device_additional_70_code_regex = re.compile(device_additional_70_code_regex_pattern, re.M) device_additional_70_90_code_regex_pattern = r"Additional compute capabilities: 7.0, 9.0." device_additional_70_90_code_regex = re.compile(device_additional_70_90_code_regex_pattern, re.M) @@ -3271,6 +3276,14 @@ def test_toy_cuda_sanity_check(self): self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 1 " + expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) # Test case 2: same as Test case 1, but add --cuda-sanity-check-error-on-fail # This is expected to fail since there is missing device code for CC80 @@ -3286,6 +3299,14 @@ def test_toy_cuda_sanity_check(self): self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) + expected_summary = r"^Number of 
files missing one or more CUDA Compute Capabilities: 1 " + expected_summary += "\(ignored: 0, fails: 1\)$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) # Test case 3: same as Test case 2, but add --cuda-sanity-check-accept-ptx-as-devcode # This is expected to succeed, since now the PTX code for CC80 will be accepted as @@ -3305,180 +3326,204 @@ def test_toy_cuda_sanity_check(self): expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files missing one or more CUDA Compute Capabilities, but having suitable PTX " + expected_summary += r"code that can be JIT compiled for the requested CUDA Compute Capabilities: 1$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) # Test case 4: same as Test case 2, but run with 
--cuda-compute-capabilities=9.0 # This is expected to fail: device code is present, but PTX code for the highest CC (9.0) is missing + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail'] + # We expect this to fail, so first check error, then run again to check output + error_pattern = "Sanity check failed: Configured highest compute capability was '9\.0', " + error_pattern += "but no PTX code for this compute capability was found in '.*/bin/toy' " + error_pattern += "\(PTX architectures supported in that file: \['8\.0'\]\)" + with self.mocked_stdout_stderr(): + self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, + extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) + self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " + expected_summary += "\(ignored: 0, fails: 1\)" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) # Test case 5: same as Test case 4, but add --cuda-sanity-check-accept-missing-ptx # This is expected to succeed: device code is present, PTX code is missing, but that's accepted + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + '--cuda-sanity-check-accept-missing-ptx'] + # We expect 
this to pass, so no need to check errors + warning_pattern = "Configured highest compute capability was '9\.0', " + warning_pattern += "but no PTX code for this compute capability was found in '.*/bin/toy' " + warning_pattern += "\(PTX architectures supported in that file: \['8\.0'\]\)" + warning_pattern_regex = re.compile(warning_pattern, re.M) + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) + self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (warning_pattern, outtxt) + self.assertTrue(warning_pattern_regex.search(outtxt), msg) + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " + expected_summary += "\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) # Test case 6: same as Test case 5, but add --cuda-sanity-check-strict # This is expected to fail: device code is present, PTX code is missing (but accepted due to option) # but additional device code is present, which is not allowed by --cuda-sanity-check-strict + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] + # We expect this to fail, so first check error, then 
run again to check output + error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " + error_pattern += ".*/bin/toy\. Additional compute capabilities: 7\.0" + with self.mocked_stdout_stderr(): + self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, + extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) + self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary += "\(ignored: 0, fails: 1\)$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " + expected_summary += "\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) - # Test case 7: same as Test case 7, but add the failing file to the cuda_sanity_ignore_files + # Test case 7: same as Test case 6, but add the failing file to the cuda_sanity_ignore_files # This is expected to succeed: the individual file which _would_ cause the sanity check to fail is # now on the ignore list + topdir = os.path.dirname(os.path.abspath(__file__)) + toy_ec_file = os.path.join(topdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb') - # Test case 8: running with default options and a binary that does not contain ANY CUDA device code - # This is expected to succeed, since the default is --disable-cuda-sanity-check-error-on-fail + 
toy_whitelist_ec = os.path.join(self.test_prefix, 'toy-0.0-cuda-whitelist.eb') + write_file(toy_whitelist_ec, read_file(toy_ec_file) + '\ncuda_sanity_ignore_files = ["bin/toy"]') - # Test case 9: same as Test case 8, but add --cuda-sanity-check-error-on-fail + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] + # We expect this to succeed, so check output for expected patterns + error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " + error_pattern += ".*/bin/toy\. Additional compute capabilities: 7\.0" + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_whitelist_ec, extra_args=args, raise_error=True, verify=False) + msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) + self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary += "\(ignored: 1, fails: 0\)$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " + expected_summary += "\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) - # Test case 10: try with --cuda-sanity-check-error-on-fail --cuda-compute-capabilities=9.0,9.0a + # Test case 8: try with --cuda-sanity-check-error-on-fail --cuda-compute-capabilities=9.0,9.0a + # and --cuda-sanity-check-strict # on a binary that contains 9.0 and 9.0a device code, and 9.0a ptx code. 
This tests the correct # ordering (i.e. 9.0a > 9.0). It should pass, since device code is present for both CCs and PTX - # code is present for the highest CC. It also tests a case with multiple compute capabilities. - -# # Test case 1: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 ELF code -# # This means the build should succeed, so we can run with raise_error=True and check the output -# # for the expected debugging output -# # We also check here for the warning that no PTX code for the highest compute capability (8.0) was found -# write_file(cuobjdump_file, cuobjdump_txt_shebang), -# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) -# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable -# args = ['--cuda-compute-capabilities=8.0'] -# # We expect this to pass, so no need to check errors -# with self.mocked_stdout_stderr(): -# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) -# msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) -# self.assertTrue(device_code_regex_success.search(outtxt), msg) -# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) -# self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) -# -# # Test case 2: --cuda-compute-capabilities=8.0 and mocking a binary that contains 8.0 ELF code and 8.0 PTX code -# # This means the build should succeed, so we can run with raise_error=True and check the output -# # for the expected debugging output -# # It also means we expect output confirming that PTX code was found for the highest compute capability -# write_file(cuobjdump_file, cuobjdump_txt_shebang), -# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) -# write_file(cuobjdump_file, cuobjdump_txt_sm80_ptx, append=True) -# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable -# 
args = ['--cuda-compute-capabilities=8.0'] -# # We expect this to pass, so no need to check errors -# with self.mocked_stdout_stderr(): -# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) -# msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) -# self.assertTrue(device_code_regex_success.search(outtxt), msg) -# msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) -# self.assertTrue(ptx_code_regex_success.search(outtxt), msg) -# -# # Test case 3: --cuda-compute-capabilities=8.0 and mocking a binary that contains only 9.0 ELF code -# # This means we expect the build to fail, so we'll do an assertErrorRegex to check that -# # Subsequently, we rerun with raise_error=False so we can check the debugging output -# # There, we expect EB to tell us that 8.0 code was expected, but only 9.0 code was found -# write_file(cuobjdump_file, cuobjdump_txt_shebang), -# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) -# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable -# args = ['--cuda-compute-capabilities=8.0'] -# # We expect this to fail -# error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " -# error_pattern += ".*/bin/toy. Surplus compute capabilities: 9.0. Missing compute capabilities: 8.0." 
-# with self.mocked_stdout_stderr(): -# self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, -# extra_args=args, raise_error=True) -# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) -# msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) -# self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) -# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) -# self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) -# -# # Test case 4: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains both 8.0 and 9.0 ELF code -# # This means the build should succeed, so we can run with raise_error=True and check the output -# # for the expected debugging output. -# # We also check here for the warning that no PTX code for the highest compute capability (9.0) was found -# write_file(cuobjdump_file, cuobjdump_txt_shebang), -# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) -# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) -# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable -# args = ['--cuda-compute-capabilities=8.0,9.0'] -# # We expect this to succeed -# with self.mocked_stdout_stderr(): -# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) -# msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) -# self.assertTrue(device_code_regex_success.search(outtxt), msg) -# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) -# self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) -# -# # Test case 5: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that only contains 8.0 ELF code -# # This means we expect the build to fail, so we'll do an assertErrorRegex to check that -# # 
Subsequently, we rerun with raise_error=False so we can check the debugging output for the debugging -# # output which tells us it expected 8.0 and 9.0, but only found 9.0 ELF code -# write_file(cuobjdump_file, cuobjdump_txt_shebang), -# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) -# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable -# args = ['--cuda-compute-capabilities=8.0,9.0'] -# # We expect this to fail -# error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " -# error_pattern += ".*/bin/toy. Missing compute capabilities: 9.0." -# with self.mocked_stdout_stderr(): -# self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, -# extra_args=args, raise_error=True) -# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) -# msg = "Pattern %s not found in full build log: %s" % (device_missing_90_code_regex.pattern, outtxt) -# self.assertTrue(device_missing_90_code_regex.search(outtxt), msg) -# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_90_regex.pattern, outtxt) -# self.assertTrue(ptx_code_missing_90_regex.search(outtxt), msg) -# -# # Test case 6: --cuda-compute-capabilities=8.0,9.0 and mocking a binary that contains 8.0 and 9.0 ELF code -# # as well as 9.0 PTX code -# # This means the build should succeed, so we can run with raise_error=True and check the output -# # for the expected debugging output -# # It also means we expect output confirming that PTX code was found for the highest compute capability -# write_file(cuobjdump_file, cuobjdump_txt_shebang), -# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) -# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) -# write_file(cuobjdump_file, cuobjdump_txt_sm90_ptx, append=True) -# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is 
executable -# args = ['--cuda-compute-capabilities=8.0,9.0'] -# # We expect this to succeed -# with self.mocked_stdout_stderr(): -# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) -# msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) -# self.assertTrue(device_code_regex_success.search(outtxt), msg) -# msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) -# self.assertTrue(ptx_code_regex_success.search(outtxt), msg) -# -# # Test case 7: --cude-compute-capabilities=8.0 --strict-cuda-sanity-check and mocking a binary that contains -# # 8.0 and 9.0 ELF code -# # This means we expect the build to fail, so we'll do an assertErrorRegex to check that -# # Subsequently, we rerun with raise_error=False so we can check the debugging output -# # There, we expect EB to tell us that only 8.0 code was expected, but both 8.0 and 9.0 code was found -# write_file(cuobjdump_file, cuobjdump_txt_shebang), -# write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) -# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) -# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable -# args = ['--cuda-compute-capabilities=8.0', '--strict-cuda-sanity-check'] -# # We expect this to fail -# error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " -# error_pattern += ".*/bin/toy. Surplus compute capabilities: 9.0." 
-# with self.mocked_stdout_stderr(): -# self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, -# extra_args=args, raise_error=True) -# outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) -# msg = "Pattern %s not found in full build log: %s" % (device_surplus_90_code_regex.pattern, outtxt) -# self.assertTrue(device_surplus_90_code_regex.search(outtxt), msg) -# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) -# self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) -# -# # Test case 8: --cuda-compute-capabilities=8.0 and mocking a binary that contains 9.0 ELF code -# # but passing that binary on the ignore_cuda_sanity_failures list -# # This means we expect the build to succeed and we'll check the output for the expected debugging output -# test_ec = os.path.join(self.test_prefix, 'test.eb') -# test_ec_txt = read_file(toy_ec) -# test_ec_txt += "\ncuda_sanity_ignore_files = ['bin/toy']" -# write_file(test_ec, test_ec_txt) -# write_file(cuobjdump_file, cuobjdump_txt_shebang), -# write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) -# adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable -# args = ['--cuda-compute-capabilities=8.0'] -# # We expect this to succeed -# with self.mocked_stdout_stderr(): -# outtxt = self._test_toy_build(ec_file=test_ec, extra_args=args, raise_error=True) -# msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_ignored_regex.pattern, outtxt) -# self.assertTrue(device_missing_80_code_ignored_regex.search(outtxt), msg) -# msg = "Pattern %s not found in full build log: %s" % (ptx_code_missing_80_regex.pattern, outtxt) -# self.assertTrue(ptx_code_missing_80_regex.search(outtxt), msg) + # code is present for the highest CC, and there is no additiona device code present + # This also tests a case with multiple compute capabilities. 
+ write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm90a, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm90a_ptx, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=9.0,9.0a', '--cuda-sanity-check-error-on-fail', + '--cuda-sanity-check-strict'] + # We expect this to pass, so no need to check errors + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) + self.assertTrue(ptx_code_regex_success.search(outtxt), msg) + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0$\n" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) + expected_result_pattern = "INFO Sanity check for toy successful" + expected_result = re.compile(expected_result_pattern, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) + self.assertTrue(expected_result.search(outtxt), msg) + + # Test case 9: same as 8, but no --cuda-compute-capabilities are defined + # We expect this to lead to a skip of the CUDA sanity check, and a success for the overall sanity check + args = ['--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-strict'] + # We expect this to pass, so no need to check errors + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, 
extra_args=args, raise_error=True) + cuda_sanity_skipped = r"INFO Skipping CUDA sanity check, as no CUDA compute capabilities were configured" + cuda_sanity_skipped_regex = re.compile(cuda_sanity_skipped, re.M) + msg = "Pattern %s not found in full build log: %s" % (cuda_sanity_skipped, outtxt) + self.assertTrue(cuda_sanity_skipped_regex.search(outtxt), msg) + expected_result_pattern = "INFO Sanity check for toy successful" + expected_result = re.compile(expected_result_pattern, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) + self.assertTrue(expected_result.search(outtxt), msg) + + # Test case 10: running with default options and a binary that does not contain ANY CUDA device code + # This is expected to succeed, since the default is --disable-cuda-sanity-check-error-on-fail + write_file(cuobjdump_file, cuobjdump_txt_shebang) + write_file(cuobjdump_file, cuobjdump_txt_no_cuda, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=9.0'] + # We expect this to pass, so no need to check errors + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + no_cuda_pattern = r".*/bin/toy does not appear to be a CUDA executable \(no CUDA device code found\), " + no_cuda_pattern += r"so skipping CUDA sanity check" + no_cuda_regex = re.compile(no_cuda_pattern, re.M) + msg = "Pattern %s not found in full build log: %s" % (no_cuda_pattern, outtxt) + self.assertTrue(no_cuda_regex.search(outtxt), msg) + expected_summary = r"^Number of CUDA files checked: 0$\n" + expected_summary += r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0$" + 
expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) + expected_result_pattern = "INFO Sanity check for toy successful" + expected_result = re.compile(expected_result_pattern, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) + self.assertTrue(expected_result.search(outtxt), msg) + + # Test case 11: same as Test case 10, but add --cuda-sanity-check-error-on-fail + # This should pass: if it's not a CUDA binary, it shouldn't fail the CUDA sanity check + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail'] + # We expect this to pass, so no need to check errors + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + no_cuda_pattern = r".*/bin/toy does not appear to be a CUDA executable \(no CUDA device code found\), " + no_cuda_pattern += r"so skipping CUDA sanity check" + no_cuda_regex = re.compile(no_cuda_pattern, re.M) + msg = "Pattern %s not found in full build log: %s" % (no_cuda_pattern, outtxt) + self.assertTrue(no_cuda_regex.search(outtxt), msg) + expected_summary = r"^Number of CUDA files checked: 0$\n" + expected_summary += r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0$" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) + expected_result_pattern = "INFO Sanity check for toy successful" + expected_result = re.compile(expected_result_pattern, re.M) + msg = "Pattern %s not found in full 
build log: %s" % (expected_result, outtxt) + self.assertTrue(expected_result.search(outtxt), msg) # Restore original environment modify_env(os.environ, start_env, verbose=False) From e53143e7c7af96605a0dd4e6df97232301d9ab7e Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Apr 2025 15:54:08 +0200 Subject: [PATCH 077/114] Fix hound issues --- easybuild/framework/easyblock.py | 20 +++---- test/framework/systemtools.py | 30 ++++++----- test/framework/toy_build.py | 91 +++++++++----------------------- 3 files changed, 55 insertions(+), 86 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index ecf5f2ae80..29450afa15 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3441,7 +3441,6 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): additional_devcodes = list(set(found_dev_code_ccs) - set(cfg_ccs)) missing_devcodes = list(set(cfg_ccs) - set(found_dev_code_ccs)) - if not missing_devcodes and not additional_devcodes: # Device code for all architectures requested in --cuda-compute-capabilities was found msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " @@ -3523,8 +3522,9 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): # from, this is considerd a failure files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: - # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + # No error, because either path is on the cuda_sanity_ignore_files list in + # the easyconfig, or we are running with + # --disable-cuda-sanity-check-error-on-fail files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg is_failure = False @@ -3553,7 +3553,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): 
fail_msgs.append(fail_msg) else: self.log.warning(fail_msg) - + # Check whether there is ptx code for the highest CC in cfg_ccs # Make sure to use LooseVersion so that e.g. 9.0 < 9.0a < 9.2 < 9.10 highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] @@ -3598,8 +3598,8 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): summary_msg_files += f"Capabilities than requested: {files_additional_devcode}\n" summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files: " summary_msg_files += f"{files_additional_devcode_ignored})\n" - summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute " - summary_msg_files += f"Capability: {files_missing_ptx}\n" + summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute" + summary_msg_files += f" Capability: {files_missing_ptx}\n" summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files: " summary_msg_files += f"{files_missing_ptx_ignored})" self.log.info(summary_msg_files) @@ -3610,11 +3610,13 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): if len(files_missing_devcode) == 0: summary_msg += "Number of files missing one or more CUDA Compute Capabilities: 0\n" elif ignore_failures: - summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)} " + summary_msg += "Number of files missing one or more CUDA Compute Capabilities: " + summary_msg += f"{len(files_missing_devcode)} " summary_msg += f"(not running with --cuda-sanity-check-fail-on-error, so not considered failures)\n" else: - summary_msg += f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)} " - summary_msg += f"(ignored: {len(files_missing_devcode_ignored)}, fails: {len(files_missing_devcode_fails)})\n" + summary_msg += "Number of files missing one or more CUDA Compute 
Capabilities: " + summary_msg += f"{len(files_missing_devcode)} (ignored: {len(files_missing_devcode_ignored)}, " + summary_msg += f"fails: {len(files_missing_devcode_fails)})\n" if accept_ptx_as_devcode: summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but having suitable " summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " diff --git a/test/framework/systemtools.py b/test/framework/systemtools.py index f86d584fbe..37878666d5 100644 --- a/test/framework/systemtools.py +++ b/test/framework/systemtools.py @@ -303,14 +303,14 @@ DirectMap1G: 65011712 kB """ -FILE_BIN = """ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter -/lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, not stripped, too many notes (256)""" +FILE_BIN = "ELF 64-bit LSB executable, x86-64, version 1 (SYSV), dynamically linked, interpreter " +FILE_BIN += "/lib64/ld-linux-x86-64.so.2, for GNU/Linux 3.2.0, not stripped, too many notes (256)" -FILE_SHAREDLIB = """ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, -BuildID[sha1]=5535086d3380568f8eaecfa2e73f456f1edd94ec, stripped""" +FILE_SHAREDLIB = "ELF 64-bit LSB shared object, x86-64, version 1 (SYSV), dynamically linked, " +FILE_SHAREDLIB += "BuildID[sha1]=5535086d3380568f8eaecfa2e73f456f1edd94ec, stripped" CUOBJDUMP_FAT = """ -Fatbin elf code: +Fatbin elf code: ================ arch = sm_50 code version = [1,7] @@ -502,7 +502,8 @@ def mocked_run_shell_cmd(cmd, **kwargs): "cuobjdump mock_cuda_staticlib": CUOBJDUMP_DEVICE_CODE_ONLY, } known_fail_cmds = { - "cuobjdump mock_non_cuda_sharedlib": ("cuobjdump info : File '/path/to/mock.so' does not contain device code", 255), + "cuobjdump mock_non_cuda_sharedlib": ("cuobjdump info : File '/path/to/mock.so' does not contain device code", + 255), "cuobjdump mock_non_cuda_sharedlib_unexpected": ("cuobjdump info : Some unexpected output", 255), } if cmd in known_cmds: @@ -1340,7 
+1341,8 @@ def test_get_cuda_object_dump_raw(self): # Test case 4: call on a file that is an shared lib, but not a CUDA shared lib # Check debug message in this case - debug_regex = re.compile(r"DEBUG .* does not appear to be a CUDA binary: cuobjdump failed to find device code in this file", re.M) + debug_regex = re.compile(r"DEBUG .* does not appear to be a CUDA binary: cuobjdump failed to find device code " + "in this file", re.M) old_log_level = st._log.getEffectiveLevel() st._log.setLevel(logging.DEBUG) with self.log_to_testlogfile(): @@ -1353,8 +1355,9 @@ def test_get_cuda_object_dump_raw(self): # Test case 5: call on a file where cuobjdump produces really unexpected output error_pattern = r"Dumping CUDA binary file information for .* via .* failed!" - self.assertErrorRegex(EasyBuildError, error_pattern, get_cuda_object_dump_raw, path='mock_non_cuda_sharedlib_unexpected') - + self.assertErrorRegex(EasyBuildError, error_pattern, get_cuda_object_dump_raw, + path='mock_non_cuda_sharedlib_unexpected') + # Test case 6: call on CUDA shared lib, which only contains PTX code self.assertEqual(get_cuda_object_dump_raw('mock_cuda_sharedlib'), CUOBJDUMP_PTX_ONLY) @@ -1396,7 +1399,8 @@ def test_get_cuda_architectures(self): self.assertIsNone(get_cuda_architectures('mock_non_cuda_sharedlib', 'ptx')) # Test case 4: call on CUDA shared lib, which only contains PTX code - warning_regex_elf = re.compile(r"WARNING Failed to find Fatbin elf code section\(s\) in cuobjdump output for mock_cuda_sharedlib", re.M) + warning_regex_elf = re.compile(r"WARNING Failed to find Fatbin elf code section\(s\) in cuobjdump output for " + "mock_cuda_sharedlib", re.M) old_log_level = st._log.getEffectiveLevel() st._log.setLevel(logging.DEBUG) with self.log_to_testlogfile(): @@ -1410,7 +1414,8 @@ def test_get_cuda_architectures(self): self.assertEqual(res_ptx, ['9.0', '9.0a']) # Test case 5: call on CUDA static lib, which only contains device code - warning_regex_ptx = re.compile(r"WARNING Failed 
to find Fatbin ptx code section\(s\) in cuobjdump output for mock_cuda_staticlib", re.M) + warning_regex_ptx = re.compile(r"WARNING Failed to find Fatbin ptx code section\(s\) in cuobjdump output for " + "mock_cuda_staticlib", re.M) old_log_level = st._log.getEffectiveLevel() st._log.setLevel(logging.DEBUG) with self.log_to_testlogfile(): @@ -1424,7 +1429,8 @@ def test_get_cuda_architectures(self): self.assertEqual(res_elf, ['9.0', '9.0a']) # Test case 6: call on CUDA shared lib which lacks an arch = sm_XX entry (should never happen) - warning_regex_elf = re.compile(r"WARNING Found Fatbin elf code section\(s\) in cuobjdump output for mock_invalid_cuda_sharedlib, but failed to extract CUDA architecture", re.M) + warning_regex_elf = re.compile(r"WARNING Found Fatbin elf code section\(s\) in cuobjdump output for " + "mock_invalid_cuda_sharedlib, but failed to extract CUDA architecture", re.M) old_log_level = st._log.getEffectiveLevel() st._log.setLevel(logging.DEBUG) with self.log_to_testlogfile(): diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 59a43a4ec5..e7424b3e0d 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3138,17 +3138,6 @@ def test_toy_cuda_sanity_check(self): "echo ''\n" ]) - # Section for cuobjdump printing output for sm_80 architecture - cuobjdump_txt_sm80 = '\n'.join([ - "echo 'Fatbin elf code:'", - "echo '================'", - "echo 'arch = sm_80'", - "echo 'code version = [1,7]'", - "echo 'host = linux'", - "echo 'compile_size = 64bit'", - "echo ''\n" - ]) - # Section for cuobjdump printing output for sm_90 architecture cuobjdump_txt_sm90 = '\n'.join([ "echo 'Fatbin elf code:'", @@ -3183,18 +3172,6 @@ def test_toy_cuda_sanity_check(self): "echo ''\n" ]) - # Section for cuobjdump printing output for sm_90 PTX code - cuobjdump_txt_sm90_ptx = '\n'.join([ - "echo 'Fatbin ptx code:'", - "echo '================'", - "echo 'arch = sm_90'", - "echo 'code version = [8,1]'", - "echo 'host = linux'", - 
"echo 'compile_size = 64bit'", - "echo 'compressed'", - "echo ''\n" - ]) - # Section for cuobjdump printing output for sm_90a PTX code cuobjdump_txt_sm90a_ptx = '\n'.join([ "echo 'Fatbin ptx code:'", @@ -3218,14 +3195,6 @@ def test_toy_cuda_sanity_check(self): device_missing_80_code_regex_pattern = r"Missing compute capabilities: 8.0." device_missing_80_code_regex = re.compile(device_missing_80_code_regex_pattern, re.M) - device_missing_80_code_ignored_regex_pattern = r"Missing compute capabilities: 8.0. This failure will be " - device_missing_80_code_ignored_regex_pattern += "ignored as '.*/bin/toy' is listed in " - device_missing_80_code_ignored_regex_pattern += "'ignore_cuda_sanity_failures'." - device_missing_80_code_ignored_regex = re.compile(device_missing_80_code_ignored_regex_pattern, re.M) - - device_missing_90_code_regex_pattern = r"Missing compute capabilities: 9.0." - device_missing_90_code_regex = re.compile(device_missing_90_code_regex_pattern, re.M) - device_additional_70_code_regex_pattern = r"Additional compute capabilities: 7.0." device_additional_70_code_regex = re.compile(device_additional_70_code_regex_pattern, re.M) device_additional_70_90_code_regex_pattern = r"Additional compute capabilities: 7.0, 9.0." 
@@ -3236,16 +3205,6 @@ def test_toy_cuda_sanity_check(self): ptx_code_regex_success_pattern += "cuda_compute_capabilities" ptx_code_regex_success = re.compile(ptx_code_regex_success_pattern, re.M) - ptx_code_missing_80_regex_pattern = r"Configured highest compute capability was '8.0', but no PTX code " - ptx_code_missing_80_regex_pattern += "for this compute capability was found in '.*/bin/toy' " - ptx_code_missing_80_regex_pattern += r"\(PTX architectures supported in that file: \[\]\)" - ptx_code_missing_80_regex = re.compile(ptx_code_missing_80_regex_pattern, re.M) - - ptx_code_missing_90_regex_pattern = r"Configured highest compute capability was '9.0', but no PTX code " - ptx_code_missing_90_regex_pattern += "for this compute capability was found in '.*/bin/toy' " - ptx_code_missing_90_regex_pattern += r"\(PTX architectures supported in that file: \[\]\)" - ptx_code_missing_90_regex = re.compile(ptx_code_missing_90_regex_pattern, re.M) - # Create temporary subdir for cuobjdump, so that we don't have to add self.test_prefix itself to the PATH cuobjdump_dir = os.path.join(self.test_prefix, 'cuobjdump_dir') mkdir(cuobjdump_dir, parents=True) @@ -3279,7 +3238,7 @@ def test_toy_cuda_sanity_check(self): expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 1 " expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" + expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, 
outtxt) @@ -3289,8 +3248,8 @@ def test_toy_cuda_sanity_check(self): # This is expected to fail since there is missing device code for CC80 args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-fail'] # We expect this to fail, so first check error, then run again to check output - error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " - error_pattern += ".*/bin/toy. Missing compute capabilities: 8.0." + error_pattern = r"Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " + error_pattern += r".*/bin/toy. Missing compute capabilities: 8.0." with self.mocked_stdout_stderr(): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3300,7 +3259,7 @@ def test_toy_cuda_sanity_check(self): msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 1 " - expected_summary += "\(ignored: 0, fails: 1\)$\n" + expected_summary += r"\(ignored: 0, fails: 1\)$\n" expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" @@ -3321,8 +3280,8 @@ def test_toy_cuda_sanity_check(self): self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - expected_summary = "Number of files missing one or more CUDA Compute Capabilities, but having suitable PTX " - expected_summary += "code that can be 
JIT compiled for the requested CUDA Compute Capabilities: 1" + expected_summary = r"Number of files missing one or more CUDA Compute Capabilities, but having suitable PTX " + expected_summary += r"code that can be JIT compiled for the requested CUDA Compute Capabilities: 1" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) @@ -3340,9 +3299,9 @@ def test_toy_cuda_sanity_check(self): # This is expected to fail: device code is present, but PTX code for the highest CC (9.0) is missing args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail'] # We expect this to fail, so first check error, then run again to check output - error_pattern = "Sanity check failed: Configured highest compute capability was '9\.0', " - error_pattern += "but no PTX code for this compute capability was found in '.*/bin/toy' " - error_pattern += "\(PTX architectures supported in that file: \['8\.0'\]\)" + error_pattern = r"Sanity check failed: Configured highest compute capability was '9\.0', " + error_pattern += r"but no PTX code for this compute capability was found in '.*/bin/toy' " + error_pattern += r"\(PTX architectures supported in that file: \['8\.0'\]\)" with self.mocked_stdout_stderr(): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3353,7 +3312,7 @@ def test_toy_cuda_sanity_check(self): expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " - expected_summary += "\(ignored: 0, fails: 1\)" + expected_summary += r"\(ignored: 0, fails: 1\)" expected_summary_regex = 
re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) @@ -3363,9 +3322,9 @@ def test_toy_cuda_sanity_check(self): args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-accept-missing-ptx'] # We expect this to pass, so no need to check errors - warning_pattern = "Configured highest compute capability was '9\.0', " - warning_pattern += "but no PTX code for this compute capability was found in '.*/bin/toy' " - warning_pattern += "\(PTX architectures supported in that file: \['8\.0'\]\)" + warning_pattern = r"Configured highest compute capability was '9\.0', " + warning_pattern += r"but no PTX code for this compute capability was found in '.*/bin/toy' " + warning_pattern += r"\(PTX architectures supported in that file: \['8\.0'\]\)" warning_pattern_regex = re.compile(warning_pattern, re.M) with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3377,7 +3336,7 @@ def test_toy_cuda_sanity_check(self): expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " - expected_summary += "\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" + expected_summary += r"\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) @@ -3388,8 +3347,8 @@ def test_toy_cuda_sanity_check(self): args = ['--cuda-compute-capabilities=9.0', 
'--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to fail, so first check error, then run again to check output - error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " - error_pattern += ".*/bin/toy\. Additional compute capabilities: 7\.0" + error_pattern = r"Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " + error_pattern += r".*/bin/toy. Additional compute capabilities: 7\.0" with self.mocked_stdout_stderr(): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3398,13 +3357,13 @@ def test_toy_cuda_sanity_check(self): self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += "\(ignored: 0, fails: 1\)$\n" + expected_summary += r"\(ignored: 0, fails: 1\)$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " - expected_summary += "\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" + expected_summary += r"\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) - + # Test case 7: same as Test case 6, but add the failing file to the cuda_sanity_ignore_files # This is expected to succeed: the individual file which _would_ cause the sanity check to fail is # now on the ignore list @@ -3417,17 +3376,17 @@ def test_toy_cuda_sanity_check(self): args = ['--cuda-compute-capabilities=9.0', 
'--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to succeed, so check output for expected patterns - error_pattern = "Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " - error_pattern += ".*/bin/toy\. Additional compute capabilities: 7\.0" + error_pattern = r"Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " + error_pattern += r".*/bin/toy\. Additional compute capabilities: 7\.0" with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_whitelist_ec, extra_args=args, raise_error=True, verify=False) msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += "\(ignored: 1, fails: 0\)$\n" + expected_summary += r"\(ignored: 1, fails: 0\)$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " - expected_summary += "\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" + expected_summary += r"\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) @@ -3448,11 +3407,13 @@ def test_toy_cuda_sanity_check(self): # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, 
outtxt) + self.assertTrue(device_code_regex_success.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) self.assertTrue(ptx_code_regex_success.search(outtxt), msg) expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0$" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) From 5f533ea5fe847baa5a4e7af02a5a052df727fa7d Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Apr 2025 16:01:32 +0200 Subject: [PATCH 078/114] Fix linting issues --- easybuild/framework/easyblock.py | 2 +- test/framework/toy_build.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 29450afa15..96e21b5f3a 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3523,7 +3523,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in - # the easyconfig, or we are running with + # the easyconfig, or we are running with # --disable-cuda-sanity-check-error-on-fail files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index e7424b3e0d..650506f1ed 100644 --- 
a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3261,7 +3261,7 @@ def test_toy_cuda_sanity_check(self): expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 1 " expected_summary += r"\(ignored: 0, fails: 1\)$\n" expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" + expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) @@ -3281,7 +3281,7 @@ def test_toy_cuda_sanity_check(self): msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) expected_summary = r"Number of files missing one or more CUDA Compute Capabilities, but having suitable PTX " - expected_summary += r"code that can be JIT compiled for the requested CUDA Compute Capabilities: 1" + expected_summary += r"code that can be JIT compiled for the requested CUDA Compute Capabilities: 1" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) From 2ee867bc6d284b681cc5a7fcbb8f2db55ef2ced0 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Apr 2025 16:13:08 +0200 Subject: [PATCH 079/114] Remove f-string, as there are no placeholders in this string --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 96e21b5f3a..c992be3201 
100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3612,7 +3612,7 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): elif ignore_failures: summary_msg += "Number of files missing one or more CUDA Compute Capabilities: " summary_msg += f"{len(files_missing_devcode)} " - summary_msg += f"(not running with --cuda-sanity-check-fail-on-error, so not considered failures)\n" + summary_msg += "(not running with --cuda-sanity-check-fail-on-error, so not considered failures)\n" else: summary_msg += "Number of files missing one or more CUDA Compute Capabilities: " summary_msg += f"{len(files_missing_devcode)} (ignored: {len(files_missing_devcode_ignored)}, " From 9c32b67b6715f983dbabd3d8df7d71688ffe7b07 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Wed, 16 Apr 2025 16:32:55 +0200 Subject: [PATCH 080/114] Fix unit test expected result --- test/framework/systemtools.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/framework/systemtools.py b/test/framework/systemtools.py index 37878666d5..255ca0e42c 100644 --- a/test/framework/systemtools.py +++ b/test/framework/systemtools.py @@ -1385,7 +1385,7 @@ def test_get_cuda_architectures(self): adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable # Test case 1: get raw output from mock_cuda_bin, a 'fat' binary - mock_cuda_bin_device_codes = ['6.0', '6.1', '7.0', '7.5', '8.0', '8.6', '8.9', '9.0', '9.0a'] + mock_cuda_bin_device_codes = ['5.0', '6.0', '6.1', '7.0', '7.5', '8.0', '8.6', '8.9', '9.0', '9.0a'] mock_cuda_bin_ptx = ['9.0', '9.0a'] self.assertEqual(get_cuda_architectures('mock_cuda_bin', 'elf'), mock_cuda_bin_device_codes) self.assertEqual(get_cuda_architectures('mock_cuda_bin', 'ptx'), mock_cuda_bin_ptx) From 4fb884b707b786c0d660cd5d0c34e880ef7bc190 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 12 May 2025 16:38:28 +0200 Subject: [PATCH 081/114] Add a test that triggers 
the if missing_ptx_ccs: if path in ignore_file_list or ignore_failrues: logic --- test/framework/toy_build.py | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 650506f1ed..6c665b8a58 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3218,8 +3218,36 @@ def test_toy_cuda_sanity_check(self): # Filepath to cuobjdump cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') - # Test case 1: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains - # 7.0 and 9.0 device code and 8.0 PTX code + # Test case 1a: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains + # 7.0 device code + # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) + # as to not break backwards compatibility + write_file(cuobjdump_file, cuobjdump_txt_shebang), + write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm70, append=True) + adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable + args = ['--cuda-compute-capabilities=8.0'] + # We expect this to pass, so no need to check errors + with self.mocked_stdout_stderr(): + outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) + self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) + msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) + self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 1 " + expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" + expected_summary += 
r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" + expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1" + expected_summary_regex = re.compile(expected_summary, re.M) + msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) + self.assertTrue(expected_summary_regex.search(outtxt), msg) + + + # Test case 1b: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains + # 7.0 and 9.0 device code and 8.0 PTX code. + # Note that the difference with 1a is the presense of PTX code and addditional device code + # It should not matter for the result, but triggers slightly different code paths in easyblock.py # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) # as to not break backwards compatibility write_file(cuobjdump_file, cuobjdump_txt_shebang), From a02d19801da48a1c79f1b6a5e47e9cb47eaecb9a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 12 May 2025 16:39:10 +0200 Subject: [PATCH 082/114] Remove blank line --- test/framework/toy_build.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 6c665b8a58..63bbbb8d8a 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3243,7 +3243,6 @@ def test_toy_cuda_sanity_check(self): msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) - # Test case 1b: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains # 7.0 and 9.0 device code and 8.0 PTX code. 
# Note that the difference with 1a is the presense of PTX code and addditional device code From 45f659c1306ef6c795a9de6542276cd41dd266eb Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 12 May 2025 17:08:30 +0200 Subject: [PATCH 083/114] Now make sure test 1.a. actually fails on the test case @ocaisa found, which is a binary with correct dev code, missing ptx code, an all default args --- test/framework/toy_build.py | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 63bbbb8d8a..3a7d1bd60c 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3138,6 +3138,17 @@ def test_toy_cuda_sanity_check(self): "echo ''\n" ]) + # Section for cuobjdump printing output for sm_70 architecture + cuobjdump_txt_sm80 = '\n'.join([ + "echo 'Fatbin elf code:'", + "echo '================'", + "echo 'arch = sm_80'", + "echo 'code version = [1,7]'", + "echo 'host = linux'", + "echo 'compile_size = 64bit'", + "echo ''\n" + ]) + # Section for cuobjdump printing output for sm_90 architecture cuobjdump_txt_sm90 = '\n'.join([ "echo 'Fatbin elf code:'", @@ -3219,33 +3230,29 @@ def test_toy_cuda_sanity_check(self): cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') # Test case 1a: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains - # 7.0 device code + # 8.0 device code # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) # as to not break backwards compatibility write_file(cuobjdump_file, cuobjdump_txt_shebang), - write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) - write_file(cuobjdump_file, cuobjdump_txt_sm70, append=True) + write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable args = ['--cuda-compute-capabilities=8.0'] # We expect this to pass, so no need 
to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) - msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) - self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) - self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " + expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0 " expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) + print(outtxt) # Test case 1b: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains # 7.0 and 9.0 device code and 8.0 PTX code. 
- # Note that the difference with 1a is the presense of PTX code and addditional device code + # Note that the difference with 1a is the presense of additional device code, PTX code foor the right + # architecture, but missing device code for the requested architecture # It should not matter for the result, but triggers slightly different code paths in easyblock.py # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) # as to not break backwards compatibility From f9f3050bd81bc52579f56345aee8095f35a9a47c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 12 May 2025 17:12:44 +0200 Subject: [PATCH 084/114] Fix the test so that it now passes once the issue is fixed... It was checking the wrong regex --- test/framework/toy_build.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 3a7d1bd60c..de9e346204 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3241,13 +3241,11 @@ def test_toy_cuda_sanity_check(self): with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0 " - expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" + expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0$\n" expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1" expected_summary_regex = re.compile(expected_summary, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) self.assertTrue(expected_summary_regex.search(outtxt), msg) - print(outtxt) # Test case 1b: test with default options, 
--cuda-compute-capabilities=8.0 and a binary that contains # 7.0 and 9.0 device code and 8.0 PTX code. From 1a5cd5dfbf4ac02aec2101943acb80f303bddaf9 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 12 May 2025 17:13:24 +0200 Subject: [PATCH 085/114] Fix the issue with the missing ignore_msg --- easybuild/framework/easyblock.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index c992be3201..f484b11231 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3441,24 +3441,24 @@ def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): additional_devcodes = list(set(found_dev_code_ccs) - set(cfg_ccs)) missing_devcodes = list(set(cfg_ccs) - set(found_dev_code_ccs)) + # There are two reasons for ignoring failures: + # - We are running with --disable-cuda-sanity-check-error-on-fail + # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig + # In case we run with both, we'll just report that we're running with + # --disable-cuda-sanity-check-error-on-fail + if ignore_failures: + ignore_msg = f"Failure for {path} will be ignored since we are not running with " + ignore_msg += "--cuda-sanity-check-error-on-fail" + else: + ignore_msg = f"This failure will be ignored as '{path}' is listed in " + ignore_msg += "'cuda_sanity_ignore_files'." 
+ if not missing_devcodes and not additional_devcodes: # Device code for all architectures requested in --cuda-compute-capabilities was found msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " "those in cuda_compute_capabilities") self.log.debug(msg) else: - # There are two reasons for ignoring failures: - # - We are running with --disable-cuda-sanity-check-error-on-fail - # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig - # In case we run with both, we'll just report that we're running with - # --disable-cuda-sanity-check-error-on-fail - if ignore_failures: - ignore_msg = f"Failure for {path} will be ignored since we are running with " - ignore_msg += "--disable-cuda-sanity-check-error-on-fail" - else: - ignore_msg = f"This failure will be ignored as '{path}' is listed in " - ignore_msg += "'cuda_sanity_ignore_files'." - # Set default failure status and empty message is_failure = False From 19cfe04d74ce3a7677aaf6ffad18b322e18038f0 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Mon, 12 May 2025 17:44:06 +0200 Subject: [PATCH 086/114] Apply suggestions from code review Co-authored-by: ocaisa --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index f484b11231..d6eaf13b0a 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -113,8 +113,8 @@ from easybuild.tools.package.utilities import package from easybuild.tools.repository.repository import init_repository from easybuild.tools.systemtools import check_linked_shared_libs, det_parallelism -from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, pick_system_specific_value, use_group from easybuild.tools.systemtools import get_cuda_architectures +from easybuild.tools.systemtools import get_linked_libs_raw, get_shared_lib_ext, 
pick_system_specific_value, use_group from easybuild.tools.utilities import INDENT_4SPACES, get_class_for, nub, quote_str from easybuild.tools.utilities import remove_unwanted_chars, time2str, trace_msg from easybuild.tools.version import this_is_easybuild, VERBOSE_VERSION, VERSION From 7e3a2dd298332aa28ed76ff946ef74dd60b73536 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 12 May 2025 17:45:13 +0200 Subject: [PATCH 087/114] Remove check_cuobjdump, as it is not needed anymore --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index d6eaf13b0a..ab3b39f32c 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3353,7 +3353,7 @@ def _sanity_check_step_multi_deps(self, *args, **kwargs): self.cfg['builddependencies'] = builddeps self.cfg.iterating = False - def sanity_check_cuda(self, cuda_dirs=None, check_cuobjdump=True): + def sanity_check_cuda(self, cuda_dirs=None): """Sanity check that binaries/libraries contain device code for the correct architecture targets.""" self.log.info("Checking binaries/libraries for CUDA device code...") From 2442d4491b1a9c224bdbbee63bc5a0d2ac605598 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 12 May 2025 17:49:43 +0200 Subject: [PATCH 088/114] Format file lists on separate lines for better readability of the logs --- easybuild/framework/easyblock.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index ab3b39f32c..daf561487e 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3400,6 +3400,10 @@ def sanity_check_cuda(self, cuda_dirs=None): files_missing_ptx_ignored = [] files_missing_devcode_but_has_ptx = [] + # A local function to create nicely formatted file lists for the files_* lists + def 
format_file_list(files_list): + return "\n" + "\n".join(f" {f}" for f in files_list) + # Looping through all files to check CUDA device and PTX code for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: if os.path.exists(dirpath): @@ -3586,22 +3590,22 @@ def sanity_check_cuda(self, cuda_dirs=None): self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") # Long report, which prints the files that have potential issues - summary_msg_files = f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities: " - summary_msg_files += f"{files_missing_devcode}\n" - summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files: " - summary_msg_files += f"{files_missing_devcode_ignored})\n" + summary_msg_files = f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities:" + summary_msg_files += f"{format_file_list(files_missing_devcode)}\n" + summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files:" + summary_msg_files += f"{format_file_list(files_missing_devcode_ignored)})\n" if accept_ptx_as_devcode: summary_msg_files += f"{len(files_missing_devcode_but_has_ptx)} files missing one or more CUDA Compute " summary_msg_files += "Capabilities, but has suitable PTX code that can be JIT compiled for the requested " - summary_msg_files += f"CUDA Compute Capabilities: {files_missing_devcode_but_has_ptx}\n" + summary_msg_files += f"CUDA Compute Capabilities:{format_file_list(files_missing_devcode_but_has_ptx)}\n" summary_msg_files += f"{len(files_additional_devcode)} files with device code for more CUDA Compute " - summary_msg_files += f"Capabilities than requested: {files_additional_devcode}\n" - summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files: " - summary_msg_files += f"{files_additional_devcode_ignored})\n" + summary_msg_files += f"Capabilities than 
requested:{format_file_list(files_additional_devcode)}\n" + summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files:" + summary_msg_files += f"{format_file_list(files_additional_devcode_ignored)})\n" summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute" - summary_msg_files += f" Capability: {files_missing_ptx}\n" - summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files: " - summary_msg_files += f"{files_missing_ptx_ignored})" + summary_msg_files += f" Capability:{format_file_list(files_missing_ptx)}\n" + summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files:" + summary_msg_files += f"{format_file_list(files_missing_ptx_ignored)})" self.log.info(summary_msg_files) # Short summary From 97cef2b80dea869ae107c5968e395c018ac7631a Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Mon, 12 May 2025 18:31:25 +0200 Subject: [PATCH 089/114] Still need to do some formatting, but things now go to trace output for more visibility --- easybuild/framework/easyblock.py | 128 ++++++++++++++++++------------- 1 file changed, 73 insertions(+), 55 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index daf561487e..f86c052d58 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3609,77 +3609,95 @@ def format_file_list(files_list): self.log.info(summary_msg_files) # Short summary - summary_msg = "CUDA sanity check summary report:\n" - summary_msg += f"Number of CUDA files checked: {num_cuda_files}\n" + trace_msg("CUDA sanity check summary report:") + trace_msg(f"Number of CUDA files checked: {num_cuda_files}") if len(files_missing_devcode) == 0: - summary_msg += "Number of files missing one or more CUDA Compute Capabilities: 0\n" + trace_msg("Number of files missing one or more CUDA Compute Capabilities: 0") elif ignore_failures: 
- summary_msg += "Number of files missing one or more CUDA Compute Capabilities: " - summary_msg += f"{len(files_missing_devcode)} " - summary_msg += "(not running with --cuda-sanity-check-fail-on-error, so not considered failures)\n" + msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" + msg += "\n(not running with --cuda-sanity-check-fail-on-error, so not considered failures)" + trace_msg(msg) else: - summary_msg += "Number of files missing one or more CUDA Compute Capabilities: " - summary_msg += f"{len(files_missing_devcode)} (ignored: {len(files_missing_devcode_ignored)}, " - summary_msg += f"fails: {len(files_missing_devcode_fails)})\n" + msg = "Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" + msg += f" (ignored: {len(files_missing_devcode_ignored)}, " + msg += f"fails: {len(files_missing_devcode_fails)})" + trace_msg(msg) if accept_ptx_as_devcode: - summary_msg += "Number of files missing one or more CUDA Compute Capabilities, but having suitable " - summary_msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " - summary_msg += f"{len(files_missing_devcode_but_has_ptx)}\n" + msg = "Number of files missing one or more CUDA Compute Capabilities, but having suitable " + msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " + msg += f"{len(files_missing_devcode_but_has_ptx)}" + trace_msg(msg) if len(files_additional_devcode) == 0: - summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: 0\n" + trace_msg("Number of files with device code for more CUDA Compute Capabilities than requested: 0") elif ignore_failures: - summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " - summary_msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-fail-on-error, " - summary_msg += "so not considered 
failures)\n" + msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " + msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-fail-on-error, " + msg += "so not considered failures)" + trace_msg(msg) elif strict_cc_check: - summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " - summary_msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_devcode_ignored)}, " - summary_msg += f"fails: {len(files_additional_devcode_fails)})\n" + msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " + msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_devcode_ignored)}, " + msg += f"fails: {len(files_additional_devcode_fails)})" + trace_msg(msg) else: - summary_msg += "Number of files with device code for more CUDA Compute Capabilities than requested: " - summary_msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-strict, so not " - summary_msg += "considered failures)\n" + msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " + msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-strict, so not " + msg += "considered failures)" + trace_msg(msg) if len(files_missing_ptx) == 0: - summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: 0\n" + trace_msg("Number of files missing PTX code for the highest configured CUDA Compute Capability: 0") elif ignore_failures: - summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " - summary_msg += f"{len(files_missing_ptx)} (not running with --cuda-sanity-check-fail-on-error, so not " - summary_msg += "considered failures)\n" + msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " + msg += f"{len(files_missing_ptx)} (not running with 
--cuda-sanity-check-fail-on-error, so not " + msg += "considered failures)" + trace_msg(msg) elif accept_missing_ptx: - summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " - summary_msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx, so not " - summary_msg += "considered failures)\n" + msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " + msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx, so not " + msg += "considered failures)" + trace_msg(msg) else: - summary_msg += "Number of files missing PTX code for the highest configured CUDA Compute Capability: " - summary_msg += f"{len(files_missing_ptx)} (ignored: {len(files_missing_ptx_ignored)}, fails: " - summary_msg += f"{len(files_missing_ptx_fails)})\n" - if not build_option('debug'): - summary_msg += "Rerun with --debug to see a detailed list of files.\n" + msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " + msg += f"{len(files_missing_ptx)} (ignored: {len(files_missing_ptx_ignored)}, fails: " + msg += f"{len(files_missing_ptx_fails)})" + trace_msg(msg) + if build_option('debug') and (len(files_missing_devcode) > 0 or len(files_additional_devcode) > 0 + or len(files_missing_ptx) > 0): + trace_msg("See build log for detail lists of not passing the CUDA Sanity Check") + else: + msg = "To get a detailed list of files not passing the CUDA Sanity Check in the build log, " + msg += "rerun with --debug." 
+ trace_msg(msg) # Give some advice if len(files_missing_devcode) > 0 and not accept_ptx_as_devcode: - summary_msg += "\nYou may consider rerunning with --cuda-sanity-check-accept-ptx-as-devcode to accept " - summary_msg += "binaries that don't contain the device code for your requested CUDA Compute Capabilities, " - summary_msg += "but that do have PTX code that can be compiled for your requested CUDA Compute " - summary_msg += "Capabilities. Note that this may increase startup delay due to JIT compilation " - summary_msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " - summary_msg += "all features specific to your hardware architecture.\n" + msg = "You may consider rerunning with --cuda-sanity-check-accept-ptx-as-devcode to accept " + msg += "binaries that don't contain the device code for your requested CUDA Compute Capabilities, " + msg += "but that do have PTX code that can be compiled for your requested CUDA Compute " + msg += "Capabilities. Note that this may increase startup delay due to JIT compilation " + msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " + msg += "all features specific to your hardware architecture." + trace_msg(msg) if len(files_additional_devcode) > 0 and strict_cc_check: - summary_msg += "\nYou may consider running with --disable-cuda-sanity-check-strict. This means you'll " - summary_msg += "accept that some binaries may have CUDA Device Code for more architectures than you " - summary_msg += "requested, i.e. the binary is 'fatter' than you need. Bigger binaries may generally " - summary_msg += "cause some startup delay, and code path selection could introduce a small overhead, " - summary_msg += "though this is generally negligible.\n" + msg = "You may consider running with --disable-cuda-sanity-check-strict. 
This means you'll " + msg += "accept that some binaries may have CUDA Device Code for more architectures than you " + msg += "requested, i.e. the binary is 'fatter' than you need. Bigger binaries may generally " + msg += "cause some startup delay, and code path selection could introduce a small overhead, " + msg += "though this is generally negligible." + trace_msg(msg) if len(files_missing_ptx) > 0 and not accept_missing_ptx: - summary_msg += "\nYou may consider running with --cuda-sanity-check-accept-missing-ptx to accept binaries " - summary_msg += "that don't contain PTX code for the highest CUDA Compute Capability you requested. This " - summary_msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. your compiled " - summary_msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " - summary_msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you.\n" - # Give this some extra visibility if we're NOT erroring out on failures - if ignore_failures: - self.log.warning(summary_msg) - else: - self.log.info(summary_msg) + msg = "You may consider running with --cuda-sanity-check-accept-missing-ptx to accept binaries " + msg += "that don't contain PTX code for the highest CUDA Compute Capability you requested. This " + msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. your compiled " + msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " + msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you." + trace_msg(msg) +# Now that we write everything to the trace output... should we still _also_ log everything to the logfile? +# Otherwise, we have no record of it in the installation directory... 
+# # Give this some extra visibility if we're NOT erroring out on failures +# if ignore_failures: +# self.log.warning(summary_msg) +# else: +# self.log.info(summary_msg) return fail_msgs From c358e27e89386d2d71d792cac6afe9ede383f17b Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 13 May 2025 12:23:38 +0200 Subject: [PATCH 090/114] Print both to trace output (with short version of advice), and to log, so that it's also preserved in the logfile --- easybuild/framework/easyblock.py | 124 ++++++++++++++++--------------- 1 file changed, 66 insertions(+), 58 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index f86c052d58..422d036cec 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3589,115 +3589,123 @@ def format_file_list(files_list): else: self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") - # Long report, which prints the files that have potential issues - summary_msg_files = f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities:" - summary_msg_files += f"{format_file_list(files_missing_devcode)}\n" - summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files:" - summary_msg_files += f"{format_file_list(files_missing_devcode_ignored)})\n" - if accept_ptx_as_devcode: - summary_msg_files += f"{len(files_missing_devcode_but_has_ptx)} files missing one or more CUDA Compute " - summary_msg_files += "Capabilities, but has suitable PTX code that can be JIT compiled for the requested " - summary_msg_files += f"CUDA Compute Capabilities:{format_file_list(files_missing_devcode_but_has_ptx)}\n" - summary_msg_files += f"{len(files_additional_devcode)} files with device code for more CUDA Compute " - summary_msg_files += f"Capabilities than requested:{format_file_list(files_additional_devcode)}\n" - summary_msg_files += f"These failures are ignored for 
{len(files_additional_devcode_ignored)} files:" - summary_msg_files += f"{format_file_list(files_additional_devcode_ignored)})\n" - summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute" - summary_msg_files += f" Capability:{format_file_list(files_missing_ptx)}\n" - summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files:" - summary_msg_files += f"{format_file_list(files_missing_ptx_ignored)})" - self.log.info(summary_msg_files) + # Send to trace and log + def trace_and_log(msg): + self.log.info(msg) + trace_msg(msg) + # Short summary - trace_msg("CUDA sanity check summary report:") - trace_msg(f"Number of CUDA files checked: {num_cuda_files}") + trace_and_log("CUDA sanity check summary report:") + trace_and_log(f"Number of CUDA files checked: {num_cuda_files}") if len(files_missing_devcode) == 0: - trace_msg("Number of files missing one or more CUDA Compute Capabilities: 0") + trace_and_log("Number of files missing one or more CUDA Compute Capabilities: 0") elif ignore_failures: msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" - msg += "\n(not running with --cuda-sanity-check-fail-on-error, so not considered failures)" - trace_msg(msg) + trace_and_log(msg) + trace_and_log("(not running with --cuda-sanity-check-fail-on-error, so not considered failures)") else: msg = "Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" msg += f" (ignored: {len(files_missing_devcode_ignored)}, " msg += f"fails: {len(files_missing_devcode_fails)})" - trace_msg(msg) + trace_and_log(msg) if accept_ptx_as_devcode: msg = "Number of files missing one or more CUDA Compute Capabilities, but having suitable " msg += "PTX code that can be JIT compiled for the requested CUDA Compute Capabilities: " msg += f"{len(files_missing_devcode_but_has_ptx)}" - trace_msg(msg) + trace_and_log(msg) if 
len(files_additional_devcode) == 0: - trace_msg("Number of files with device code for more CUDA Compute Capabilities than requested: 0") + trace_and_log("Number of files with device code for more CUDA Compute Capabilities than requested: 0") elif ignore_failures: msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " - msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-fail-on-error, " - msg += "so not considered failures)" - trace_msg(msg) + msg += f"{len(files_additional_devcode)}" + trace_and_log(msg) + trace_and_log("(not running with --cuda-sanity-check-fail-on-error, so not considered failures)") elif strict_cc_check: msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_devcode_ignored)}, " msg += f"fails: {len(files_additional_devcode_fails)})" - trace_msg(msg) + trace_and_log(msg) else: msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " - msg += f"{len(files_additional_devcode)} (not running with --cuda-sanity-check-strict, so not " - msg += "considered failures)" - trace_msg(msg) + msg += f"{len(files_additional_devcode)}" + trace_and_log(msg) + trace_and_log("(not running with --cuda-sanity-check-strict, so not considered failures)") if len(files_missing_ptx) == 0: - trace_msg("Number of files missing PTX code for the highest configured CUDA Compute Capability: 0") + trace_and_log("Number of files missing PTX code for the highest configured CUDA Compute Capability: 0") elif ignore_failures: msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " - msg += f"{len(files_missing_ptx)} (not running with --cuda-sanity-check-fail-on-error, so not " - msg += "considered failures)" - trace_msg(msg) + msg += f"{len(files_missing_ptx)}" + trace_and_log(msg) + trace_and_log("(not running with 
--cuda-sanity-check-fail-on-error, so not considered failures)") elif accept_missing_ptx: msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " - msg += f"{len(files_missing_ptx)} (running with --cuda-sanity-check-accept-missing-ptx, so not " - msg += "considered failures)" - trace_msg(msg) + msg += f"{len(files_missing_ptx)}" + trace_and_log(msg) + trace_and_log("(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures)") else: msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " msg += f"{len(files_missing_ptx)} (ignored: {len(files_missing_ptx_ignored)}, fails: " msg += f"{len(files_missing_ptx_fails)})" - trace_msg(msg) - if build_option('debug') and (len(files_missing_devcode) > 0 or len(files_additional_devcode) > 0 - or len(files_missing_ptx) > 0): - trace_msg("See build log for detail lists of not passing the CUDA Sanity Check") - else: - msg = "To get a detailed list of files not passing the CUDA Sanity Check in the build log, " - msg += "rerun with --debug." - trace_msg(msg) + trace_and_log(msg) # Give some advice if len(files_missing_devcode) > 0 and not accept_ptx_as_devcode: + short_msg = "You may consider rerunning with --cuda-sanity-check-accept-ptx-as-devcode to accept " + short_msg += "suitable PTX code instead of device code." + trace_msg(short_msg) msg = "You may consider rerunning with --cuda-sanity-check-accept-ptx-as-devcode to accept " msg += "binaries that don't contain the device code for your requested CUDA Compute Capabilities, " msg += "but that do have PTX code that can be compiled for your requested CUDA Compute " msg += "Capabilities. Note that this may increase startup delay due to JIT compilation " msg += "and may also lead to suboptimal runtime performance, as the PTX code may not exploit " msg += "all features specific to your hardware architecture." 
- trace_msg(msg) + self.log.info(msg) if len(files_additional_devcode) > 0 and strict_cc_check: + short_msg = "You may consider running with --disable-cuda-sanity-check-strict to accept binaries " + short_msg += "containing device code for more architectures than requested." + trace_msg(short_msg) msg = "You may consider running with --disable-cuda-sanity-check-strict. This means you'll " msg += "accept that some binaries may have CUDA Device Code for more architectures than you " msg += "requested, i.e. the binary is 'fatter' than you need. Bigger binaries may generally " msg += "cause some startup delay, and code path selection could introduce a small overhead, " msg += "though this is generally negligible." - trace_msg(msg) + self.log.info(msg) if len(files_missing_ptx) > 0 and not accept_missing_ptx: + short_msg = "You may consider running with --cuda-sanity-check-accept-missing-ptx to accept binaries " + short_msg += "missing PTX code for the highest configured CUDA Compute Capability." + trace_msg(short_msg) msg = "You may consider running with --cuda-sanity-check-accept-missing-ptx to accept binaries " msg += "that don't contain PTX code for the highest CUDA Compute Capability you requested. This " msg += "breaks forwards compatibility for newer CUDA Compute Capabilities (i.e. your compiled " msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you." - trace_msg(msg) -# Now that we write everything to the trace output... should we still _also_ log everything to the logfile? -# Otherwise, we have no record of it in the installation directory... 
-# # Give this some extra visibility if we're NOT erroring out on failures -# if ignore_failures: -# self.log.warning(summary_msg) -# else: -# self.log.info(summary_msg) + self.log.info(msg) + if build_option('debug') and (len(files_missing_devcode) > 0 or len(files_additional_devcode) > 0 + or len(files_missing_ptx) > 0): + trace_and_log("See build log for detailed lists of files not passing the CUDA Sanity Check") + else: + msg = "To get a detailed list of files not passing the CUDA Sanity Check in the build log, " + msg += "rerun with --debug." + trace_and_log(msg) + + # Long report, which prints the files that have potential issues + summary_msg_files = f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities:" + summary_msg_files += f"{format_file_list(files_missing_devcode)}\n" + summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files:" + summary_msg_files += f"{format_file_list(files_missing_devcode_ignored)})\n" + if accept_ptx_as_devcode: + summary_msg_files += f"{len(files_missing_devcode_but_has_ptx)} files missing one or more CUDA Compute " + summary_msg_files += "Capabilities, but has suitable PTX code that can be JIT compiled for the requested " + summary_msg_files += f"CUDA Compute Capabilities:{format_file_list(files_missing_devcode_but_has_ptx)}\n" + summary_msg_files += f"{len(files_additional_devcode)} files with device code for more CUDA Compute " + summary_msg_files += f"Capabilities than requested:{format_file_list(files_additional_devcode)}\n" + summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files:" + summary_msg_files += f"{format_file_list(files_additional_devcode_ignored)})\n" + summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute" + summary_msg_files += f" Capability:{format_file_list(files_missing_ptx)}\n" + summary_msg_files += f"These failures are ignored for 
{len(files_missing_ptx_ignored)} files:" + summary_msg_files += f"{format_file_list(files_missing_ptx_ignored)})" + self.log.info(summary_msg_files) return fail_msgs From 588c34228f516f243d5d81f58eaa96b8aac4da12 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 13 May 2025 16:26:01 +0200 Subject: [PATCH 091/114] Fix hound issues - and hopefully CI checks --- easybuild/framework/easyblock.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 422d036cec..e4b0dc62d9 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3594,7 +3594,6 @@ def trace_and_log(msg): self.log.info(msg) trace_msg(msg) - # Short summary trace_and_log("CUDA sanity check summary report:") trace_and_log(f"Number of CUDA files checked: {num_cuda_files}") @@ -3680,10 +3679,14 @@ def trace_and_log(msg): msg += "binaries will not run on cards with higher CUDA Compute Capabilities than what " msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you." self.log.info(msg) - if build_option('debug') and (len(files_missing_devcode) > 0 or len(files_additional_devcode) > 0 - or len(files_missing_ptx) > 0): + if ( + build_option('debug') and + (len(files_missing_devcode) > 0 or len(files_additional_devcode) > or len(files_missing_ptx) > 0) + ): trace_and_log("See build log for detailed lists of files not passing the CUDA Sanity Check") - else: + elif ( + len(files_missing_devcode) > 0 or len(files_additional_devcode) > or len(files_missing_ptx) > 0 + ): msg = "To get a detailed list of files not passing the CUDA Sanity Check in the build log, " msg += "rerun with --debug." 
trace_and_log(msg) From 09a182f459b5498dc603218443861b6e2a5a5ba0 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 13 May 2025 16:27:18 +0200 Subject: [PATCH 092/114] Fix missing condition --- easybuild/framework/easyblock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index e4b0dc62d9..230b67391a 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3681,11 +3681,11 @@ def trace_and_log(msg): self.log.info(msg) if ( build_option('debug') and - (len(files_missing_devcode) > 0 or len(files_additional_devcode) > or len(files_missing_ptx) > 0) + (len(files_missing_devcode) > 0 or len(files_additional_devcode) > 0 or len(files_missing_ptx) > 0) ): trace_and_log("See build log for detailed lists of files not passing the CUDA Sanity Check") elif ( - len(files_missing_devcode) > 0 or len(files_additional_devcode) > or len(files_missing_ptx) > 0 + len(files_missing_devcode) > 0 or len(files_additional_devcode) > 0 or len(files_missing_ptx) > 0 ): msg = "To get a detailed list of files not passing the CUDA Sanity Check in the build log, " msg += "rerun with --debug." 
From 1787c470d3b0939b3cd8955f05be416e54af401f Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 13 May 2025 20:55:42 +0200 Subject: [PATCH 093/114] Added missing f to f-string --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 230b67391a..f512863e48 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3604,7 +3604,7 @@ def trace_and_log(msg): trace_and_log(msg) trace_and_log("(not running with --cuda-sanity-check-fail-on-error, so not considered failures)") else: - msg = "Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" + msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" msg += f" (ignored: {len(files_missing_devcode_ignored)}, " msg += f"fails: {len(files_missing_devcode_fails)})" trace_and_log(msg) From 0f85f135e64fbfeff12577f4e328c67abf2494d1 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 13 May 2025 20:56:04 +0200 Subject: [PATCH 094/114] Modify tests for the new syntax, and to also check the trace output --- test/framework/toy_build.py | 142 ++++++++++++++---------------------- 1 file changed, 54 insertions(+), 88 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index de9e346204..5038db6640 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3229,6 +3229,36 @@ def test_toy_cuda_sanity_check(self): # Filepath to cuobjdump cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') + # Predefine a function that takes a pattern, creates a regex, searches if it's found in the log + # Also, check if it's found in stdout, if defined + # If either of these fail their assert, print an informative, standardized message + def assert_regex(pattern, log, stdout = None): + regex = re.compile(pattern, re.M) + msg = "Pattern %s not found 
in full build log: %s" % (pattern, log) + self.assertTrue(regex.search(log), msg) + if stdout is not None: + msg2 = "Pattern %s not found in standard output: %s" % (pattern, stdout) + self.assertTrue(regex.search(stdout), msg2) + + def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout = None, missing_cc_but_ptx = None, + num_checked = None): + if num_checked is not None: + num_checked_str = r"Number of CUDA files checked: %s" % num_checked + assert_regex(num_checked_str, outtxt, stdout) + if missing_cc_but_ptx is not None: + missing_cc_but_ptx_str = r"Number of files missing one or more CUDA Compute Capabilities, but having " + missing_cc_but_ptx_str += r"suitable PTX code that can be JIT compiled for the requested CUDA Compute " + missing_cc_but_ptx_str += r"Capabilities: %s" % additional_cc + assert_regex(missing_cc_but_ptx_str, outtxt, stdout) + missing_cc_str = r"Number of files missing one or more CUDA Compute Capabilities: %s" % missing_cc + additional_cc_str = r"Number of files with device code for more CUDA Compute Capabilities than requested: " + additional_cc_str += r"%s" % additional_cc + missing_ptx_str = r"Number of files missing PTX code for the highest configured CUDA Compute Capability: " + missing_ptx_str += r"%s" % missing_ptx + assert_regex(missing_cc_str, outtxt, stdout) + assert_regex(additional_cc_str, outtxt, stdout) + assert_regex(missing_ptx_str, outtxt, stdout) + # Test case 1a: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains # 8.0 device code # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) @@ -3240,12 +3270,8 @@ def test_toy_cuda_sanity_check(self): # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of 
files with device code for more CUDA Compute Capabilities than requested: 0$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) + stdout = self.get_stdout() + assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=1, log=outtxt, stdout=stdout) # Test case 1b: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains # 7.0 and 9.0 device code and 8.0 PTX code. @@ -3263,18 +3289,12 @@ def test_toy_cuda_sanity_check(self): # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + stdout = self.get_stdout() msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-fail-on-error, so not considered failures\)$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - 
self.assertTrue(expected_summary_regex.search(outtxt), msg) + assert_cuda_report(missing_cc=1, additional_cc=1, missing_ptx=0, log=outtxt, stdout=stdout) # Test case 2: same as Test case 1, but add --cuda-sanity-check-error-on-fail # This is expected to fail since there is missing device code for CC80 @@ -3286,18 +3306,12 @@ def test_toy_cuda_sanity_check(self): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + stdout = self.get_stdout() msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 1 " - expected_summary += r"\(ignored: 0, fails: 1\)$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) + assert_cuda_report(missing_cc=1, additional_cc=1, missing_ptx=0, log=outtxt, stdout=stdout) # Test case 3: same as Test case 2, but add --cuda-sanity-check-accept-ptx-as-devcode # This is expected to succeed, since now the PTX code for CC80 will be accepted as @@ -3308,24 +3322,13 @@ def test_toy_cuda_sanity_check(self): # We expect this to pass, so no 
need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + stdout = self.get_stdout() msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - expected_summary = r"Number of files missing one or more CUDA Compute Capabilities, but having suitable PTX " - expected_summary += r"code that can be JIT compiled for the requested CUDA Compute Capabilities: 1" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files missing one or more CUDA Compute Capabilities, but having suitable PTX " - expected_summary += r"code that can be JIT compiled for the requested CUDA Compute Capabilities: 1$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) + assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=0, log=outtxt, stdout=stdout, + missing_cc_but_ptx=1) # Test case 4: same as Test case 2, but run with --cuda-compute-capabilities=9.0 # This is expected to 
fail: device code is present, but PTX code for the highest CC (9.0) is missing @@ -3338,16 +3341,10 @@ def test_toy_cuda_sanity_check(self): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + stdout = self.get_stdout() msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " - expected_summary += r"\(ignored: 0, fails: 1\)" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) + assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) # Test case 5: same as Test case 4, but add --cuda-sanity-check-accept-missing-ptx # This is expected to succeed: device code is present, PTX code is missing, but that's accepted @@ -3360,18 +3357,12 @@ def test_toy_cuda_sanity_check(self): warning_pattern_regex = re.compile(warning_pattern, re.M) with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + stdout = self.get_stdout() msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) msg = "Pattern %s not found in full build log: 
%s" % (warning_pattern, outtxt) self.assertTrue(warning_pattern_regex.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(not running with --cuda-sanity-check-strict, so not considered failures\)$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " - expected_summary += r"\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) + assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) # Test case 6: same as Test case 5, but add --cuda-sanity-check-strict # This is expected to fail: device code is present, PTX code is missing (but accepted due to option) @@ -3385,16 +3376,10 @@ def test_toy_cuda_sanity_check(self): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + stdout = self.get_stdout() msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(ignored: 0, fails: 1\)$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " - expected_summary += r"\(running with 
--cuda-sanity-check-accept-missing-ptx, so not considered failures\)" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) + assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) # Test case 7: same as Test case 6, but add the failing file to the cuda_sanity_ignore_files # This is expected to succeed: the individual file which _would_ cause the sanity check to fail is @@ -3412,16 +3397,10 @@ def test_toy_cuda_sanity_check(self): error_pattern += r".*/bin/toy\. Additional compute capabilities: 7\.0" with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_whitelist_ec, extra_args=args, raise_error=True, verify=False) + stdout = self.get_stdout() msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 1 " - expected_summary += r"\(ignored: 1, fails: 0\)$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 1 " - expected_summary += r"\(running with --cuda-sanity-check-accept-missing-ptx, so not considered failures\)" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) + assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) # Test case 8: try with --cuda-sanity-check-error-on-fail --cuda-compute-capabilities=9.0,9.0a # and --cuda-sanity-check-strict @@ -3439,20 +3418,16 @@ def 
test_toy_cuda_sanity_check(self): # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + stdout = self.get_stdout() msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) self.assertTrue(device_code_regex_success.search(outtxt), msg) msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) self.assertTrue(ptx_code_regex_success.search(outtxt), msg) - expected_summary = r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0$" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) expected_result_pattern = "INFO Sanity check for toy successful" expected_result = re.compile(expected_result_pattern, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) self.assertTrue(expected_result.search(outtxt), msg) + assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=0, log=outtxt, stdout=stdout) # Test case 9: same as 8, but no --cuda-compute-capabilities are defined # We expect this to lead to a skip of the CUDA sanity check, and a success for the overall sanity check @@ -3460,6 +3435,7 @@ def test_toy_cuda_sanity_check(self): # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + stdout = self.get_stdout() cuda_sanity_skipped = r"INFO Skipping CUDA sanity check, as no CUDA compute capabilities were configured" cuda_sanity_skipped_regex = 
re.compile(cuda_sanity_skipped, re.M) msg = "Pattern %s not found in full build log: %s" % (cuda_sanity_skipped, outtxt) @@ -3478,22 +3454,17 @@ def test_toy_cuda_sanity_check(self): # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + stdout = self.get_stdout() no_cuda_pattern = r".*/bin/toy does not appear to be a CUDA executable \(no CUDA device code found\), " no_cuda_pattern += r"so skipping CUDA sanity check" no_cuda_regex = re.compile(no_cuda_pattern, re.M) msg = "Pattern %s not found in full build log: %s" % (no_cuda_pattern, outtxt) self.assertTrue(no_cuda_regex.search(outtxt), msg) - expected_summary = r"^Number of CUDA files checked: 0$\n" - expected_summary += r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0$" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) expected_result_pattern = "INFO Sanity check for toy successful" expected_result = re.compile(expected_result_pattern, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) self.assertTrue(expected_result.search(outtxt), msg) + assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=0, log=outtxt, stdout=stdout, num_checked=0) # Test case 11: same as Test case 10, but add --cuda-sanity-check-error-on-fail # This should pass: if it's not a CUDA binary, it shouldn't fail the CUDA sanity check @@ -3501,22 +3472,17 @@ def test_toy_cuda_sanity_check(self): # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = 
self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + stdout = self.get_stdout() no_cuda_pattern = r".*/bin/toy does not appear to be a CUDA executable \(no CUDA device code found\), " no_cuda_pattern += r"so skipping CUDA sanity check" no_cuda_regex = re.compile(no_cuda_pattern, re.M) msg = "Pattern %s not found in full build log: %s" % (no_cuda_pattern, outtxt) self.assertTrue(no_cuda_regex.search(outtxt), msg) - expected_summary = r"^Number of CUDA files checked: 0$\n" - expected_summary += r"^Number of files missing one or more CUDA Compute Capabilities: 0$\n" - expected_summary += r"^Number of files with device code for more CUDA Compute Capabilities than requested: 0$\n" - expected_summary += r"^Number of files missing PTX code for the highest configured CUDA Compute Capability: 0$" - expected_summary_regex = re.compile(expected_summary, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_summary, outtxt) - self.assertTrue(expected_summary_regex.search(outtxt), msg) expected_result_pattern = "INFO Sanity check for toy successful" expected_result = re.compile(expected_result_pattern, re.M) msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) self.assertTrue(expected_result.search(outtxt), msg) + assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=0, log=outtxt, stdout=stdout, num_checked=0) # Restore original environment modify_env(os.environ, start_env, verbose=False) From 4b78bd7f8adcd363d1f7316ff6630fa48343accf Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Tue, 13 May 2025 20:57:30 +0200 Subject: [PATCH 095/114] Fix hound issues --- test/framework/toy_build.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 5038db6640..2a9f7a8aac 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3232,7 +3232,7 @@ def test_toy_cuda_sanity_check(self): # Predefine a function 
that takes a pattern, creates a regex, searches if it's found in the log # Also, check if it's found in stdout, if defined # If either of these fail their assert, print an informative, standardized message - def assert_regex(pattern, log, stdout = None): + def assert_regex(pattern, log, stdout=None): regex = re.compile(pattern, re.M) msg = "Pattern %s not found in full build log: %s" % (pattern, log) self.assertTrue(regex.search(log), msg) @@ -3240,8 +3240,8 @@ def assert_regex(pattern, log, stdout = None): msg2 = "Pattern %s not found in standard output: %s" % (pattern, stdout) self.assertTrue(regex.search(stdout), msg2) - def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout = None, missing_cc_but_ptx = None, - num_checked = None): + def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, missing_cc_but_ptx=None, + num_checked=None): if num_checked is not None: num_checked_str = r"Number of CUDA files checked: %s" % num_checked assert_regex(num_checked_str, outtxt, stdout) From 581767c3a72e42deb98ff5845501e9b68f6eaaf2 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Wed, 14 May 2025 14:20:17 +0200 Subject: [PATCH 096/114] Apply suggestions from code review --- easybuild/framework/easyblock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index f512863e48..3dbf3d22f3 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3429,7 +3429,7 @@ def format_file_list(files_list): # --cuda-sanity-check-strict is True (otherwise, it's a warning) # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities # is considered a failure, unless --cuda-sanity-check-accept-missing-ptx is True (in which - # case it is a warning) + # case it is a warning) # If found_dev_code_ccs is None, but found_ptx_ccs isn't, or vice versa, it IS a CUDA file # but there was simply no device/ptx code, 
respectively. So, make that an empty list @@ -3497,7 +3497,7 @@ def format_file_list(files_list): else: self.log.warning(fail_msg) - # Both additional_devcodes and missing_devcodes could be try, so use if, not elif + # Both additional_devcodes and missing_devcodes could exist, so use if, not elif if missing_devcodes: # One or more device code architectures requested in cuda-compute-capabilities was # not found in the binary From f027c515aacaa5332c8956fc29cdc9b448091a26 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 15 May 2025 15:46:25 +0200 Subject: [PATCH 097/114] Apply suggestions from code review --- easybuild/framework/easyblock.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 3dbf3d22f3..e83671a84e 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3523,7 +3523,7 @@ def format_file_list(files_list): is_failure = False else: # If there are CCs for which there is no suiteable PTX that can be JIT-compiled - # from, this is considerd a failure + # from, this is considered a failure files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in From 8b6c40d2a38006b6bd382a162cd5487c09083053 Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 15 May 2025 15:47:42 +0200 Subject: [PATCH 098/114] Apply suggestions from code review --- easybuild/tools/systemtools.py | 1 - 1 file changed, 1 deletion(-) diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index f74ad7631f..f471b7dd86 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -1034,7 +1034,6 @@ def get_cuda_object_dump_raw(path): else: # This should not happen: there was no string saying this was NOT a CUDA file, yet no device code # was found at all - msg = "Dumping CUDA binary file information for 
'%s' via '%s' failed! Output: '%s'" raise EasyBuildError(msg, path, cuda_cmd, res.output) From d6620d4d575d10cd8e130860240a4d2b16ab364e Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 15 May 2025 16:15:43 +0200 Subject: [PATCH 099/114] Apply suggestions from code review --- easybuild/framework/easyblock.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index e83671a84e..dd37ee3182 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3602,7 +3602,7 @@ def trace_and_log(msg): elif ignore_failures: msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-fail-on-error, so not considered failures)") + trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") else: msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" msg += f" (ignored: {len(files_missing_devcode_ignored)}, " @@ -3619,7 +3619,7 @@ def trace_and_log(msg): msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " msg += f"{len(files_additional_devcode)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-fail-on-error, so not considered failures)") + trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") elif strict_cc_check: msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_devcode_ignored)}, " @@ -3636,7 +3636,7 @@ def trace_and_log(msg): msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " msg += f"{len(files_missing_ptx)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-fail-on-error, so not 
considered failures)") + trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") elif accept_missing_ptx: msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " msg += f"{len(files_missing_ptx)}" From 39e5561ea82667b178ca0a9971bac8113f39ede7 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Thu, 15 May 2025 17:18:37 +0200 Subject: [PATCH 100/114] Update easybuild/framework/easyblock.py Co-authored-by: ocaisa --- easybuild/framework/easyblock.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index dd37ee3182..5d715b5a67 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3680,16 +3680,9 @@ def trace_and_log(msg): msg += "you requested in --cuda-compute-capabilities), but that may be acceptable to you." self.log.info(msg) if ( - build_option('debug') and - (len(files_missing_devcode) > 0 or len(files_additional_devcode) > 0 or len(files_missing_ptx) > 0) - ): - trace_and_log("See build log for detailed lists of files not passing the CUDA Sanity Check") - elif ( len(files_missing_devcode) > 0 or len(files_additional_devcode) > 0 or len(files_missing_ptx) > 0 ): - msg = "To get a detailed list of files not passing the CUDA Sanity Check in the build log, " - msg += "rerun with --debug." 
- trace_and_log(msg) + trace_and_log("See build log for detailed lists of files not passing the CUDA Sanity Check") # Long report, which prints the files that have potential issues summary_msg_files = f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities:" From 2bbfff97d7349eed436a1b14d8bb94d982bb3641 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen <33718780+casparvl@users.noreply.github.com> Date: Thu, 15 May 2025 17:33:20 +0200 Subject: [PATCH 101/114] Update easybuild/framework/easyblock.py Decide to run CUDA sanity check not based on whether CUDA is loaded, but based on whether it is in the dependency list. It doesn't have a downside for main EB users, but the upside for EESSI is that it will still run the sanity check (even if CUDA is not loaded, since we demote it to a build time dep) Co-authored-by: ocaisa --- easybuild/framework/easyblock.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 5d715b5a67..646ea7b4fe 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -4293,16 +4293,18 @@ def xs2str(xs): else: self.log.debug("Skipping RPATH sanity check") - if get_software_root('CUDA'): + if 'CUDA' in [dep['name'] for dep in self.cfg.dependencies()]: if shutil.which('cuobjdump'): cuda_fails = self.sanity_check_cuda() if cuda_fails: self.log.warning("CUDA device code sanity check failed!") self.sanity_check_fail_msgs.extend(cuda_fails) else: - raise EasyBuildError("Failed to execute CUDA sanity check: cuobjdump not found") + msg = "Failed to execute CUDA sanity check: cuobjdump not found\n" + msg += "CUDA module must be loaded for sanity check (or cuobjdump available in PATH)" + raise EasyBuildError(msg) else: - self.log.debug("Skipping CUDA sanity check: CUDA module was not loaded") + self.log.debug("Skipping CUDA sanity check: CUDA is not in dependencies") # pass or fail if 
self.sanity_check_fail_msgs: From f009b7d6887de24e9949151b77416700eaae5355 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 15 May 2025 21:42:36 +0200 Subject: [PATCH 102/114] Fix unit tests for the new setup where we check if CUDA is a dep, instead of checking that EBROOTCUDA is set --- test/framework/toy_build.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 2a9f7a8aac..d1e9ba70a5 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3121,6 +3121,10 @@ def test_toy_cuda_sanity_check(self): topdir = os.path.dirname(os.path.abspath(__file__)) toy_ec = os.path.join(topdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb') + toy_ec_cuda = os.path.join(self.test_prefix, 'toy-0.0-cuda.eb') + write_file(toy_ec_cuda, read_file(toy_ec) + "\ndependencies = [('CUDA', '5.5.22', '', SYSTEM)]") + toy_ec = toy_ec_cuda + # Create mock cuobjdump # First, lets define sections of echo's for cuobjdump for various scenarios @@ -3223,6 +3227,14 @@ def test_toy_cuda_sanity_check(self): # Add cuobjdump_dir to the path setvar('PATH', '%s:%s' % (cuobjdump_dir, os.getenv('PATH'))) + # Pretend the CUDA dep is already installed + module_dir = os.path.join(self.test_prefix, 'modules', 'all') + mkdir(module_dir, parents=True) + cuda_mod_dir = os.path.join(module_dir, 'CUDA') + cuda_mod_file = os.path.join(cuda_mod_dir, '5.5.22.lua') + write_file(cuda_mod_file, "-- Fake module content for CUDA") + setvar('MODULEPATH', module_dir) + # Pretend we have CUDA loaded, or the sanity check won't run setvar('EBROOTCUDA', '/foo/bar') @@ -3384,11 +3396,8 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 7: same as Test case 6, but add the failing file to the cuda_sanity_ignore_files # This is expected to succeed: the individual file which _would_ cause the sanity check to fail is # now on the ignore list - topdir = 
os.path.dirname(os.path.abspath(__file__)) - toy_ec_file = os.path.join(topdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb') - toy_whitelist_ec = os.path.join(self.test_prefix, 'toy-0.0-cuda-whitelist.eb') - write_file(toy_whitelist_ec, read_file(toy_ec_file) + '\ncuda_sanity_ignore_files = ["bin/toy"]') + write_file(toy_whitelist_ec, read_file(toy_ec) + '\ncuda_sanity_ignore_files = ["bin/toy"]') args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] From a901ba5a8395b931ced883d45a75b76396d40bd3 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 15 May 2025 21:43:26 +0200 Subject: [PATCH 103/114] Don't set EBROOTCUDA anymore, it's no longer needed --- test/framework/toy_build.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index d1e9ba70a5..2100040b7c 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3235,9 +3235,6 @@ def test_toy_cuda_sanity_check(self): write_file(cuda_mod_file, "-- Fake module content for CUDA") setvar('MODULEPATH', module_dir) - # Pretend we have CUDA loaded, or the sanity check won't run - setvar('EBROOTCUDA', '/foo/bar') - # Filepath to cuobjdump cuobjdump_file = os.path.join(cuobjdump_dir, 'cuobjdump') From e304b17ee19c80d866e851fde16b70d99cc53466 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 15 May 2025 22:36:18 +0200 Subject: [PATCH 104/114] Keep failure message short: just list the number of files, and refer to the build log --- easybuild/framework/easyblock.py | 82 +++++++++++++++++--------------- 1 file changed, 43 insertions(+), 39 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 646ea7b4fe..f713013481 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3457,15 +3457,15 @@ def format_file_list(files_list): ignore_msg = f"This 
failure will be ignored as '{path}' is listed in " ignore_msg += "'cuda_sanity_ignore_files'." + # Boolean to track if check has failed + is_failure = False + if not missing_devcodes and not additional_devcodes: # Device code for all architectures requested in --cuda-compute-capabilities was found msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " "those in cuda_compute_capabilities") self.log.debug(msg) else: - # Set default failure status and empty message - is_failure = False - if additional_devcodes: # Device code found for more architectures than requested in cuda-compute-capabilities fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " @@ -3480,22 +3480,12 @@ def format_file_list(files_list): # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg - is_failure = False else: # Sanity error files_additional_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True - else: - is_failure = False # Do reporting for the additional_devcodes case - # If considered a failure, append to fails so that a sanity error will be thrown - # Otherwise, log a warning - # Note that we report on the additional_devcodes and missing_devices cases separately - # Because one could be a failure, while the other isn't - if is_failure: - fail_msgs.append(fail_msg) - else: - self.log.warning(fail_msg) + self.log.warning(fail_msg) # Both additional_devcodes and missing_devcodes could exist, so use if, not elif if missing_devcodes: @@ -3520,7 +3510,6 @@ def format_file_list(files_list): # failure if all(comparisons): files_missing_devcode_but_has_ptx.append(os.path.relpath(path, self.installdir)) - is_failure = False else: # If there are CCs for which there is no suiteable PTX that can be JIT-compiled # from, this is considered a failure @@ -3531,7 +3520,6 @@ def 
format_file_list(files_list): # --disable-cuda-sanity-check-error-on-fail files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg - is_failure = False else: # Sanity error files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) @@ -3545,18 +3533,12 @@ def format_file_list(files_list): # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg - is_failure = False else: # Sanity error files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) is_failure = True # Do reporting for the missing_devcodes case - # If considered a failure, append to fails so that a sanity error will be thrown - # Otherwise, log a warning - if is_failure: - fail_msgs.append(fail_msg) - else: - self.log.warning(fail_msg) + self.log.warning(fail_msg) # Check whether there is ptx code for the highest CC in cfg_ccs # Make sure to use LooseVersion so that e.g. 
9.0 < 9.0a < 9.2 < 9.10 @@ -3581,7 +3563,8 @@ def format_file_list(files_list): else: # Sanity error files_missing_ptx_fails.append(os.path.relpath(path, self.installdir)) - fail_msgs.append(fail_msg % (highest_cc[0], path, found_ptx_ccs)) + self.log.warning(fail_msg % (highest_cc[0], path, found_ptx_ccs)) + is_failure = True else: msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " "least) the highest CUDA compute capability in cuda_compute_capabilities") @@ -3685,25 +3668,46 @@ def trace_and_log(msg): trace_and_log("See build log for detailed lists of files not passing the CUDA Sanity Check") # Long report, which prints the files that have potential issues - summary_msg_files = f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities:" - summary_msg_files += f"{format_file_list(files_missing_devcode)}\n" - summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files:" - summary_msg_files += f"{format_file_list(files_missing_devcode_ignored)})\n" + summary_msg_files = "" + if len(files_missing_devcode) > 0: + summary_msg_files += f"{len(files_missing_devcode)} files missing one or more CUDA compute capabilities:" + summary_msg_files += f"{format_file_list(files_missing_devcode)}\n" + if len(files_missing_devcode_ignored) > 0: + summary_msg_files += f"These failures are ignored for {len(files_missing_devcode_ignored)} files:" + summary_msg_files += f"{format_file_list(files_missing_devcode_ignored)}\n" if accept_ptx_as_devcode: summary_msg_files += f"{len(files_missing_devcode_but_has_ptx)} files missing one or more CUDA Compute " summary_msg_files += "Capabilities, but has suitable PTX code that can be JIT compiled for the requested " summary_msg_files += f"CUDA Compute Capabilities:{format_file_list(files_missing_devcode_but_has_ptx)}\n" - summary_msg_files += f"{len(files_additional_devcode)} files with device code for more CUDA Compute " - summary_msg_files += 
f"Capabilities than requested:{format_file_list(files_additional_devcode)}\n" - summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files:" - summary_msg_files += f"{format_file_list(files_additional_devcode_ignored)})\n" - summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute" - summary_msg_files += f" Capability:{format_file_list(files_missing_ptx)}\n" - summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files:" - summary_msg_files += f"{format_file_list(files_missing_ptx_ignored)})" - self.log.info(summary_msg_files) - - return fail_msgs + if len(files_additional_devcode) > 0: + summary_msg_files += f"{len(files_additional_devcode)} files with device code for more CUDA Compute " + summary_msg_files += f"Capabilities than requested:{format_file_list(files_additional_devcode)}\n" + if len(files_additional_devcode_ignored) > 0: + summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files:" + summary_msg_files += f"{format_file_list(files_additional_devcode_ignored)}\n" + if len(files_missing_ptx) > 0: + summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute" + summary_msg_files += f" Capability:{format_file_list(files_missing_ptx)}\n" + if len(files_missing_ptx_ignored) > 0: + summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files:" + summary_msg_files += f"{format_file_list(files_missing_ptx_ignored)}" + if summary_msg_files: + msg = "CUDA sanity check detailed report:\n" + msg += summary_msg_files + self.log.info(msg) + + fail_msg = [''] + if len(files_missing_devcode_fails) > 0: + fail_msg.append(f"Files missing CUDA device code: {len(files_missing_devcode_fails)}.") + if len(files_additional_devcode_fails) > 0: + fail_msg.append(f"Files with additional CUDA device code: 
{len(files_additional_devcode_fails)}.") + if len(files_missing_ptx_fails) > 0: + fail_msg.append(f"Files missing CUDA PTX code: {len(files_missing_ptx_fails)}.") + msg = "Check the build log for the 'CUDA sanity check detailed report' for a full list of files that failed " + msg += "to pass the sanity check." + fail_msg.append(msg) + + return fail_msg def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True): """Sanity check binaries/libraries w.r.t. RPATH linking.""" From 4273f16c286b5c3381e8d012696c7e6a1c157197 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 15 May 2025 22:44:59 +0200 Subject: [PATCH 105/114] Make defining a non-empty failure message conditional on an actual failure having occured --- easybuild/framework/easyblock.py | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index f713013481..c10df81f35 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3686,8 +3686,8 @@ def trace_and_log(msg): summary_msg_files += f"These failures are ignored for {len(files_additional_devcode_ignored)} files:" summary_msg_files += f"{format_file_list(files_additional_devcode_ignored)}\n" if len(files_missing_ptx) > 0: - summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA Compute" - summary_msg_files += f" Capability:{format_file_list(files_missing_ptx)}\n" + summary_msg_files += f"{len(files_missing_ptx)} files missing PTX code for the highest configured CUDA" + summary_msg_files += f" Compute Capability:{format_file_list(files_missing_ptx)}\n" if len(files_missing_ptx_ignored) > 0: summary_msg_files += f"These failures are ignored for {len(files_missing_ptx_ignored)} files:" summary_msg_files += f"{format_file_list(files_missing_ptx_ignored)}" @@ -3696,18 +3696,19 @@ def trace_and_log(msg): msg += summary_msg_files self.log.info(msg) - 
fail_msg = [''] - if len(files_missing_devcode_fails) > 0: - fail_msg.append(f"Files missing CUDA device code: {len(files_missing_devcode_fails)}.") - if len(files_additional_devcode_fails) > 0: - fail_msg.append(f"Files with additional CUDA device code: {len(files_additional_devcode_fails)}.") - if len(files_missing_ptx_fails) > 0: - fail_msg.append(f"Files missing CUDA PTX code: {len(files_missing_ptx_fails)}.") - msg = "Check the build log for the 'CUDA sanity check detailed report' for a full list of files that failed " - msg += "to pass the sanity check." - fail_msg.append(msg) - - return fail_msg + if is_failure: + fail_msgs = [''] + if len(files_missing_devcode_fails) > 0: + fail_msgs.append(f"Files missing CUDA device code: {len(files_missing_devcode_fails)}.") + if len(files_additional_devcode_fails) > 0: + fail_msgs.append(f"Files with additional CUDA device code: {len(files_additional_devcode_fails)}.") + if len(files_missing_ptx_fails) > 0: + fail_msgs.append(f"Files missing CUDA PTX code: {len(files_missing_ptx_fails)}.") + msg = "Check the build log for the 'CUDA sanity check detailed report' for a full list of files that failed " + msg += "to pass the sanity check." + fail_msgs.append(msg) + + return fail_msgs def sanity_check_rpath(self, rpath_dirs=None, check_readelf_rpath=True): """Sanity check binaries/libraries w.r.t. 
RPATH linking.""" From c2cdb872fe7816e7329e2a19068bb164bf6a1c94 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 15 May 2025 22:48:07 +0200 Subject: [PATCH 106/114] We don't need to track with is_failure, we can just check if any of the *_fails file lists is non-empty --- easybuild/framework/easyblock.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index c10df81f35..58ffdc70b7 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3457,9 +3457,6 @@ def format_file_list(files_list): ignore_msg = f"This failure will be ignored as '{path}' is listed in " ignore_msg += "'cuda_sanity_ignore_files'." - # Boolean to track if check has failed - is_failure = False - if not missing_devcodes and not additional_devcodes: # Device code for all architectures requested in --cuda-compute-capabilities was found msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " @@ -3483,7 +3480,6 @@ def format_file_list(files_list): else: # Sanity error files_additional_devcode_fails.append(os.path.relpath(path, self.installdir)) - is_failure = True # Do reporting for the additional_devcodes case self.log.warning(fail_msg) @@ -3523,7 +3519,6 @@ def format_file_list(files_list): else: # Sanity error files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) - is_failure = True else: # Device code was missing, and we're not accepting PTX code as alternative # This is considered a failure @@ -3536,7 +3531,6 @@ def format_file_list(files_list): else: # Sanity error files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) - is_failure = True # Do reporting for the missing_devcodes case self.log.warning(fail_msg) @@ -3564,7 +3558,6 @@ def format_file_list(files_list): # Sanity error files_missing_ptx_fails.append(os.path.relpath(path, self.installdir)) self.log.warning(fail_msg % 
(highest_cc[0], path, found_ptx_ccs)) - is_failure = True else: msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " "least) the highest CUDA compute capability in cuda_compute_capabilities") @@ -3696,7 +3689,12 @@ def trace_and_log(msg): msg += summary_msg_files self.log.info(msg) - if is_failure: + # If any failure happened, compose a message to be raised as error + if ( + len(files_missing_devcode_fails) > 0 or + len(files_additional_devcode_fails) > 0 or + len(files_missing_ptx_fails) > 0 + ): fail_msgs = [''] if len(files_missing_devcode_fails) > 0: fail_msgs.append(f"Files missing CUDA device code: {len(files_missing_devcode_fails)}.") From e9ec5015155563dfb07e11605569fb71984cda2c Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 15 May 2025 22:49:11 +0200 Subject: [PATCH 107/114] Fix too long line --- easybuild/framework/easyblock.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 58ffdc70b7..72fe18d3d8 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3702,8 +3702,8 @@ def trace_and_log(msg): fail_msgs.append(f"Files with additional CUDA device code: {len(files_additional_devcode_fails)}.") if len(files_missing_ptx_fails) > 0: fail_msgs.append(f"Files missing CUDA PTX code: {len(files_missing_ptx_fails)}.") - msg = "Check the build log for the 'CUDA sanity check detailed report' for a full list of files that failed " - msg += "to pass the sanity check." + msg = "Check the build log for the 'CUDA sanity check detailed report' for a full list of files that " + msg += "failed to pass the sanity check." 
fail_msgs.append(msg) return fail_msgs From a17a42c207c9b903cda6cec863a352cb9f959432 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Thu, 15 May 2025 23:01:43 +0200 Subject: [PATCH 108/114] Fix unit tests to accomodate for the difference in the error message that is raised --- test/framework/toy_build.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 2100040b7c..484c1a2ff2 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3309,8 +3309,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # This is expected to fail since there is missing device code for CC80 args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-fail'] # We expect this to fail, so first check error, then run again to check output - error_pattern = r"Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " - error_pattern += r".*/bin/toy. Missing compute capabilities: 8.0." + error_pattern = r"Files missing CUDA device code: 1." 
with self.mocked_stdout_stderr(): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3343,9 +3342,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # This is expected to fail: device code is present, but PTX code for the highest CC (9.0) is missing args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail'] # We expect this to fail, so first check error, then run again to check output - error_pattern = r"Sanity check failed: Configured highest compute capability was '9\.0', " - error_pattern += r"but no PTX code for this compute capability was found in '.*/bin/toy' " - error_pattern += r"\(PTX architectures supported in that file: \['8\.0'\]\)" + error_pattern = r"Files missing CUDA PTX code: 1" with self.mocked_stdout_stderr(): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3379,8 +3376,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to fail, so first check error, then run again to check output - error_pattern = r"Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " - error_pattern += r".*/bin/toy. 
Additional compute capabilities: 7\.0" + error_pattern = r"Files with additional CUDA device code: 1" with self.mocked_stdout_stderr(): self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3399,8 +3395,6 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to succeed, so check output for expected patterns - error_pattern = r"Sanity check failed: Mismatch between cuda_compute_capabilities and device code in " - error_pattern += r".*/bin/toy\. Additional compute capabilities: 7\.0" with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_whitelist_ec, extra_args=args, raise_error=True, verify=False) stdout = self.get_stdout() From 190156bab7886fcf9fd3670927038295afc49c5a Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 16 May 2025 13:20:17 +0200 Subject: [PATCH 109/114] rename --cuda-sanity-check-error-on-fail to --cuda-sanity-check-error-on-failed-checks + improve help text for --cuda-sanity-check-* configuration options --- easybuild/framework/easyblock.py | 2 +- easybuild/tools/config.py | 2 +- easybuild/tools/options.py | 29 +++++++++++++++-------------- easybuild/tools/systemtools.py | 2 ++ 4 files changed, 19 insertions(+), 16 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 72fe18d3d8..95c3bfb402 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3360,7 +3360,7 @@ def sanity_check_cuda(self, cuda_dirs=None): fail_msgs = [] cfg_ccs = build_option('cuda_compute_capabilities') or self.cfg.get('cuda_compute_capabilities', None) - ignore_failures = not build_option('cuda_sanity_check_error_on_fail') + ignore_failures = not build_option('cuda_sanity_check_error_on_failed_checks') 
strict_cc_check = build_option('cuda_sanity_check_strict') accept_ptx_as_devcode = build_option('cuda_sanity_check_accept_ptx_as_devcode') accept_missing_ptx = build_option('cuda_sanity_check_accept_missing_ptx') diff --git a/easybuild/tools/config.py b/easybuild/tools/config.py index b9ddaff7ba..609295b94f 100644 --- a/easybuild/tools/config.py +++ b/easybuild/tools/config.py @@ -299,7 +299,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX): 'container_build_image', 'cuda_sanity_check_accept_ptx_as_devcode', 'cuda_sanity_check_accept_missing_ptx', - 'cuda_sanity_check_error_on_fail', + 'cuda_sanity_check_error_on_failed_checks', 'cuda_sanity_check_strict', 'debug', 'debug_lmod', diff --git a/easybuild/tools/options.py b/easybuild/tools/options.py index 6be7ed3029..62443a4303 100644 --- a/easybuild/tools/options.py +++ b/easybuild/tools/options.py @@ -406,30 +406,31 @@ def override_options(self): "--cuda-sanity-check-accept-ptx-as-devcode, " "or made more stringent using --cuda-sanity-check-strict.", 'strlist', 'extend', None), - 'cuda-sanity-check-accept-missing-ptx': ("CUDA sanity check also passes if PTX code for the highest " + 'cuda-sanity-check-accept-missing-ptx': ("Relax CUDA sanity check to accept that PTX code for the highest " "requested CUDA compute capability is not present (but will " "print a warning)", None, 'store_true', False), - 'cuda-sanity-check-accept-ptx-as-devcode': ("CUDA sanity check also passes if requested device code is " - "not present, as long as PTX code is present that can be " - "JIT-compiled for each target in --cuda-compute-capabilities " - "E.g. if --cuda-compute-capabilities=8.0 and a binary is " - "found in the installation that does not have device code for " - "8.0, but it does have PTX code for 7.0, the sanity check " - "will pass if, and only if, this option is True. 
" + 'cuda-sanity-check-accept-ptx-as-devcode': ("Relax CUDA sanity check to accept that requested device code " + "is not present, as long as PTX code is present that can be " + "JIT-compiled for each target in --cuda-compute-capabilities. " + "For example, if --cuda-compute-capabilities=8.0 and a binary " + "is found in the installation that does not have device code " + "for 8.0, but it does have PTX code for 7.0, the sanity check " + "will pass if, and only if, this option is enabled. " "Note that JIT-compiling means the binary will work on the " "requested architecture, but is it not necessarily as well " "optimized as when actual device code is present for the " "requested architecture ", None, 'store_true', False), - 'cuda-sanity-check-error-on-fail': ("If True, failures in the CUDA sanity check will produce an error. " - "If False, the CUDA sanity check will be performed, and failures will " - "be reported, but they will not result in an error", - None, 'store_true', False), + 'cuda-sanity-check-error-on-failed-checks': ("If enabled, failures in the CUDA sanity check will produce " + "an error. If disabled, the CUDA sanity check will be " + "performed and failures will be reported through warnings, " + "but they will not result in an error", + None, 'store_true', False), 'cuda-sanity-check-strict': ("Perform strict CUDA sanity check. Without this option, the CUDA sanity " "check will fail if the CUDA binaries don't contain code for (at least) " - "all compute capabilities defined in --cude-compute-capabilities, but will " - "accept if code for additional compute capabilities is present. " + "all compute capabilities defined in --cude-compute-capabilities, " + "but will accept if code for additional compute capabilities is present. 
" "With this setting, the sanity check will also fail if code is present for " "more compute capabilities than defined in --cuda-compute-capabilities.", None, 'store_true', False), diff --git a/easybuild/tools/systemtools.py b/easybuild/tools/systemtools.py index f471b7dd86..634e8fdb34 100644 --- a/easybuild/tools/systemtools.py +++ b/easybuild/tools/systemtools.py @@ -27,10 +27,12 @@ Authors: +* Kenneth Hoste (Ghent University) * Jens Timmerman (Ghent University) * Ward Poelmans (Ghent University) * Jasper Grimm (UoY) * Jan Andre Reuter (Forschungszentrum Juelich GmbH) +* Caspar van Leeuwen (SURF) """ import csv import ctypes From b14ccebbd74a45f555f17330c5cb1f49c5effb21 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 16 May 2025 13:53:40 +0200 Subject: [PATCH 110/114] Add fake modulefile for CUDA in Tcl format as well --- test/framework/toy_build.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 484c1a2ff2..176e1b732b 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3232,7 +3232,9 @@ def test_toy_cuda_sanity_check(self): mkdir(module_dir, parents=True) cuda_mod_dir = os.path.join(module_dir, 'CUDA') cuda_mod_file = os.path.join(cuda_mod_dir, '5.5.22.lua') + cuda_mod_file_tcl = os.path.join(cuda_mod_dir, '5.5.22') write_file(cuda_mod_file, "-- Fake module content for CUDA") + write_file(cuda_mod_file_tcl, "#%Module1.0\n#This is a fake module file for CUDA") setvar('MODULEPATH', module_dir) # Filepath to cuobjdump From abc108be76ceff1d42e8f9232f1fcbdc312f6ba3 Mon Sep 17 00:00:00 2001 From: Caspar van Leeuwen Date: Fri, 16 May 2025 13:56:29 +0200 Subject: [PATCH 111/114] Spread over two writes --- test/framework/toy_build.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 176e1b732b..08fe11bb56 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ 
-3234,7 +3234,8 @@ def test_toy_cuda_sanity_check(self): cuda_mod_file = os.path.join(cuda_mod_dir, '5.5.22.lua') cuda_mod_file_tcl = os.path.join(cuda_mod_dir, '5.5.22') write_file(cuda_mod_file, "-- Fake module content for CUDA") - write_file(cuda_mod_file_tcl, "#%Module1.0\n#This is a fake module file for CUDA") + write_file(cuda_mod_file_tcl, "#%Module1.0") + write_file(cuda_mod_file_tcl, "#This is a fake module file for CUDA", append=True) setvar('MODULEPATH', module_dir) # Filepath to cuobjdump From 22858ec74b416db024685f20cf9891eee051b71c Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 16 May 2025 14:58:33 +0200 Subject: [PATCH 112/114] also rename to --cuda-sanity-check-error-on-failed-checks in comments, trace/log messages, and tests --- easybuild/framework/easyblock.py | 24 ++++++++++++------------ test/framework/toy_build.py | 30 +++++++++++++++--------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index 95c3bfb402..e9cabc9724 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3446,13 +3446,13 @@ def format_file_list(files_list): missing_devcodes = list(set(cfg_ccs) - set(found_dev_code_ccs)) # There are two reasons for ignoring failures: - # - We are running with --disable-cuda-sanity-check-error-on-fail + # - We are running with --disable-cuda-sanity-check-error-on-failed-checks # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig # In case we run with both, we'll just report that we're running with - # --disable-cuda-sanity-check-error-on-fail + # --disable-cuda-sanity-check-error-on-failed-checks if ignore_failures: ignore_msg = f"Failure for {path} will be ignored since we are not running with " - ignore_msg += "--cuda-sanity-check-error-on-fail" + ignore_msg += "--cuda-sanity-check-error-on-failed-checks" else: ignore_msg = f"This failure will be ignored as '{path}' is listed in " 
ignore_msg += "'cuda_sanity_ignore_files'." @@ -3473,8 +3473,8 @@ def format_file_list(files_list): if strict_cc_check: # cuda-sanity-check-strict, so no additional compute capabilities allowed if path in ignore_file_list or ignore_failures: - # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + # No error, either path is in cuda_sanity_ignore_files list in easyconfig, + # or we are running with --disable-cuda-sanity-check-error-on-failed-checks files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg else: @@ -3513,7 +3513,7 @@ def format_file_list(files_list): if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in # the easyconfig, or we are running with - # --disable-cuda-sanity-check-error-on-fail + # --disable-cuda-sanity-check-error-on-failed-checks files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg else: @@ -3524,8 +3524,8 @@ def format_file_list(files_list): # This is considered a failure files_missing_devcode.append(os.path.relpath(path, self.installdir)) if path in ignore_file_list or ignore_failures: - # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + # No error, either path is in cuda_sanity_ignore_files list in easyconfig, + # or we are running with --disable-cuda-sanity-check-error-on-failed-checks files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg else: @@ -3547,7 +3547,7 @@ def format_file_list(files_list): fail_msg += "(PTX architectures supported in that file: %s). 
" if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in the - # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-fail + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-failed-checks files_missing_ptx_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg self.log.warning(fail_msg, highest_cc[0], path, found_ptx_ccs) @@ -3578,7 +3578,7 @@ def trace_and_log(msg): elif ignore_failures: msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") + trace_and_log("(not running with --cuda-sanity-check-error-on-failed-checks, so not considered failures)") else: msg = f"Number of files missing one or more CUDA Compute Capabilities: {len(files_missing_devcode)}" msg += f" (ignored: {len(files_missing_devcode_ignored)}, " @@ -3595,7 +3595,7 @@ def trace_and_log(msg): msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " msg += f"{len(files_additional_devcode)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") + trace_and_log("(not running with --cuda-sanity-check-error-on-failed-checks, so not considered failures)") elif strict_cc_check: msg = "Number of files with device code for more CUDA Compute Capabilities than requested: " msg += f"{len(files_additional_devcode)} (ignored: {len(files_additional_devcode_ignored)}, " @@ -3612,7 +3612,7 @@ def trace_and_log(msg): msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " msg += f"{len(files_missing_ptx)}" trace_and_log(msg) - trace_and_log("(not running with --cuda-sanity-check-error-on-fail, so not considered failures)") + trace_and_log("(not running with 
--cuda-sanity-check-error-on-failed-checks, so not considered failures)") elif accept_missing_ptx: msg = "Number of files missing PTX code for the highest configured CUDA Compute Capability: " msg += f"{len(files_missing_ptx)}" diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index 484c1a2ff2..bfa8b4de4a 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3270,7 +3270,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 1a: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains # 8.0 device code - # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) + # This should succeed (since the default for --cuda-sanity-check-error-on-failed-checks is False) # as to not break backwards compatibility write_file(cuobjdump_file, cuobjdump_txt_shebang), write_file(cuobjdump_file, cuobjdump_txt_sm80, append=True) @@ -3287,7 +3287,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Note that the difference with 1a is the presense of additional device code, PTX code foor the right # architecture, but missing device code for the requested architecture # It should not matter for the result, but triggers slightly different code paths in easyblock.py - # This should succeed (since the default for --cuda-sanity-check-error-on-fail is False) + # This should succeed (since the default for --cuda-sanity-check-error-on-failed-checks is False) # as to not break backwards compatibility write_file(cuobjdump_file, cuobjdump_txt_shebang), write_file(cuobjdump_file, cuobjdump_txt_sm90, append=True) @@ -3305,9 +3305,9 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) assert_cuda_report(missing_cc=1, additional_cc=1, missing_ptx=0, log=outtxt, stdout=stdout) - # Test case 2: same as Test case 1, but add 
--cuda-sanity-check-error-on-fail + # Test case 2: same as Test case 1, but add --cuda-sanity-check-error-on-failed-checks # This is expected to fail since there is missing device code for CC80 - args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-fail'] + args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to fail, so first check error, then run again to check output error_pattern = r"Files missing CUDA device code: 1." with self.mocked_stdout_stderr(): @@ -3325,7 +3325,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # This is expected to succeed, since now the PTX code for CC80 will be accepted as # device code. Note that also PTX code for the highest requested compute architecture (also CC80) # is present, so also this part of the sanity check passes - args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-ptx-as-devcode'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): @@ -3340,7 +3340,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 4: same as Test case 2, but run with --cuda-compute-capabilities=9.0 # This is expected to fail: device code is present, but PTX code for the highest CC (9.0) is missing - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail'] + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to fail, so first check error, then run again to check output error_pattern = r"Files missing CUDA PTX code: 1" with self.mocked_stdout_stderr(): @@ -3354,7 +3354,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 5: same as Test case 4, but add --cuda-sanity-check-accept-missing-ptx # This is 
expected to succeed: device code is present, PTX code is missing, but that's accepted - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-missing-ptx'] # We expect this to pass, so no need to check errors warning_pattern = r"Configured highest compute capability was '9\.0', " @@ -3373,7 +3373,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 6: same as Test case 5, but add --cuda-sanity-check-strict # This is expected to fail: device code is present, PTX code is missing (but accepted due to option) # but additional device code is present, which is not allowed by --cuda-sanity-check-strict - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to fail, so first check error, then run again to check output error_pattern = r"Files with additional CUDA device code: 1" @@ -3392,7 +3392,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, toy_whitelist_ec = os.path.join(self.test_prefix, 'toy-0.0-cuda-whitelist.eb') write_file(toy_whitelist_ec, read_file(toy_ec) + '\ncuda_sanity_ignore_files = ["bin/toy"]') - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to succeed, so check output for expected patterns with self.mocked_stdout_stderr(): @@ -3402,7 +3402,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) 
assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) - # Test case 8: try with --cuda-sanity-check-error-on-fail --cuda-compute-capabilities=9.0,9.0a + # Test case 8: try with --cuda-sanity-check-error-on-failed-checks --cuda-compute-capabilities=9.0,9.0a # and --cuda-sanity-check-strict # on a binary that contains 9.0 and 9.0a device code, and 9.0a ptx code. This tests the correct # ordering (i.e. 9.0a > 9.0). It should pass, since device code is present for both CCs and PTX @@ -3413,7 +3413,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, write_file(cuobjdump_file, cuobjdump_txt_sm90a, append=True) write_file(cuobjdump_file, cuobjdump_txt_sm90a_ptx, append=True) adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable - args = ['--cuda-compute-capabilities=9.0,9.0a', '--cuda-sanity-check-error-on-fail', + args = ['--cuda-compute-capabilities=9.0,9.0a', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-strict'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): @@ -3431,7 +3431,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, # Test case 9: same as 8, but no --cuda-compute-capabilities are defined # We expect this to lead to a skip of the CUDA sanity check, and a success for the overall sanity check - args = ['--cuda-sanity-check-error-on-fail', '--cuda-sanity-check-strict'] + args = ['--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-strict'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) @@ -3446,7 +3446,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, self.assertTrue(expected_result.search(outtxt), msg) # Test case 10: running with default options and a binary that does not contain 
ANY CUDA device code - # This is expected to succeed, since the default is --disable-cuda-sanity-check-error-on-fail + # This is expected to succeed, since the default is --disable-cuda-sanity-check-error-on-failed-checks write_file(cuobjdump_file, cuobjdump_txt_shebang) write_file(cuobjdump_file, cuobjdump_txt_no_cuda, append=True) adjust_permissions(cuobjdump_file, stat.S_IXUSR, add=True) # Make sure our mock cuobjdump is executable @@ -3466,9 +3466,9 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, self.assertTrue(expected_result.search(outtxt), msg) assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=0, log=outtxt, stdout=stdout, num_checked=0) - # Test case 11: same as Test case 10, but add --cuda-sanity-check-error-on-fail + # Test case 11: same as Test case 10, but add --cuda-sanity-check-error-on-failed-checks # This should pass: if it's not a CUDA binary, it shouldn't fail the CUDA sanity check - args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-fail'] + args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) From ceacffaca771d784d885e5b557bb5cf607b4e341 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 16 May 2025 15:09:34 +0200 Subject: [PATCH 113/114] also consider shared libraries under lib/python*/site-packages in CUDA sanity check --- easybuild/framework/easyblock.py | 319 ++++++++++++++++--------------- 1 file changed, 167 insertions(+), 152 deletions(-) diff --git a/easybuild/framework/easyblock.py b/easybuild/framework/easyblock.py index e9cabc9724..3f9b83607f 100644 --- a/easybuild/framework/easyblock.py +++ b/easybuild/framework/easyblock.py @@ -3385,6 +3385,27 @@ def sanity_check_cuda(self, cuda_dirs=None): self.log.info("Using configured subdirectories for 
binaries/libraries to verify CUDA device code: %s", cuda_dirs) + # collect all files to consider + files_to_check = [] + for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: + if os.path.exists(dirpath): + self.log.debug(f"Sanity checking files for CUDA device code under directory {dirpath}:") + for entry in os.listdir(dirpath): + path = os.path.join(dirpath, entry) + if os.path.isfile(path): + self.log.debug("Sanity checking file {path} for CUDA device code") + files_to_check.append(path) + else: + self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") + + # also consider compiled Python modules as shared libraries (*.so) under lib/python*/site-packages + python_pkgs_path = os.path.join(self.installdir, 'lib', 'python*', 'site-packages') + shlib_ext = get_shared_lib_ext() + python_shared_libs = glob.glob(os.path.join(python_pkgs_path, '**', '*.' + shlib_ext), recursive=True) + if python_shared_libs: + self.log.debug("Sanity check shared libraries found in {python_pkgs_path}: {python_shared_libs}") + files_to_check.extend(python_shared_libs) + # Tracking number of CUDA files for a summary report: num_cuda_files = 0 @@ -3405,165 +3426,159 @@ def format_file_list(files_list): return "\n" + "\n".join(f" {f}" for f in files_list) # Looping through all files to check CUDA device and PTX code - for dirpath in [os.path.join(self.installdir, d) for d in cuda_dirs]: - if os.path.exists(dirpath): - self.log.debug(f"Sanity checking files for CUDA device code under folder {dirpath}") - - for path in [os.path.join(dirpath, x) for x in os.listdir(dirpath)]: - self.log.debug("Sanity checking for CUDA device code in %s", path) - - found_dev_code_ccs = get_cuda_architectures(path, 'elf') - found_ptx_ccs = get_cuda_architectures(path, 'ptx') - if found_dev_code_ccs is None and found_ptx_ccs is None: - msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " - msg += "so skipping CUDA sanity check." 
- self.log.debug(msg) - else: - # Here, we check if CUDA device code is present for all compute capabilities in - # --cuda-compute-capabilities for the file pointed to by 'path' - # We also check for the presence of ptx code for the highest CUDA compute capability - # The following is considered fail/warning/success: - # - Missing device code is considered a failure (unless there is PTX code for - # a lower CC AND --accept-ptx-for-cc-support is True, in which case it is a warning) - # - Device code for additional compute capabilities is considered a failure if - # --cuda-sanity-check-strict is True (otherwise, it's a warning) - # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities - # is considered a failure, unless --cuda-sanity-check-accept-missing-ptx is True (in which - # case it is a warning) - - # If found_dev_code_ccs is None, but found_ptx_ccs isn't, or vice versa, it IS a CUDA file - # but there was simply no device/ptx code, respectively. So, make that an empty list - # then continue - if found_dev_code_ccs is None: - found_dev_code_ccs = [] - elif found_ptx_ccs is None: - found_ptx_ccs = [] - - num_cuda_files += 1 - - # check whether device code architectures match cuda_compute_capabilities - additional_devcodes = list(set(found_dev_code_ccs) - set(cfg_ccs)) - missing_devcodes = list(set(cfg_ccs) - set(found_dev_code_ccs)) - - # There are two reasons for ignoring failures: - # - We are running with --disable-cuda-sanity-check-error-on-failed-checks - # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig - # In case we run with both, we'll just report that we're running with - # --disable-cuda-sanity-check-error-on-failed-checks - if ignore_failures: - ignore_msg = f"Failure for {path} will be ignored since we are not running with " - ignore_msg += "--cuda-sanity-check-error-on-failed-checks" - else: - ignore_msg = f"This failure will be ignored as '{path}' is listed in " - ignore_msg += 
"'cuda_sanity_ignore_files'." - - if not missing_devcodes and not additional_devcodes: - # Device code for all architectures requested in --cuda-compute-capabilities was found - msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " - "those in cuda_compute_capabilities") - self.log.debug(msg) - else: - if additional_devcodes: - # Device code found for more architectures than requested in cuda-compute-capabilities - fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " - # Count and log for summary report - files_additional_devcode.append(os.path.relpath(path, self.installdir)) - additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) - fail_msg += "Additional compute capabilities: %s. " % additional_devcode_str - if strict_cc_check: - # cuda-sanity-check-strict, so no additional compute capabilities allowed - if path in ignore_file_list or ignore_failures: - # No error, either path is in cuda_sanity_ignore_files list in easyconfig, - # or we are running with --disable-cuda-sanity-check-error-on-failed-checks - files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) - fail_msg += ignore_msg - else: - # Sanity error - files_additional_devcode_fails.append(os.path.relpath(path, self.installdir)) - # Do reporting for the additional_devcodes case - self.log.warning(fail_msg) - - # Both additional_devcodes and missing_devcodes could exist, so use if, not elif - if missing_devcodes: - # One or more device code architectures requested in cuda-compute-capabilities was - # not found in the binary - fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " - # Count and log for summary report - missing_devcodes_str = ', '.join(sorted(missing_devcodes, key=LooseVersion)) - fail_msg += "Missing compute capabilities: %s. 
" % missing_devcodes_str - # If accept_ptx_as_devcode, this might not be a failure IF there is suitable PTX - # code to JIT compile from that supports the CCs in missing_devcodes - if accept_ptx_as_devcode: - # Check that for each item in missing_devcodes there is PTX code for lower or equal - # CUDA compute capability - comparisons = [] - for cc in missing_devcodes: - has_smaller_equal_ptx = any( - LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in found_ptx_ccs - ) - comparisons.append(has_smaller_equal_ptx) - # Only if that's the case for ALL cc's in missing_devcodes, this is a warning, not a - # failure - if all(comparisons): - files_missing_devcode_but_has_ptx.append(os.path.relpath(path, self.installdir)) - else: - # If there are CCs for which there is no suiteable PTX that can be JIT-compiled - # from, this is considered a failure - files_missing_devcode.append(os.path.relpath(path, self.installdir)) - if path in ignore_file_list or ignore_failures: - # No error, because either path is on the cuda_sanity_ignore_files list in - # the easyconfig, or we are running with - # --disable-cuda-sanity-check-error-on-failed-checks - files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) - fail_msg += ignore_msg - else: - # Sanity error - files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) - else: - # Device code was missing, and we're not accepting PTX code as alternative - # This is considered a failure - files_missing_devcode.append(os.path.relpath(path, self.installdir)) - if path in ignore_file_list or ignore_failures: - # No error, either path is in cuda_sanity_ignore_files list in easyconfig, - # or we are running with --disable-cuda-sanity-check-error-on-failed-checks - files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) - fail_msg += ignore_msg - else: - # Sanity error - files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) - # Do reporting for the missing_devcodes 
case - self.log.warning(fail_msg) - - # Check whether there is ptx code for the highest CC in cfg_ccs - # Make sure to use LooseVersion so that e.g. 9.0 < 9.0a < 9.2 < 9.10 - highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] - missing_ptx_ccs = list(set(highest_cc) - set(found_ptx_ccs)) - - if missing_ptx_ccs: - # There is no PTX code for the highest compute capability in --cuda-compute-capabilities - files_missing_ptx.append(os.path.relpath(path, self.installdir)) - fail_msg = "Configured highest compute capability was '%s', " - fail_msg += "but no PTX code for this compute capability was found in '%s' " - fail_msg += "(PTX architectures supported in that file: %s). " + for path in files_to_check: + self.log.debug(f"Sanity checking for CUDA device code in {path}") + + found_dev_code_ccs = get_cuda_architectures(path, 'elf') + found_ptx_ccs = get_cuda_architectures(path, 'ptx') + if found_dev_code_ccs is None and found_ptx_ccs is None: + msg = f"{path} does not appear to be a CUDA executable (no CUDA device code found), " + msg += "so skipping CUDA sanity check." 
+ self.log.debug(msg) + else: + # Here, we check if CUDA device code is present for all compute capabilities in + # --cuda-compute-capabilities for the file pointed to by 'path' + # We also check for the presence of ptx code for the highest CUDA compute capability + # The following is considered fail/warning/success: + # - Missing device code is considered a failure (unless there is PTX code for + # a lower CC AND --accept-ptx-for-cc-support is True, in which case it is a warning) + # - Device code for additional compute capabilities is considered a failure if + # --cuda-sanity-check-strict is True (otherwise, it's a warning) + # - Missing PTX code for the highest CUDA compute capability in --cuda-compute-capabilities + # is considered a failure, unless --cuda-sanity-check-accept-missing-ptx is True (in which + # case it is a warning) + + # If found_dev_code_ccs is None, but found_ptx_ccs isn't, or vice versa, it IS a CUDA file + # but there was simply no device/ptx code, respectively. So, make that an empty list + # then continue + if found_dev_code_ccs is None: + found_dev_code_ccs = [] + elif found_ptx_ccs is None: + found_ptx_ccs = [] + + num_cuda_files += 1 + + # check whether device code architectures match cuda_compute_capabilities + additional_devcodes = list(set(found_dev_code_ccs) - set(cfg_ccs)) + missing_devcodes = list(set(cfg_ccs) - set(found_dev_code_ccs)) + + # There are two reasons for ignoring failures: + # - We are running with --disable-cuda-sanity-check-error-on-failed-checks + # - The specific {path} is on the cuda_sanity_ignore_files in the easyconfig + # In case we run with both, we'll just report that we're running with + # --disable-cuda-sanity-check-error-on-failed-checks + if ignore_failures: + ignore_msg = f"Failure for {path} will be ignored since we are not running with " + ignore_msg += "--cuda-sanity-check-error-on-failed-checks" + else: + ignore_msg = f"This failure will be ignored as '{path}' is listed in " + ignore_msg += 
"'cuda_sanity_ignore_files'." + + if not missing_devcodes and not additional_devcodes: + # Device code for all architectures requested in --cuda-compute-capabilities was found + msg = (f"Output of 'cuobjdump' checked for '{path}'; device code architectures match " + "those in cuda_compute_capabilities") + self.log.debug(msg) + else: + if additional_devcodes: + # Device code found for more architectures than requested in cuda-compute-capabilities + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " + # Count and log for summary report + files_additional_devcode.append(os.path.relpath(path, self.installdir)) + additional_devcode_str = ', '.join(sorted(additional_devcodes, key=LooseVersion)) + fail_msg += "Additional compute capabilities: %s. " % additional_devcode_str + if strict_cc_check: + # cuda-sanity-check-strict, so no additional compute capabilities allowed if path in ignore_file_list or ignore_failures: # No error, because either path is on the cuda_sanity_ignore_files list in the # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-failed-checks - files_missing_ptx_ignored.append(os.path.relpath(path, self.installdir)) + files_additional_devcode_ignored.append(os.path.relpath(path, self.installdir)) fail_msg += ignore_msg - self.log.warning(fail_msg, highest_cc[0], path, found_ptx_ccs) - elif accept_missing_ptx: - # No error, because we are running with --cuda-sanity-check-accept-missing-ptx - self.log.warning(fail_msg, highest_cc[0], path, found_ptx_ccs) else: # Sanity error - files_missing_ptx_fails.append(os.path.relpath(path, self.installdir)) - self.log.warning(fail_msg % (highest_cc[0], path, found_ptx_ccs)) + files_additional_devcode_fails.append(os.path.relpath(path, self.installdir)) + # Do reporting for the additional_devcodes case + self.log.warning(fail_msg) + + # Both additional_devcodes and missing_devcodes could exist, so use if, not elif + if missing_devcodes: + # One or more device code 
architectures requested in cuda-compute-capabilities was + # not found in the binary + fail_msg = f"Mismatch between cuda_compute_capabilities and device code in {path}. " + # Count and log for summary report + missing_devcodes_str = ', '.join(sorted(missing_devcodes, key=LooseVersion)) + fail_msg += "Missing compute capabilities: %s. " % missing_devcodes_str + # If accept_ptx_as_devcode, this might not be a failure IF there is suitable PTX + # code to JIT compile from that supports the CCs in missing_devcodes + if accept_ptx_as_devcode: + # Check that for each item in missing_devcodes there is PTX code for lower or equal + # CUDA compute capability + comparisons = [] + for cc in missing_devcodes: + has_smaller_equal_ptx = any( + LooseVersion(ptx_cc) <= LooseVersion(cc) for ptx_cc in found_ptx_ccs + ) + comparisons.append(has_smaller_equal_ptx) + # Only if that's the case for ALL cc's in missing_devcodes, this is a warning, not a + # failure + if all(comparisons): + files_missing_devcode_but_has_ptx.append(os.path.relpath(path, self.installdir)) + else: + # If there are CCs for which there is no suiteable PTX that can be JIT-compiled + # from, this is considered a failure + files_missing_devcode.append(os.path.relpath(path, self.installdir)) + if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in + # the easyconfig, or we are running with + # --disable-cuda-sanity-check-error-on-failed-checks + files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) + fail_msg += ignore_msg + else: + # Sanity error + files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) else: - msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " - "least) the highest CUDA compute capability in cuda_compute_capabilities") - self.log.debug(msg) - else: - self.log.debug(f"Not sanity checking files in non-existing directory {dirpath}") + # Device code was 
missing, and we're not accepting PTX code as alternative + # This is considered a failure + files_missing_devcode.append(os.path.relpath(path, self.installdir)) + if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in the + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-failed-checks + files_missing_devcode_ignored.append(os.path.relpath(path, self.installdir)) + fail_msg += ignore_msg + else: + # Sanity error + files_missing_devcode_fails.append(os.path.relpath(path, self.installdir)) + # Do reporting for the missing_devcodes case + self.log.warning(fail_msg) + + # Check whether there is ptx code for the highest CC in cfg_ccs + # Make sure to use LooseVersion so that e.g. 9.0 < 9.0a < 9.2 < 9.10 + highest_cc = [sorted(cfg_ccs, key=LooseVersion)[-1]] + missing_ptx_ccs = list(set(highest_cc) - set(found_ptx_ccs)) + + if missing_ptx_ccs: + # There is no PTX code for the highest compute capability in --cuda-compute-capabilities + files_missing_ptx.append(os.path.relpath(path, self.installdir)) + fail_msg = "Configured highest compute capability was '%s', " + fail_msg += "but no PTX code for this compute capability was found in '%s' " + fail_msg += "(PTX architectures supported in that file: %s). 
" + if path in ignore_file_list or ignore_failures: + # No error, because either path is on the cuda_sanity_ignore_files list in the + # easyconfig, or we are running with --disable-cuda-sanity-check-error-on-failed-checks + files_missing_ptx_ignored.append(os.path.relpath(path, self.installdir)) + fail_msg += ignore_msg + self.log.warning(fail_msg, highest_cc[0], path, found_ptx_ccs) + elif accept_missing_ptx: + # No error, because we are running with --cuda-sanity-check-accept-missing-ptx + self.log.warning(fail_msg, highest_cc[0], path, found_ptx_ccs) + else: + # Sanity error + files_missing_ptx_fails.append(os.path.relpath(path, self.installdir)) + self.log.warning(fail_msg % (highest_cc[0], path, found_ptx_ccs)) + else: + msg = (f"Output of 'cuobjdump' checked for '{path}'; ptx code was present for (at " + "least) the highest CUDA compute capability in cuda_compute_capabilities") + self.log.debug(msg) # Send to trace and log def trace_and_log(msg): From 7e92cd5dd69ce5fec3220112a2e0ccc395e291c3 Mon Sep 17 00:00:00 2001 From: Kenneth Hoste Date: Fri, 16 May 2025 16:47:54 +0200 Subject: [PATCH 114/114] extend test_toy_cuda_sanity_check to also check whether shared libraries under lib/python*/site-packages are being checked in CUDA sanity check --- test/framework/toy_build.py | 119 +++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 50 deletions(-) diff --git a/test/framework/toy_build.py b/test/framework/toy_build.py index bfa8b4de4a..2c08028cca 100644 --- a/test/framework/toy_build.py +++ b/test/framework/toy_build.py @@ -3121,9 +3121,22 @@ def test_toy_cuda_sanity_check(self): topdir = os.path.dirname(os.path.abspath(__file__)) toy_ec = os.path.join(topdir, 'easyconfigs', 'test_ecs', 't', 'toy', 'toy-0.0.eb') + toy_bin = '%(installdir)s/bin/toy' + py_site_pkgs = '%(installdir)s/lib/python3.9/site-packages' + shlib_ext = get_shared_lib_ext() + toy_ec_cuda = os.path.join(self.test_prefix, 'toy-0.0-cuda.eb') - write_file(toy_ec_cuda, 
read_file(toy_ec) + "\ndependencies = [('CUDA', '5.5.22', '', SYSTEM)]") - toy_ec = toy_ec_cuda + toy_ec_txt = read_file(toy_ec) + toy_ec_txt += '\n' + '\n'.join([ + "dependencies = [('CUDA', '5.5.22', '', SYSTEM)]", + "postinstallcmds += [", + " 'mkdir -p %(installdir)s/lib/python3.9/site-packages/plugins',", + # copy 'toy' binary, must be something that passes 'file' check in get_cuda_object_dump_raw + " 'cp %s %s/pytoy-cuda.cpython-39-x86_64-linux-gnu.%s'," % (toy_bin, py_site_pkgs, shlib_ext), + " 'cp %s %s/plugins/libpytoy_cuda.%s'," % (toy_bin, py_site_pkgs, shlib_ext), + "]", + ]) + write_file(toy_ec_cuda, toy_ec_txt) # Create mock cuobjdump # First, lets define sections of echo's for cuobjdump for various scenarios @@ -3243,10 +3256,10 @@ def test_toy_cuda_sanity_check(self): # If either of these fail their assert, print an informative, standardized message def assert_regex(pattern, log, stdout=None): regex = re.compile(pattern, re.M) - msg = "Pattern %s not found in full build log: %s" % (pattern, log) + msg = "Pattern '%s' not found in full build log: %s" % (pattern, log) self.assertTrue(regex.search(log), msg) if stdout is not None: - msg2 = "Pattern %s not found in standard output: %s" % (pattern, stdout) + msg2 = "Pattern '%s' not found in standard output: %s" % (pattern, stdout) self.assertTrue(regex.search(stdout), msg2) def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, missing_cc_but_ptx=None, @@ -3278,9 +3291,9 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, args = ['--cuda-compute-capabilities=8.0'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=True) stdout = self.get_stdout() - assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=1, log=outtxt, stdout=stdout) + 
assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=3, log=outtxt, stdout=stdout) # Test case 1b: test with default options, --cuda-compute-capabilities=8.0 and a binary that contains # 7.0 and 9.0 device code and 8.0 PTX code. @@ -3297,29 +3310,29 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, args = ['--cuda-compute-capabilities=8.0'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=True) stdout = self.get_stdout() - msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - assert_cuda_report(missing_cc=1, additional_cc=1, missing_ptx=0, log=outtxt, stdout=stdout) + assert_cuda_report(missing_cc=3, additional_cc=3, missing_ptx=0, log=outtxt, stdout=stdout) # Test case 2: same as Test case 1, but add --cuda-sanity-check-error-on-failed-checks # This is expected to fail since there is missing device code for CC80 args = ['--cuda-compute-capabilities=8.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to fail, so first check error, then run again to check output - error_pattern = r"Files missing CUDA device code: 1." + error_pattern = r"Files missing CUDA device code: 3." 
with self.mocked_stdout_stderr(): - self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, + self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec_cuda, extra_args=args, raise_error=True) - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=False, verify=False) stdout = self.get_stdout() - msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - assert_cuda_report(missing_cc=1, additional_cc=1, missing_ptx=0, log=outtxt, stdout=stdout) + assert_cuda_report(missing_cc=3, additional_cc=3, missing_ptx=0, log=outtxt, stdout=stdout) # Test case 3: same as Test case 2, but add --cuda-sanity-check-accept-ptx-as-devcode # This is expected to succeed, since now the PTX code for CC80 will be accepted as @@ -3329,28 +3342,28 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, '--cuda-sanity-check-accept-ptx-as-devcode'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=True) stdout = self.get_stdout() - msg = "Pattern %s not found in full build log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build 
log: %s" % (device_additional_70_90_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_90_code_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_missing_80_code_regex.pattern, outtxt) self.assertTrue(device_missing_80_code_regex.search(outtxt), msg) - assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=0, log=outtxt, stdout=stdout, - missing_cc_but_ptx=1) + assert_cuda_report(missing_cc=0, additional_cc=3, missing_ptx=0, log=outtxt, stdout=stdout, + missing_cc_but_ptx=3) # Test case 4: same as Test case 2, but run with --cuda-compute-capabilities=9.0 # This is expected to fail: device code is present, but PTX code for the highest CC (9.0) is missing args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to fail, so first check error, then run again to check output - error_pattern = r"Files missing CUDA PTX code: 1" + error_pattern = r"Files missing CUDA PTX code: 3" with self.mocked_stdout_stderr(): - self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, + self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec_cuda, extra_args=args, raise_error=True) - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=False, verify=False) stdout = self.get_stdout() - msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) - assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) + assert_cuda_report(missing_cc=0, 
additional_cc=3, missing_ptx=3, log=outtxt, stdout=stdout) # Test case 5: same as Test case 4, but add --cuda-sanity-check-accept-missing-ptx # This is expected to succeed: device code is present, PTX code is missing, but that's accepted @@ -3362,13 +3375,13 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, warning_pattern += r"\(PTX architectures supported in that file: \['8\.0'\]\)" warning_pattern_regex = re.compile(warning_pattern, re.M) with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=True) stdout = self.get_stdout() - msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (warning_pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (warning_pattern, outtxt) self.assertTrue(warning_pattern_regex.search(outtxt), msg) - assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) + assert_cuda_report(missing_cc=0, additional_cc=3, missing_ptx=3, log=outtxt, stdout=stdout) # Test case 6: same as Test case 5, but add --cuda-sanity-check-strict # This is expected to fail: device code is present, PTX code is missing (but accepted due to option) @@ -3376,21 +3389,27 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] # We expect this to fail, so first check error, then run again to check output - error_pattern = r"Files with additional CUDA device code: 1" + error_pattern 
= r"Files with additional CUDA device code: 3" with self.mocked_stdout_stderr(): - self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec, + self.assertErrorRegex(EasyBuildError, error_pattern, self._test_toy_build, ec_file=toy_ec_cuda, extra_args=args, raise_error=True) - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=False, verify=False) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=False, verify=False) stdout = self.get_stdout() - msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) - assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) + assert_cuda_report(missing_cc=0, additional_cc=3, missing_ptx=3, log=outtxt, stdout=stdout) # Test case 7: same as Test case 6, but add the failing file to the cuda_sanity_ignore_files # This is expected to succeed: the individual file which _would_ cause the sanity check to fail is # now on the ignore list toy_whitelist_ec = os.path.join(self.test_prefix, 'toy-0.0-cuda-whitelist.eb') - write_file(toy_whitelist_ec, read_file(toy_ec) + '\ncuda_sanity_ignore_files = ["bin/toy"]') + toy_ec_txt = read_file(toy_ec) + toy_ec_txt += '\n' + '\n'.join([ + "dependencies = [('CUDA', '5.5.22', '', SYSTEM)]", + "cuda_sanity_ignore_files = ['bin/toy']", + ]) + write_file(toy_ec_cuda, toy_ec_txt) + write_file(toy_whitelist_ec, toy_ec_txt) args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks', '--cuda-sanity-check-accept-missing-ptx', '--cuda-sanity-check-strict'] @@ -3398,7 +3417,7 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, with self.mocked_stdout_stderr(): outtxt = self._test_toy_build(ec_file=toy_whitelist_ec, 
extra_args=args, raise_error=True, verify=False) stdout = self.get_stdout() - msg = "Pattern %s not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_additional_70_code_regex.pattern, outtxt) self.assertTrue(device_additional_70_code_regex.search(outtxt), msg) assert_cuda_report(missing_cc=0, additional_cc=1, missing_ptx=1, log=outtxt, stdout=stdout) @@ -3417,15 +3436,15 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, '--cuda-sanity-check-strict'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=True) stdout = self.get_stdout() - msg = "Pattern %s not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (device_code_regex_success.pattern, outtxt) self.assertTrue(device_code_regex_success.search(outtxt), msg) - msg = "Pattern %s not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (ptx_code_regex_success.pattern, outtxt) self.assertTrue(ptx_code_regex_success.search(outtxt), msg) expected_result_pattern = "INFO Sanity check for toy successful" expected_result = re.compile(expected_result_pattern, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (expected_result, outtxt) self.assertTrue(expected_result.search(outtxt), msg) assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=0, log=outtxt, stdout=stdout) @@ -3434,15 +3453,15 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, args = ['--cuda-sanity-check-error-on-failed-checks', 
'--cuda-sanity-check-strict'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=True) stdout = self.get_stdout() cuda_sanity_skipped = r"INFO Skipping CUDA sanity check, as no CUDA compute capabilities were configured" cuda_sanity_skipped_regex = re.compile(cuda_sanity_skipped, re.M) - msg = "Pattern %s not found in full build log: %s" % (cuda_sanity_skipped, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (cuda_sanity_skipped, outtxt) self.assertTrue(cuda_sanity_skipped_regex.search(outtxt), msg) expected_result_pattern = "INFO Sanity check for toy successful" expected_result = re.compile(expected_result_pattern, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (expected_result, outtxt) self.assertTrue(expected_result.search(outtxt), msg) # Test case 10: running with default options and a binary that does not contain ANY CUDA device code @@ -3453,16 +3472,16 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, args = ['--cuda-compute-capabilities=9.0'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=True) stdout = self.get_stdout() no_cuda_pattern = r".*/bin/toy does not appear to be a CUDA executable \(no CUDA device code found\), " no_cuda_pattern += r"so skipping CUDA sanity check" no_cuda_regex = re.compile(no_cuda_pattern, re.M) - msg = "Pattern %s not found in full build log: %s" % (no_cuda_pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (no_cuda_pattern, outtxt) 
self.assertTrue(no_cuda_regex.search(outtxt), msg) expected_result_pattern = "INFO Sanity check for toy successful" expected_result = re.compile(expected_result_pattern, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (expected_result, outtxt) self.assertTrue(expected_result.search(outtxt), msg) assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=0, log=outtxt, stdout=stdout, num_checked=0) @@ -3471,16 +3490,16 @@ def assert_cuda_report(missing_cc, additional_cc, missing_ptx, log, stdout=None, args = ['--cuda-compute-capabilities=9.0', '--cuda-sanity-check-error-on-failed-checks'] # We expect this to pass, so no need to check errors with self.mocked_stdout_stderr(): - outtxt = self._test_toy_build(ec_file=toy_ec, extra_args=args, raise_error=True) + outtxt = self._test_toy_build(ec_file=toy_ec_cuda, extra_args=args, raise_error=True) stdout = self.get_stdout() no_cuda_pattern = r".*/bin/toy does not appear to be a CUDA executable \(no CUDA device code found\), " no_cuda_pattern += r"so skipping CUDA sanity check" no_cuda_regex = re.compile(no_cuda_pattern, re.M) - msg = "Pattern %s not found in full build log: %s" % (no_cuda_pattern, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (no_cuda_pattern, outtxt) self.assertTrue(no_cuda_regex.search(outtxt), msg) expected_result_pattern = "INFO Sanity check for toy successful" expected_result = re.compile(expected_result_pattern, re.M) - msg = "Pattern %s not found in full build log: %s" % (expected_result, outtxt) + msg = "Pattern '%s' not found in full build log: %s" % (expected_result, outtxt) self.assertTrue(expected_result.search(outtxt), msg) assert_cuda_report(missing_cc=0, additional_cc=0, missing_ptx=0, log=outtxt, stdout=stdout, num_checked=0)