Skip to content

Commit ff6c83f

Browse files
authored
Merge pull request #3765 from Flamefire/20250610174336_new_pr_tensorflow
Update tensorflow easyblock for CUDA support in TensorFlow 2.18+
2 parents 4f9dddc + 3435420 commit ff6c83f

1 file changed

Lines changed: 77 additions & 41 deletions

File tree

easybuild/easyblocks/t/tensorflow.py

Lines changed: 77 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -170,7 +170,7 @@ def is_version_ok(version_range):
170170
'com_github_googleapis_googleapis': '2.0.0:2.5.0',
171171
'com_github_googlecloudplatform_google_cloud_cpp': '2.0.0:', # Not used due to $TF_NEED_GCP=0
172172
'com_github_grpc_grpc': '2.2.0:',
173-
'com_googlesource_code_re2': '2.0.0:', # Requires the RE2 version with Abseil (or 2023-06-01+)
173+
'com_googlesource_code_re2': '2.0.0:', # Requires RE2 2023-06-01+ and building TF with system Abseil
174174
'grpc': '2.0.0:2.2.0',
175175
}
176176
# Python packages installed as extensions or in the Python module
@@ -424,8 +424,8 @@ def get_system_libs(self):
424424
libpaths.append(os.path.join(openssl_root, libpath))
425425

426426
if ignored_system_deps:
427-
print_warning('%d TensorFlow dependencies have not been resolved by EasyBuild. Check the log for details.',
428-
len(ignored_system_deps))
427+
print_warning('%d TensorFlow dependencies have not been resolved by EasyBuild. '
428+
"Search the log for 'TF_SYSTEM_LIBS' for details.", len(ignored_system_deps))
429429
self.log.warning('For the following $TF_SYSTEM_LIBS dependencies TensorFlow will download a copy ' +
430430
'because an EB dependency was not found: \n%s\n' +
431431
'EC Dependencies: %s\n' +
@@ -564,6 +564,8 @@ def configure_step(self):
564564

565565
self._with_cuda = bool(cuda_root)
566566

567+
repo_env = {} # Variables that need to be passed as --repo_env to Bazel
568+
567569
config_env_vars = {
568570
'CC_OPT_FLAGS': os.getenv('CXXFLAGS'),
569571
'MPI_HOME': mpi_home,
@@ -575,7 +577,6 @@ def configure_step(self):
575577
'TF_NEED_CUDA': ('0', '1')[self._with_cuda],
576578
'TF_NEED_OPENCL': ('0', '1')[bool(opencl_root)],
577579
'TF_NEED_ROCM': '0',
578-
'TF_NEED_TENSORRT': '0',
579580
'TF_SET_ANDROID_WORKSPACE': '0',
580581
'TF_SYSTEM_LIBS': ','.join(self.system_libs_info[0]),
581582
}
@@ -612,10 +613,10 @@ def configure_step(self):
612613
# Clang toggle since 2.14.0
613614
if LooseVersion(self.version) > LooseVersion('2.13'):
614615
config_env_vars['TF_NEED_CLANG'] = '0'
615-
# Hermietic python version since 2.14.0
616+
# Hermetic python version since 2.14.0
616617
if LooseVersion(self.version) > LooseVersion('2.13'):
617618
pyver = det_python_version(self.python_cmd)
618-
config_env_vars['TF_PYTHON_VERSION'] = '.'.join(pyver.split('.')[:2])
619+
repo_env['TF_PYTHON_VERSION'] = '.'.join(pyver.split('.')[:2])
619620

620621
if self._with_cuda:
621622
cuda_version = get_software_version('CUDA')
@@ -627,18 +628,9 @@ def configure_step(self):
627628
else:
628629
compiler_path = which(os.getenv('CC'), on_error=ERROR)
629630

630-
# list of CUDA compute capabilities to use can be specifed in two ways (where (2) overrules (1)):
631-
# (1) in the easyconfig file, via the custom cuda_compute_capabilities;
632-
# (2) in the EasyBuild configuration, via --cuda-compute-capabilities configuration option;
633-
ec_cuda_cc = self.cfg['cuda_compute_capabilities']
634-
cfg_cuda_cc = build_option('cuda_compute_capabilities')
635-
cuda_cc = cfg_cuda_cc or ec_cuda_cc or []
631+
cuda_cc = self.cfg.get_cuda_cc_template_value("cuda_cc_space_sep", required=False).split()
636632

637-
if cfg_cuda_cc and ec_cuda_cc:
638-
warning_msg = "cuda_compute_capabilities specified in easyconfig (%s) are overruled by " % ec_cuda_cc
639-
warning_msg += "--cuda-compute-capabilities configuration option (%s)" % cfg_cuda_cc
640-
print_warning(warning_msg)
641-
elif not cuda_cc:
633+
if not cuda_cc:
642634
warning_msg = "No CUDA compute capabilities specified, so using TensorFlow default "
643635
warning_msg += "(which may not be optimal for your system).\nYou should use "
644636
warning_msg += "the --cuda-compute-capabilities configuration option or the cuda_compute_capabilities "
@@ -662,19 +654,25 @@ def configure_step(self):
662654
'GCC_HOST_COMPILER_PATH': compiler_path,
663655
# This is the binutils bin folder: https://github.com/tensorflow/tensorflow/issues/39263
664656
'GCC_HOST_COMPILER_PREFIX': self.binutils_bin_path,
665-
'TF_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc),
666-
'TF_CUDA_VERSION': cuda_maj_min_ver,
667657
})
668658

669659
# from v2.18 on, TF with CUDA needs these env vars to be set
670660
if LooseVersion(self.version) >= LooseVersion('2.18'):
671661
config_env_vars.update({
672662
'CUDA_NVCC': '1',
673-
'HERMETIC_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc),
663+
'HERMETIC_CUDA_VERSION': cuda_version,
664+
'HERMETIC_CUDA_COMPUTE_CAPABILITIES': ','.join(f"sm_{cc.replace('.', '')}" for cc in cuda_cc),
665+
'LOCAL_CUDA_PATH': cuda_root,
666+
})
667+
else:
668+
config_env_vars.update({
669+
'TF_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc),
670+
'TF_CUDA_VERSION': cuda_maj_min_ver,
674671
})
675672

676-
# for recent TensorFlow versions, $TF_CUDA_PATHS and $TF_CUBLAS_VERSION must also be set
677-
if LooseVersion(self.version) >= LooseVersion('1.14'):
673+
# for these TensorFlow versions, $TF_CUDA_PATHS and $TF_CUBLAS_VERSION must also be set
674+
# TF 2.18 introduced "Hermetic CUDA" which doesn't use those env vars anymore
675+
if '1.14' <= LooseVersion(self.version) < '2.18':
678676

679677
# figure out correct major/minor version for CUBLAS from cublas_api.h
680678
cublas_api_header_glob_pattern = os.path.join(cuda_root, 'targets', '*', 'include', 'cublas_api.h')
@@ -698,41 +696,71 @@ def configure_step(self):
698696
'TF_CUDA_PATHS': cuda_root,
699697
'TF_CUBLAS_VERSION': '.'.join(cublas_ver_parts),
700698
})
699+
elif LooseVersion(self.version) >= '2.18':
700+
# TF_CUDA_PATHS replaced CUDNN_INSTALL_PATH, TENSORRT_INSTALL_PATH, NCCL_INSTALL_PATH, NCCL_HDR_PATH
701+
# in 2.0. Version guard set to 2.18 to avoid potentially breaking older easyconfigs
702+
repo_env['TF_CUDA_PATHS'] = cuda_root
701703

702704
if cudnn_root:
703705
cudnn_version = get_software_version('cuDNN')
704706
cudnn_maj_min_patch_ver = '.'.join(cudnn_version.split('.')[:3])
705707

706-
config_env_vars.update({
707-
'CUDNN_INSTALL_PATH': cudnn_root,
708-
'TF_CUDNN_VERSION': cudnn_maj_min_patch_ver,
709-
})
708+
if LooseVersion(self.version) >= '2.18':
709+
repo_env['TF_CUDA_PATHS'] += ',' + cudnn_root
710+
repo_env['TF_CUDNN_VERSION'] = cudnn_version
711+
config_env_vars.update({
712+
'LOCAL_CUDNN_PATH': cudnn_root,
713+
'HERMETIC_CUDNN_VERSION': cudnn_version,
714+
})
715+
else:
716+
config_env_vars.update({
717+
'CUDNN_INSTALL_PATH': cudnn_root,
718+
'TF_CUDNN_VERSION': cudnn_maj_min_patch_ver,
719+
})
710720
else:
711721
raise EasyBuildError("TensorFlow has a strict dependency on cuDNN if CUDA is enabled")
722+
712723
if nccl_root:
713724
nccl_version = get_software_version('NCCL')
714725
# Ignore the PKG_REVISION identifier if it exists (i.e., report 2.4.6 for 2.4.6-1 or 2.4.6-2)
715726
nccl_version = nccl_version.split('-')[0]
716-
config_env_vars.update({
717-
'NCCL_INSTALL_PATH': nccl_root,
718-
})
727+
if LooseVersion(self.version) >= '2.18':
728+
repo_env['TF_CUDA_PATHS'] += ',' + nccl_root
729+
config_env_vars['LOCAL_NCCL_PATH'] = nccl_root
730+
else:
731+
config_env_vars['NCCL_INSTALL_PATH'] = nccl_root
719732
else:
720733
nccl_version = '1.3' # Use simple downloadable version
721-
config_env_vars.update({
722-
'TF_NCCL_VERSION': nccl_version,
723-
})
734+
if LooseVersion(self.version) >= '2.18':
735+
repo_env['TF_NCCL_VERSION'] = nccl_version
736+
else:
737+
config_env_vars['TF_NCCL_VERSION'] = nccl_version
738+
724739
if tensorrt_root:
725740
tensorrt_version = get_software_version('TensorRT')
726-
config_env_vars.update({
741+
tensor_rt_vars = {
727742
'TF_NEED_TENSORRT': '1',
728-
'TENSORRT_INSTALL_PATH': tensorrt_root,
729743
'TF_TENSORRT_VERSION': tensorrt_version,
730-
})
744+
'TENSORRT_INSTALL_PATH': tensorrt_root,
745+
}
746+
if LooseVersion(self.version) >= '2.18':
747+
repo_env['TF_CUDA_PATHS'] += ',' + tensorrt_root
748+
else:
749+
tensor_rt_vars = {'TF_NEED_TENSORRT': '0'}
750+
if LooseVersion(self.version) >= '2.18':
751+
repo_env.update(tensor_rt_vars)
752+
else:
753+
config_env_vars.update(tensor_rt_vars)
754+
755+
nvshmem_root = get_software_root('NVSHMEM')
756+
if nvshmem_root and LooseVersion(self.version) >= '2.18':
757+
repo_env['LOCAL_NVSHMEM_PATH'] = nvshmem_root
731758

732759
configure_py_contents = read_file('configure.py')
733760
for key, val in sorted(config_env_vars.items()):
734761
if key.startswith('TF_') and key not in configure_py_contents:
735-
self.log.warning('Did not find %s option in configure.py. Setting might not have any effect', key)
762+
print_warning('Did not find %s option in configure.py. Setting might not have any effect',
763+
key, log=self.log)
736764
env.setvar(key, val)
737765

738766
# configure.py (called by configure script) already calls bazel to determine the bazel version
@@ -746,11 +774,19 @@ def configure_step(self):
746774
cmd = self.cfg['preconfigopts'] + './configure ' + self.cfg['configopts']
747775
run_shell_cmd(cmd)
748776

777+
tf_conf_bazelrc = os.path.join(self.start_dir, '.tf_configure.bazelrc')
778+
779+
if LooseVersion(self.version) >= '2.17':
780+
repo_env['WHEEL_NAME'] = 'tensorflow'
781+
782+
write_file(tf_conf_bazelrc,
783+
'\n'.join(f'build --repo_env {key}="{value}"' for key, value in repo_env.items()),
784+
append=True)
785+
749786
# when building on Arm 64-bit we can't just use --copt=-mcpu=native (or likewise for any -mcpu=...),
750787
# because it breaks the build of XNNPACK;
751788
# see also https://github.com/easybuilders/easybuild-easyconfigs/issues/18899
752789
if get_cpu_architecture() == AARCH64:
753-
tf_conf_bazelrc = os.path.join(self.start_dir, '.tf_configure.bazelrc')
754790
regex_subs = [
755791
# use --per_file_copt instead of --copt to selectively use -mcpu=native (not for XNNPACK),
756792
# the leading '-' ensures that -mcpu=native is *not* used when building XNNPACK;
@@ -976,12 +1012,12 @@ def build_step(self):
9761012
+ self.target_opts
9771013
+ [self.cfg['buildopts']]
9781014
)
979-
if LooseVersion(self.version) < LooseVersion('2.16'):
1015+
if LooseVersion(self.version) < '2.16':
9801016
cmd += ['//tensorflow/tools/pip_package:build_pip_package']
981-
elif LooseVersion(self.version) < LooseVersion('2.17'): # for v2.16.x
982-
cmd += ['//tensorflow/tools/pip_package:v2/wheel --repo_env=WHEEL_NAME=tensorflow']
1017+
elif LooseVersion(self.version) < '2.17': # for v2.16.x
1018+
cmd += ['//tensorflow/tools/pip_package:v2/wheel']
9831019
else:
984-
cmd += ['//tensorflow/tools/pip_package:wheel --repo_env=WHEEL_NAME=tensorflow']
1020+
cmd += ['//tensorflow/tools/pip_package:wheel']
9851021

9861022
with self.set_tmp_dir():
9871023
run_shell_cmd(' '.join(cmd))

0 commit comments

Comments
 (0)