Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
120 changes: 78 additions & 42 deletions easybuild/easyblocks/t/tensorflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def is_version_ok(version_range):
'com_github_googleapis_googleapis': '2.0.0:2.5.0',
'com_github_googlecloudplatform_google_cloud_cpp': '2.0.0:', # Not used due to $TF_NEED_GCP=0
'com_github_grpc_grpc': '2.2.0:',
'com_googlesource_code_re2': '2.0.0:', # Requires the RE2 version with Abseil (or 2023-06-01+)
'com_googlesource_code_re2': '2.0.0:', # Requires or 2023-06-01+ and building TF with system Abseil
'grpc': '2.0.0:2.2.0',
}
# Python packages installed as extensions or in the Python module
Expand Down Expand Up @@ -424,8 +424,8 @@ def get_system_libs(self):
libpaths.append(os.path.join(openssl_root, libpath))

if ignored_system_deps:
print_warning('%d TensorFlow dependencies have not been resolved by EasyBuild. Check the log for details.',
len(ignored_system_deps))
print_warning('%d TensorFlow dependencies have not been resolved by EasyBuild. '
"Search the log for 'TF_SYSTEM_LIBS' for details.", len(ignored_system_deps))
self.log.warning('For the following $TF_SYSTEM_LIBS dependencies TensorFlow will download a copy ' +
'because an EB dependency was not found: \n%s\n' +
'EC Dependencies: %s\n' +
Expand Down Expand Up @@ -562,6 +562,8 @@ def configure_step(self):

self._with_cuda = bool(cuda_root)

repo_env = {} # Variables that need to be passed as --repo_env to Bazel

config_env_vars = {
'CC_OPT_FLAGS': os.getenv('CXXFLAGS'),
'MPI_HOME': mpi_home,
Expand All @@ -573,7 +575,6 @@ def configure_step(self):
'TF_NEED_CUDA': ('0', '1')[self._with_cuda],
'TF_NEED_OPENCL': ('0', '1')[bool(opencl_root)],
'TF_NEED_ROCM': '0',
'TF_NEED_TENSORRT': '0',
'TF_SET_ANDROID_WORKSPACE': '0',
'TF_SYSTEM_LIBS': ','.join(self.system_libs_info[0]),
}
Expand Down Expand Up @@ -610,10 +611,10 @@ def configure_step(self):
# Clang toggle since 2.14.0
if LooseVersion(self.version) > LooseVersion('2.13'):
config_env_vars['TF_NEED_CLANG'] = '0'
# Hermietic python version since 2.14.0
# Hermetic python version since 2.14.0
if LooseVersion(self.version) > LooseVersion('2.13'):
pyver = det_python_version(self.python_cmd)
config_env_vars['TF_PYTHON_VERSION'] = '.'.join(pyver.split('.')[:2])
repo_env['TF_PYTHON_VERSION'] = '.'.join(pyver.split('.')[:2])

if self._with_cuda:
cuda_version = get_software_version('CUDA')
Expand All @@ -625,18 +626,9 @@ def configure_step(self):
else:
compiler_path = which(os.getenv('CC'), on_error=ERROR)

# list of CUDA compute capabilities to use can be specifed in two ways (where (2) overrules (1)):
# (1) in the easyconfig file, via the custom cuda_compute_capabilities;
# (2) in the EasyBuild configuration, via --cuda-compute-capabilities configuration option;
ec_cuda_cc = self.cfg['cuda_compute_capabilities']
cfg_cuda_cc = build_option('cuda_compute_capabilities')
cuda_cc = cfg_cuda_cc or ec_cuda_cc or []
cuda_cc = self.cfg.get_cuda_cc_template_value("cuda_cc_space_sep", required=False).split()

if cfg_cuda_cc and ec_cuda_cc:
warning_msg = "cuda_compute_capabilities specified in easyconfig (%s) are overruled by " % ec_cuda_cc
warning_msg += "--cuda-compute-capabilities configuration option (%s)" % cfg_cuda_cc
print_warning(warning_msg)
elif not cuda_cc:
if not cuda_cc:
warning_msg = "No CUDA compute capabilities specified, so using TensorFlow default "
warning_msg += "(which may not be optimal for your system).\nYou should use "
warning_msg += "the --cuda-compute-capabilities configuration option or the cuda_compute_capabilities "
Expand All @@ -660,19 +652,25 @@ def configure_step(self):
'GCC_HOST_COMPILER_PATH': compiler_path,
# This is the binutils bin folder: https://github.com/tensorflow/tensorflow/issues/39263
'GCC_HOST_COMPILER_PREFIX': self.binutils_bin_path,
'TF_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc),
'TF_CUDA_VERSION': cuda_maj_min_ver,
})

# from v2.18 TF with CUDA needs this envs be set
if LooseVersion(self.version) >= LooseVersion('2.18'):
config_env_vars.update({
'CUDA_NVCC': '1',
'HERMETIC_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc),
'HERMETIC_CUDA_VERSION': cuda_version,
'HERMETIC_CUDA_COMPUTE_CAPABILITIES': ','.join(f"sm_{cc.replace('.', '')}" for cc in cuda_cc),
'LOCAL_CUDA_PATH': cuda_root,
})
else:
config_env_vars.update({
'TF_CUDA_COMPUTE_CAPABILITIES': ','.join(cuda_cc),
'TF_CUDA_VERSION': cuda_maj_min_ver,
})

# for recent TensorFlow versions, $TF_CUDA_PATHS and $TF_CUBLAS_VERSION must also be set
if LooseVersion(self.version) >= LooseVersion('1.14'):
# for these TensorFlow versions, $TF_CUDA_PATHS and $TF_CUBLAS_VERSION must also be set
# TF 2.18 introduced "Hermetic CUDA" which doesn't use those env vars anymore
if '1.14' <= LooseVersion(self.version) < '2.18':

# figure out correct major/minor version for CUBLAS from cublas_api.h
cublas_api_header_glob_pattern = os.path.join(cuda_root, 'targets', '*', 'include', 'cublas_api.h')
Expand All @@ -696,41 +694,71 @@ def configure_step(self):
'TF_CUDA_PATHS': cuda_root,
'TF_CUBLAS_VERSION': '.'.join(cublas_ver_parts),
})
elif LooseVersion(self.version) >= '2.18':
# TF_CUDA_PATHS replaced CUDNN_INSTALL_PATH, TENSORRT_INSTALL_PATH, NCCL_INSTALL_PATH, NCCL_HDR_PATH
# in 2.0. Version guard set to 2.18 to avoid potentially breaking older easyconfigs
repo_env['TF_CUDA_PATHS'] = cuda_root

if cudnn_root:
cudnn_version = get_software_version('cuDNN')
cudnn_maj_min_patch_ver = '.'.join(cudnn_version.split('.')[:3])

config_env_vars.update({
'CUDNN_INSTALL_PATH': cudnn_root,
'TF_CUDNN_VERSION': cudnn_maj_min_patch_ver,
})
if LooseVersion(self.version) >= '2.18':
repo_env['TF_CUDA_PATHS'] += ',' + cudnn_root
repo_env['TF_CUDNN_VERSION'] = cudnn_version
config_env_vars.update({
'LOCAL_CUDNN_PATH': cudnn_root,
'HERMETIC_CUDNN_VERSION': cudnn_version,
})
else:
config_env_vars.update({
'CUDNN_INSTALL_PATH': cudnn_root,
'TF_CUDNN_VERSION': cudnn_maj_min_patch_ver,
})
else:
raise EasyBuildError("TensorFlow has a strict dependency on cuDNN if CUDA is enabled")

if nccl_root:
nccl_version = get_software_version('NCCL')
# Ignore the PKG_REVISION identifier if it exists (i.e., report 2.4.6 for 2.4.6-1 or 2.4.6-2)
nccl_version = nccl_version.split('-')[0]
config_env_vars.update({
'NCCL_INSTALL_PATH': nccl_root,
})
if LooseVersion(self.version) >= '2.18':
repo_env['TF_CUDA_PATHS'] += ',' + nccl_root
config_env_vars['LOCAL_NCCL_PATH'] = nccl_root
else:
config_env_vars['NCCL_INSTALL_PATH'] = nccl_root
else:
nccl_version = '1.3' # Use simple downloadable version
config_env_vars.update({
'TF_NCCL_VERSION': nccl_version,
})
if LooseVersion(self.version) >= '2.18':
repo_env['TF_NCCL_VERSION'] = nccl_version
else:
config_env_vars['TF_NCCL_VERSION'] = nccl_version

if tensorrt_root:
tensorrt_version = get_software_version('TensorRT')
config_env_vars.update({
tensor_rt_vars = {
'TF_NEED_TENSORRT': '1',
'TENSORRT_INSTALL_PATH': tensorrt_root,
'TF_TENSORRT_VERSION': tensorrt_version,
})
'TENSORRT_INSTALL_PATH': tensorrt_root,
}
if LooseVersion(self.version) >= '2.18':
repo_env['TF_CUDA_PATHS'] += ',' + tensorrt_root
else:
tensor_rt_vars = {'TF_NEED_TENSORRT': '0'}
if LooseVersion(self.version) >= '2.18':
repo_env.update(tensor_rt_vars)
else:
config_env_vars.update(tensor_rt_vars)

nvshmem_root = get_software_root('NVSHMEM')
if nvshmem_root and LooseVersion(self.version) >= '2.18':
repo_env['LOCAL_NVSHMEM_PATH'] = nvshmem_root

configure_py_contents = read_file('configure.py')
for key, val in sorted(config_env_vars.items()):
if key.startswith('TF_') and key not in configure_py_contents:
self.log.warning('Did not find %s option in configure.py. Setting might not have any effect', key)
print_warning('Did not find %s option in configure.py. Setting might not have any effect',
key, log=self.log)
env.setvar(key, val)

# configure.py (called by configure script) already calls bazel to determine the bazel version
Expand All @@ -744,11 +772,19 @@ def configure_step(self):
cmd = self.cfg['preconfigopts'] + './configure ' + self.cfg['configopts']
run_shell_cmd(cmd)

tf_conf_bazelrc = os.path.join(self.start_dir, '.tf_configure.bazelrc')

if LooseVersion(self.version) >= '2.17':
repo_env['WHEEL_NAME'] = 'tensorflow'

write_file(tf_conf_bazelrc,
'\n'.join(f'build --repo_env {key}="{value}"' for key, value in repo_env.items()),
append=True)

# when building on Arm 64-bit we can't just use --copt=-mcpu=native (or likewise for any -mcpu=...),
# because it breaks the build of XNNPACK;
# see also https://github.com/easybuilders/easybuild-easyconfigs/issues/18899
if get_cpu_architecture() == AARCH64:
tf_conf_bazelrc = os.path.join(self.start_dir, '.tf_configure.bazelrc')
regex_subs = [
# use --per_file_copt instead of --copt to selectively use -mcpu=native (not for XNNPACK),
# the leading '-' ensures that -mcpu=native is *not* used when building XNNPACK;
Expand Down Expand Up @@ -817,7 +853,7 @@ def patch_crosstool_files(self):
regex_subs.extend([('-fPIE', '-fPIC'), ('"-pie"', '"-fPIC"')])

# patch all CROSSTOOL* scripts to fix hardcoding of locations of binutils/GCC binaries
for path, dirnames, filenames in os.walk(os.getcwd()):
for path, _dirnames, filenames in os.walk(os.getcwd()):
for filename in filenames:
if filename.startswith('CROSSTOOL'):
full_path = os.path.join(path, filename)
Expand Down Expand Up @@ -972,12 +1008,12 @@ def build_step(self):
+ self.target_opts
+ [self.cfg['buildopts']]
)
if LooseVersion(self.version) < LooseVersion('2.16'):
if LooseVersion(self.version) < '2.16':
cmd += ['//tensorflow/tools/pip_package:build_pip_package']
elif LooseVersion(self.version) < LooseVersion('2.17'): # for v2.16.x
cmd += ['//tensorflow/tools/pip_package:v2/wheel --repo_env=WHEEL_NAME=tensorflow']
elif LooseVersion(self.version) < '2.17': # for v2.16.x
cmd += ['//tensorflow/tools/pip_package:v2/wheel']
else:
cmd += ['//tensorflow/tools/pip_package:wheel --repo_env=WHEEL_NAME=tensorflow']
cmd += ['//tensorflow/tools/pip_package:wheel']

with self.set_tmp_dir():
run_shell_cmd(' '.join(cmd))
Expand Down