Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
a2f5544
Try to change the subdir in which the CUDA toolkit is installed so th…
Aug 6, 2025
333e009
Fix sed command
Aug 6, 2025
1ac3748
Ok, now actually overwrite the EASYBUILD_INSTALLPATH
Aug 6, 2025
2cfe306
Fix the installpath that is reported
Aug 6, 2025
d49059c
Add software to reported dir
Aug 6, 2025
b63778e
Reassing host_inj_path
Aug 6, 2025
e3f746f
Change host injections location for binary non-redistributable files …
Aug 7, 2025
2e10d3d
Update Lmod hook to print more specific warning in case the CUDA / cu…
Aug 7, 2025
26cd405
Make sure update SitePackage.lua is included in the tarball
Aug 7, 2025
b124fcf
fix the replacement, since the already contains , so now it was tryi…
Aug 7, 2025
ecfd373
Undo change on create_tarball.sh
Aug 7, 2025
448d9e0
Fix typo
Aug 7, 2025
64069f9
Small fix, forgot to change name when copying
Aug 7, 2025
2b03950
Fixed more issues
Aug 7, 2025
56bfd45
Fixed more issues
Aug 7, 2025
5e92c55
Make sure it actually raises an error
Aug 7, 2025
7119169
Insert two spaces
Aug 7, 2025
d4af942
Cleanout the old installation script for the CUDA toolkit, as it is r…
Aug 7, 2025
8c903fd
Add easystack file to build CUDA and cuDNN in the software layer
Aug 7, 2025
0ab005b
Accept EULA for cuDNN
Aug 7, 2025
3dfe565
Fix chicken and egg problem where EESSI_ACCELERATOR_TARGET is not set…
Aug 11, 2025
d36908d
Some more clear code commenting
Aug 11, 2025
8ca152e
Make sure to actual check for EESSI_ACCELERATOR_TARGET_OVERRIDE to be…
Aug 12, 2025
f237c93
Added readme to explain that there SHOULD normally not be any easysta…
Aug 12, 2025
e974183
Apply suggestions from code review
casparvl Aug 12, 2025
7a1e4c1
Remove easystack file. CUDA 12.1.1 was already covered in the other o…
Aug 12, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions EESSI-install-software.sh
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,29 @@ else
# make sure the software and modules directories exist
# (since it's expected by init/eessi_environment_variables when using archdetect and by the EESSI module)
mkdir -p ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/{modules,software}

# If EESSI_ACCELERATOR_TARGET_OVERRIDE is defined, we are building for an accelerator target
# In that case, make sure the modulepath for the accelerator subdir exists, otherwise the EESSI module will not
# set EESSI_ACCELERATOR_TARGET and the if-condition later in this script which checks if EESSI_ACCELERATOR_TARGET
# is equal to EESSI_ACCELERATOR_TARGET_OVERRIDE will fail
# See https://github.com/EESSI/software-layer-scripts/pull/59#issuecomment-3173593882
#
# NOTE: the variables in the tests below MUST be quoted. Unquoted, an unset/empty variable expands to
# nothing, turning the test into `[ -n ]`, which is always true (single-argument form of test); the
# accelerator directory would then also be created (with an empty path component) for CPU-only builds.
if [ -n "${EESSI_ACCELERATOR_TARGET_OVERRIDE}" ]; then
    # Note that ${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all
    # is only the correct path if EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE is not set
    if [ -z "${EESSI_ACCEL_SOFTWARE_SUBDIR_OVERRIDE}" ]; then
        # Quote the path so that it is passed to mkdir as a single argument, even if a component contains whitespace
        mkdir -p "${EESSI_PREFIX}/software/${EESSI_OS_TYPE}/${EESSI_SOFTWARE_SUBDIR_OVERRIDE}/${EESSI_ACCELERATOR_TARGET_OVERRIDE}/modules/all"
    else
        # At runtime, one might want to use a different CPU subdir for a given accelerator. E.g. one could use
        # a zen2 CPU subdir on a zen4 node if the required GPU software isn't available in the zen4 tree.
        # At build time, this doesn't make a lot of sense: we'd probably build in a CPU prefix that is different
        # from what the code will be optimized for, and we wouldn't want that
        # So this message _should_ never be printed...
        msg="When building the software subdirectory for the CPU should almost certainly be that of the host."
        msg="$msg If you think this is incorrect, please implement behaviour that makes sense in "
        # Refer to the actual name of this script (EESSI-install-software.sh)
        msg="$msg EESSI-install-software.sh, essentially replacing this error."
        fatal_error "$msg"
    fi
fi
)
fi

Expand Down
26 changes: 22 additions & 4 deletions create_lmodsitepackage.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,13 +123,31 @@
local refer_to_docs = "For more information on how to do this, see https://www.eessi.io/docs/site_specific_config/gpu/.\\n"
if packagesList[simpleName] then
-- simpleName is a module in packagesList
-- get the full host_injections path
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
-- first, check the old host_injections path prior to https://github.com/EESSI/software-layer-scripts/pull/59
-- If that exists, print a more targeted, explanatory warning
local previousHostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", 'versions', 'host_injections')
local previousPackageEasyBuildDir = previousHostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local previousPackageDirExists = isDir(previousPackageEasyBuildDir)

-- get the host_injections path, and add only the EESSI_CPU_FAMILY at the end
local strip_suffix = os.getenv('EESSI_VERSION') .. "/software/" .. os.getenv('EESSI_OS_TYPE') .. "/"
strip_suffix = strip_suffix .. os.getenv('EESSI_SOFTWARE_SUBDIR')
local hostInjections = string.gsub(os.getenv('EESSI_SOFTWARE_PATH') or "", strip_suffix, os.getenv('EESSI_CPU_FAMILY'))

-- build final path where the software should be installed
local packageEasyBuildDir = hostInjections .. "/software/" .. t.modFullName .. "/easybuild"
local packageDirExists = isDir(packageEasyBuildDir)
if not packageDirExists then
if previousPackageDirExists and not packageDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
advice = advice .. "can find it.\\n"
advice = advice .. "Note that a full copy is installed at " .. previousHostInjections .. "/software/" .. t.modFullName .. ". "
advice = advice .. "However, EESSI expects it in a different location since Aug'25, namely at "
advice = advice .. hostInjections .. "/software/" .. t.modFullName .. ". "
advice = advice .. "Please re-install the package at the new location. "
advice = advice .. refer_to_docs
LmodError("\\nYou requested to load ", simpleName, " ", advice)
elseif not packageDirExists then
local advice = "but while the module file exists, the actual software is not entirely shipped with EESSI "
advice = advice .. "due to licencing. You will need to install a full copy of the " .. simpleName .. " package where EESSI "
advice = advice .. "can find it.\\n"
Expand Down Expand Up @@ -293,7 +311,7 @@ def error(msg):
# the install path (if it exists)
accel_subdir = os.getenv("EESSI_ACCELERATOR_TARGET")
if accel_subdir:
sitepackage_path = sitepackage_path.replace("/accel/%s" % accel_subdir, '')
sitepackage_path = sitepackage_path.replace("/%s" % accel_subdir, '')
try:
os.makedirs(os.path.dirname(sitepackage_path), exist_ok=True)
with open(sitepackage_path, 'w') as fp:
Expand Down
5 changes: 5 additions & 0 deletions easystacks/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
WARNING: in principle _all_ easystack files should go into EESSI/software-layer, not in EESSI/software-layer-scripts. Easystack files are only added in EESSI/software-layer-scripts by exception, for example when the (re)deployment of the software has to be done synchronously with a change in EESSI/software-layer-scripts.

Here, we list past deployments for which this was the case (and why):

[PR#59](https://github.com/EESSI/software-layer-scripts/pull/59): modified the prefix in which `install_cuda_and_libraries.sh` installs the CUDA toolkit within `host_injections`. Also, updated the Lmod SitePackage.lua to print an informative message in case the CUDA Toolkit is found in the old location. This requires synchronous deployment of new CUDA and cuDNN installations in the software layer, because the symlinks from these installations should be redirected to the new prefix in `host_injections`.
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# In https://github.com/EESSI/software-layer-scripts/pull/59 we introduced a new location for
# installing the CUDA toolkit within the host_injections directory. This requires reinstallation
# of CUDA and cuDNN to make sure all symlinks point to these new locations
easyconfigs:
- CUDA-12.1.1.eb:
options:
accept-eula-for: CUDA
- CUDA-12.4.0.eb:
options:
accept-eula-for: CUDA
- cuDNN-8.9.2.26-CUDA-12.1.1.eb:
options:
accept-eula-for: cuDNN
43 changes: 35 additions & 8 deletions eb_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ def parse_list_of_dicts_env(var_name):
if not re.match(r'^[A-Za-z_][A-Za-z0-9_]*$', var_name):
raise ValueError(f"Invalid environment variable name: {var_name}")
list_string = os.getenv(var_name, '[]')

list_of_dicts = []
try:
# Try JSON format first
Expand All @@ -162,7 +162,7 @@ def parse_list_of_dicts_env(var_name):
list_of_dicts = ast.literal_eval(list_string)
except (ValueError, SyntaxError):
raise ValueError(f"Environment variable '{var_name}' does not contain a valid list of dictionaries.")

return list_of_dicts


Expand Down Expand Up @@ -211,7 +211,7 @@ def post_ready_hook(self, *args, **kwargs):
parallel = self.parallel
else:
parallel = self.cfg['parallel']

if parallel == 1:
return # no need to limit if already using 1 core

Expand Down Expand Up @@ -733,7 +733,7 @@ def pre_configure_hook_score_p(self, *args, **kwargs):
def pre_configure_hook_vsearch(self, *args, **kwargs):
"""
Pre-configure hook for VSEARCH
- Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179
- Workaround for a Zlib macro being renamed in Gentoo, see https://bugs.gentoo.org/383179
(solves "expected initializer before 'OF'" errors)
"""
if self.name == 'VSEARCH':
Expand Down Expand Up @@ -1199,7 +1199,7 @@ def post_postproc_cuda(self, *args, **kwargs):

# replace files that are not distributable with symlinks into
# host_injections
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
else:
print_msg(f"EESSI hook to respect CUDA license not triggered for installation path {self.installdir}")
else:
Expand Down Expand Up @@ -1249,16 +1249,19 @@ def post_postproc_cudnn(self, *args, **kwargs):

# replace files that are not distributable with symlinks into
# host_injections
replace_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
replace_binary_non_distributable_files_with_symlinks(self.log, self.installdir, self.name, allowlist)
else:
print_msg(f"EESSI hook to respect cuDDN license not triggered for installation path {self.installdir}")
else:
raise EasyBuildError("cuDNN-specific hook triggered for non-cuDNN easyconfig?!")


def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist):
def replace_binary_non_distributable_files_with_symlinks(log, install_dir, pkg_name, allowlist):
"""
Replace files that cannot be distributed with symlinks into host_injections
Since these are binary files, only the CPU family will be included in the prefix,
no microarchitecture or accelerator architecture will be included. For example,
/cvmfs/software.eessi.io/host_injections/x86_64/suffix/to/actual/file
"""
# Different packages use different ways to specify which files or file
# 'types' may be redistributed. For CUDA, the 'EULA.txt' lists full file
Expand Down Expand Up @@ -1301,13 +1304,37 @@ def replace_non_distributable_files_with_symlinks(log, install_dir, pkg_name, al
log.debug("%s is not found in allowlist, so replacing it with symlink: %s",
print_name, full_path)
# the host_injections path is under a fixed repo/location for CUDA or cuDNN
# full_path is something similar to
# /cvmfs/software.eessi.io/version/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc
# host_inj_path will then be
# /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/accel/nvidia/cc90/.../CUDA/bin/nvcc
host_inj_path = re.sub(EESSI_INSTALLATION_REGEX, HOST_INJECTIONS_LOCATION, full_path)
# CUDA and cu* libraries themselves don't care about compute capability so remove this
# duplication from under host_injections (symlink to a single CUDA or cu* library
# installation for all compute capabilities)
accel_subdir = get_eessi_envvar("EESSI_ACCELERATOR_TARGET")
# If accel_subdir is defined, remove it from the full path
# After removal of accel_subdir, host_inj_path will be something like
# /cvmfs/software.eessi.io/host_injections/.../x86_64/amd/zen4/.../CUDA/bin/nvcc
if accel_subdir:
host_inj_path = host_inj_path.replace("/accel/%s" % accel_subdir, '')
host_inj_path = host_inj_path.replace(accel_subdir, '')
software_subdir = get_eessi_envvar("EESSI_SOFTWARE_SUBDIR")
cpu_family = get_eessi_envvar("EESSI_CPU_FAMILY")
os_type = get_eessi_envvar("EESSI_OS_TYPE")
eessi_version = get_eessi_envvar("EESSI_VERSION")
if software_subdir and cpu_family and os_type and eessi_version:
# Compose the string to be removed:
partial_path = f"{eessi_version}/software/{os_type}/{software_subdir}"
# After this, host_inj_path will be e.g.
# /cvmfs/software.eessi.io/host_injections/x86_64/software/CUDA/bin/nvcc
host_inj_path = host_inj_path.replace(partial_path, cpu_family)
else:
msg = "Failed to construct path to symlink for file (%s). All of the following values "
msg += "have to be defined: EESSI_SOFTWARE_SUBDIR='%s', EESSI_CPU_FAMILY='%s', "
msg += "EESSI_OS_TYPE='%s', EESSI_VERSION='%s'. Failed to replace non-redistributable file "
msg += "with symlink, aborting..."
raise EasyBuildError(msg, full_path, software_subdir, cpu_family, os_type, eessi_version)

# make sure source and target of symlink are not the same
if full_path == host_inj_path:
raise EasyBuildError("Source (%s) and target (%s) are the same location, are you sure you "
Expand Down
15 changes: 11 additions & 4 deletions scripts/gpu_support/nvidia/install_cuda_and_libraries.sh
Original file line number Diff line number Diff line change
Expand Up @@ -132,9 +132,16 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do

# If there is a GPU on the node, the installation path will by default have an
# accelerator subdirectory. For CUDA and cu*, these are binary installations and
# don't care about the target compute capability. Our hooks are aware of this and
# therefore expect CUDA to be available under EESSI_SITE_SOFTWARE_PATH
export EASYBUILD_INSTALLPATH=$EESSI_SITE_SOFTWARE_PATH
# we don't care about the target compute capability nor the CPU microarchitecture.
# Our hooks are aware of this and therefore expect CUDA to be available under
# something like EESSI_SITE_SOFTWARE_PATH, but then with the CPU micro-architecture
# stripped
# This sed command will capture everything from the EESSI_SITE_SOFTWARE_PATH up until
# the EESSI_VERSION in a capture group. It will then replace that with the content
# of the capture group and then have the EESSI_CPU_FAMILY appended
# Thus EESSI_SITE_CPU_FAMILY_PATH is then something like /cvmfs/software.eessi.io/host_injections/x86_64
EESSI_SITE_CPU_FAMILY_PATH=$(echo "$EESSI_SITE_SOFTWARE_PATH" | sed 's|\(.*\)'"$EESSI_VERSION"/software/"$EESSI_OS_TYPE"/"$EESSI_SOFTWARE_SUBDIR"'|\1'"$EESSI_CPU_FAMILY"'|')
export EASYBUILD_INSTALLPATH=$EESSI_SITE_CPU_FAMILY_PATH

# Install modules in hidden .modules dir to keep track of what was installed before
# (this action is temporary, and we do not call Lmod again within the current shell context, but in EasyBuild
Expand Down Expand Up @@ -258,7 +265,7 @@ for EASYSTACK_FILE in ${TOPDIR}/easystacks/eessi-*CUDA*.yml; do
cp -a ${eb_last_log} .
fatal_error "some installation failed, please check EasyBuild logs ${PWD}/$(basename ${eb_last_log})..."
else
echo_green "all installations at ${EESSI_SITE_SOFTWARE_PATH}/software/... succeeded!"
echo_green "all installations at ${EASYBUILD_INSTALLPATH}/software/... succeeded!"
fi

# clean up tmpdir content
Expand Down
Loading