Skip to content

Commit 2473b71

Browse files
authored
Merge pull request #3009 from ComputeCanada/build_lock
adding locking to prevent two parallel builds of the same installation directory
2 parents c5545c2 + 4b01f31 commit 2473b71

File tree

4 files changed

+140
-5
lines changed

4 files changed

+140
-5
lines changed

easybuild/framework/easyblock.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3043,6 +3043,37 @@ def run_all_steps(self, run_test_cases):
30433043

30443044
print_msg("building and installing %s..." % self.full_mod_name, log=self.log, silent=self.silent)
30453045
trace_msg("installation prefix: %s" % self.installdir)
3046+
3047+
ignore_locks = build_option('ignore_locks')
3048+
3049+
if ignore_locks:
3050+
self.log.info("Ignoring locks...")
3051+
else:
3052+
locks_dir = build_option('locks_dir') or os.path.join(install_path('software'), '.locks')
3053+
lock_path = os.path.join(locks_dir, '%s.lock' % self.installdir.replace('/', '_'))
3054+
3055+
# if lock already exists, either abort or wait until it disappears
3056+
if os.path.exists(lock_path):
3057+
wait_on_lock = build_option('wait_on_lock')
3058+
if wait_on_lock:
3059+
while os.path.exists(lock_path):
3060+
print_msg("lock %s exists, waiting %d seconds..." % (lock_path, wait_on_lock),
3061+
silent=self.silent)
3062+
time.sleep(wait_on_lock)
3063+
else:
3064+
raise EasyBuildError("Lock %s already exists, aborting!", lock_path)
3065+
3066+
# create lock to prevent another installation running in parallel from messing things up;
3067+
# we use a directory as a lock, since that's atomically created
3068+
try:
3069+
mkdir(lock_path, parents=True)
3070+
except EasyBuildError as err:
3071+
# clean up the error message a bit, get rid of the "Failed to create directory" part + quotes
3072+
stripped_err = str(err).split(':', 1)[1].strip().replace("'", '').replace('"', '')
3073+
raise EasyBuildError("Failed to create lock %s: %s", lock_path, stripped_err)
3074+
3075+
self.log.info("Lock created: %s", lock_path)
3076+
30463077
try:
30473078
for (step_name, descr, step_methods, skippable) in steps:
30483079
if self._skip_step(step_name, skippable):
@@ -3057,6 +3088,10 @@ def run_all_steps(self, run_test_cases):
30573088

30583089
except StopException:
30593090
pass
3091+
finally:
3092+
if not ignore_locks:
3093+
remove_dir(lock_path)
3094+
self.log.info("Lock removed: %s", lock_path)
30603095

30613096
# return True for successful build (or stopped build)
30623097
return True

easybuild/tools/config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -185,6 +185,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
185185
'job_output_dir',
186186
'job_polling_interval',
187187
'job_target_resource',
188+
'locks_dir',
188189
'modules_footer',
189190
'modules_header',
190191
'mpi_cmd_template',
@@ -225,6 +226,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
225226
'group_writable_installdir',
226227
'hidden',
227228
'ignore_checksums',
229+
'ignore_locks',
228230
'install_latest_eb_release',
229231
'lib64_fallback_sanity_check',
230232
'logtostdout',
@@ -249,6 +251,7 @@ def mk_full_default_path(name, prefix=DEFAULT_PREFIX):
249251
'use_f90cache',
250252
'use_existing_modules',
251253
'set_default_module',
254+
'wait_on_lock',
252255
],
253256
True: [
254257
'cleanup_builddir',

easybuild/tools/options.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,8 +255,13 @@ def basic_options(self):
255255
'extended-dry-run-ignore-errors': ("Ignore errors that occur during dry run", None, 'store_true', True),
256256
'force': ("Force to rebuild software even if it's already installed (i.e. if it can be found as module), "
257257
"and skipping check for OS dependencies", None, 'store_true', False, 'f'),
258+
'ignore-locks': ("Ignore locks that prevent two identical installations running in parallel",
259+
None, 'store_true', False),
258260
'job': ("Submit the build as a job", None, 'store_true', False),
259261
'logtostdout': ("Redirect main log to stdout", None, 'store_true', False, 'l'),
262+
'locks-dir': ("Directory to store lock files (should be on a shared filesystem); "
263+
"None implies .locks subdirectory of software installation directory",
264+
None, 'store_or_None', None),
260265
'missing-modules': ("Print list of missing modules for dependencies of specified easyconfigs",
261266
None, 'store_true', False, 'M'),
262267
'only-blocks': ("Only build listed blocks", 'strlist', 'extend', None, 'b', {'metavar': 'BLOCKS'}),
@@ -434,6 +439,8 @@ def override_options(self):
434439
None, 'store_true', False),
435440
'verify-easyconfig-filenames': ("Verify whether filename of specified easyconfigs matches with contents",
436441
None, 'store_true', False),
442+
'wait-on-lock': ("Wait interval (in seconds) to use when waiting for existing lock to be removed "
443+
"(0: implies no waiting, but exiting with an error)", int, 'store', 0),
437444
'zip-logs': ("Zip logs that are copied to install directory, using specified command",
438445
None, 'store_or_None', 'gzip'),
439446

test/framework/toy_build.py

Lines changed: 95 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import os
3535
import re
3636
import shutil
37+
import signal
3738
import stat
3839
import sys
3940
import tempfile
@@ -1415,7 +1416,7 @@ def test_module_only(self):
14151416
self.assertTrue(os.path.exists(os.path.join(self.test_installpath, 'software', 'toy', '0.0-deps', 'bin')))
14161417
modtxt = read_file(toy_mod)
14171418
self.assertTrue(re.search("set root %s" % prefix, modtxt))
1418-
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software'))), 1)
1419+
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software'))), 2)
14191420
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software', 'toy'))), 1)
14201421

14211422
# install (only) additional module under a hierarchical MNS
@@ -1430,7 +1431,7 @@ def test_module_only(self):
14301431
# existing install is reused
14311432
modtxt2 = read_file(toy_core_mod)
14321433
self.assertTrue(re.search("set root %s" % prefix, modtxt2))
1433-
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software'))), 2)
1434+
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software'))), 3)
14341435
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software', 'toy'))), 1)
14351436

14361437
# make sure load statements for dependencies are included
@@ -1441,7 +1442,7 @@ def test_module_only(self):
14411442
os.remove(toy_core_mod)
14421443

14431444
# test installing (only) additional module in Lua syntax (if Lmod is available)
1444-
lmod_abspath = which('lmod')
1445+
lmod_abspath = os.environ.get('LMOD_CMD') or which('lmod')
14451446
if lmod_abspath is not None:
14461447
args = common_args[:-1] + [
14471448
'--allow-modules-tool-mismatch',
@@ -1455,7 +1456,7 @@ def test_module_only(self):
14551456
# existing install is reused
14561457
modtxt3 = read_file(toy_mod + '.lua')
14571458
self.assertTrue(re.search('local root = "%s"' % prefix, modtxt3))
1458-
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software'))), 2)
1459+
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software'))), 3)
14591460
self.assertEqual(len(os.listdir(os.path.join(self.test_installpath, 'software', 'toy'))), 1)
14601461

14611462
# make sure load statements for dependencies are included
@@ -2057,7 +2058,7 @@ def test_toy_modaltsoftname(self):
20572058
self.assertTrue(os.path.exists(os.path.join(modules_path, 'yot', yot_name)))
20582059

20592060
# only subdirectories for software should be created
2060-
self.assertEqual(os.listdir(software_path), ['toy'])
2061+
self.assertEqual(sorted(os.listdir(software_path)), sorted(['toy', '.locks']))
20612062
self.assertEqual(sorted(os.listdir(os.path.join(software_path, 'toy'))), ['0.0-one', '0.0-two'])
20622063

20632064
# only subdirectories for modules with alternative names should be created
@@ -2516,6 +2517,95 @@ def test_toy_ghost_installdir(self):
25162517

25172518
self.assertFalse(os.path.exists(toy_installdir))
25182519

2520+
def test_toy_build_lock(self):
2521+
"""Test toy installation when a lock is already in place."""
2522+
2523+
locks_dir = os.path.join(self.test_installpath, 'software', '.locks')
2524+
toy_installdir = os.path.join(self.test_installpath, 'software', 'toy', '0.0')
2525+
toy_lock_fn = toy_installdir.replace(os.path.sep, '_') + '.lock'
2526+
2527+
toy_lock_path = os.path.join(locks_dir, toy_lock_fn)
2528+
mkdir(toy_lock_path, parents=True)
2529+
2530+
error_pattern = "Lock .*_software_toy_0.0.lock already exists, aborting!"
2531+
self.assertErrorRegex(EasyBuildError, error_pattern, self.test_toy_build, raise_error=True, verbose=False)
2532+
2533+
locks_dir = os.path.join(self.test_prefix, 'locks')
2534+
2535+
# no lock in place, so installation proceeds as normal
2536+
extra_args = ['--locks-dir=%s' % locks_dir]
2537+
self.test_toy_build(extra_args=extra_args, verify=True, raise_error=True)
2538+
2539+
# put lock in place in custom locks dir, try again
2540+
toy_lock_path = os.path.join(locks_dir, toy_lock_fn)
2541+
mkdir(toy_lock_path, parents=True)
2542+
self.assertErrorRegex(EasyBuildError, error_pattern, self.test_toy_build,
2543+
extra_args=extra_args, raise_error=True, verbose=False)
2544+
2545+
# also test use of --ignore-locks
2546+
self.test_toy_build(extra_args=extra_args + ['--ignore-locks'], verify=True, raise_error=True)
2547+
2548+
# define a context manager that removes a lock after a while, so we can check the use of --wait-on-lock
2549+
class remove_lock_after:
2550+
def __init__(self, seconds, lock_fp):
2551+
self.seconds = seconds
2552+
self.lock_fp = lock_fp
2553+
2554+
def remove_lock(self, *args):
2555+
remove_dir(self.lock_fp)
2556+
2557+
def __enter__(self):
2558+
signal.signal(signal.SIGALRM, self.remove_lock)
2559+
signal.alarm(self.seconds)
2560+
2561+
def __exit__(self, type, value, traceback):
2562+
pass
2563+
2564+
# wait for lock to be removed, with 1 second interval of checking
2565+
extra_args.append('--wait-on-lock=1')
2566+
2567+
wait_regex = re.compile("^== lock .*_software_toy_0.0.lock exists, waiting 1 seconds", re.M)
2568+
ok_regex = re.compile("^== COMPLETED: Installation ended successfully", re.M)
2569+
2570+
self.assertTrue(os.path.exists(toy_lock_path))
2571+
2572+
# use context manager to remove lock after 3 seconds
2573+
with remove_lock_after(3, toy_lock_path):
2574+
self.mock_stderr(True)
2575+
self.mock_stdout(True)
2576+
self.test_toy_build(extra_args=extra_args, verify=False, raise_error=True, testing=False)
2577+
stderr, stdout = self.get_stderr(), self.get_stdout()
2578+
self.mock_stderr(False)
2579+
self.mock_stdout(False)
2580+
2581+
self.assertEqual(stderr, '')
2582+
2583+
wait_matches = wait_regex.findall(stdout)
2584+
# we can't rely on an exact number of 'waiting' messages, so let's go with a range...
2585+
self.assertTrue(len(wait_matches) in range(2, 5))
2586+
2587+
self.assertTrue(ok_regex.search(stdout), "Pattern '%s' found in: %s" % (ok_regex.pattern, stdout))
2588+
2589+
# when there is no lock in place, --wait-on-lock has no impact
2590+
self.assertFalse(os.path.exists(toy_lock_path))
2591+
self.mock_stderr(True)
2592+
self.mock_stdout(True)
2593+
self.test_toy_build(extra_args=extra_args, verify=False, raise_error=True, testing=False)
2594+
stderr, stdout = self.get_stderr(), self.get_stdout()
2595+
self.mock_stderr(False)
2596+
self.mock_stdout(False)
2597+
2598+
self.assertEqual(stderr, '')
2599+
self.assertTrue(ok_regex.search(stdout), "Pattern '%s' found in: %s" % (ok_regex.pattern, stdout))
2600+
self.assertFalse(wait_regex.search(stdout), "Pattern '%s' not found in: %s" % (wait_regex.pattern, stdout))
2601+
2602+
# check for clean error on creation of lock
2603+
extra_args = ['--locks-dir=/']
2604+
error_pattern = r"Failed to create lock /.*_software_toy_0.0.lock:.* "
2605+
error_pattern += r"(Read-only file system|Permission denied)"
2606+
self.assertErrorRegex(EasyBuildError, error_pattern, self.test_toy_build,
2607+
extra_args=extra_args, raise_error=True, verbose=False)
2608+
25192609

25202610
def suite():
25212611
""" return all the tests in this file """

0 commit comments

Comments
 (0)