Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
167 changes: 97 additions & 70 deletions easybuild/tools/toolchain/mpi.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,12 @@
:author: Stijn De Weirdt (Ghent University)
:author: Kenneth Hoste (Ghent University)
"""
import copy
import os
import tempfile
from distutils.version import LooseVersion

from easybuild.base import fancylogger
import easybuild.tools.environment as env
import easybuild.tools.toolchain as toolchain
from easybuild.tools.build_log import EasyBuildError
Expand All @@ -41,6 +43,95 @@
from easybuild.tools.toolchain.toolchain import Toolchain


# module-level logger, for use by functions in this module that are defined outside of a toolchain class
# (and hence have no access to a 'self.log' logger instance)
_log = fancylogger.getLogger('tools.toolchain.mpi', fname=False)


def _setup_impi_mpd(params):
    """
    Prepare environment and helper files for running an old Intel MPI version through MPD.

    Defines $I_MPI_MPD_TMPDIR, $PBS_ENVIRONMENT and $I_MPI_PROCESS_MANAGER, creates the 'mpdboot' and 'nodes'
    files in a new temporary directory, and adds the 'mpdbf' and 'nodesfile' entries to the specified
    parameters (the dict is updated in place).

    :param params: dict with values for MPI command template; must include a value for 'nr_ranks'
    """
    # set temporary dir for MPD
    # note: this needs to be kept *short*,
    # to avoid mpirun failing with "socket.error: AF_UNIX path too long"
    # exact limit is unknown, but ~20 characters seems to be OK
    env.setvar('I_MPI_MPD_TMPDIR', tempfile.gettempdir())
    mpd_tmpdir = os.environ['I_MPI_MPD_TMPDIR']
    if len(mpd_tmpdir) > 20:
        _log.warning("$I_MPI_MPD_TMPDIR should be (very) short to avoid problems: %s", mpd_tmpdir)

    # temporary location for mpdboot and nodes files
    tmpdir = tempfile.mkdtemp(prefix='mpi_cmd_for-')

    # set PBS_ENVIRONMENT, so that --file option for mpdboot isn't stripped away
    env.setvar('PBS_ENVIRONMENT', "PBS_BATCH_MPI")

    # make sure we're always using mpd as process manager
    # only required for/picked up by Intel MPI v4.1 or higher, no harm done for others
    env.setvar('I_MPI_PROCESS_MANAGER', 'mpd')

    # create mpdboot file
    mpdboot = os.path.join(tmpdir, 'mpdboot')
    write_file(mpdboot, "localhost ifhn=localhost")
    params['mpdbf'] = "--file=%s" % mpdboot

    # create nodes file, with one 'localhost' line per MPI rank
    nodes = os.path.join(tmpdir, 'nodes')
    write_file(nodes, "localhost\n" * int(params['nr_ranks']))
    params['nodesfile'] = "-machinefile %s" % nodes


def get_mpi_cmd_template(mpi_family, params, mpi_version=None):
    """
    Return template for MPI command, for specified MPI family.

    A template specified via the 'mpi_cmd_template' build option takes precedence;
    if none is specified (or it is empty), a default template for the specified MPI family is used.

    :param mpi_family: MPI family to use to determine MPI command template
    :param params: dict with values to complete the MPI command template with (e.g. 'nr_ranks', 'cmd');
                   the specified dict is left untouched, a (possibly extended) deep copy is returned
    :param mpi_version: version of the MPI library; required when a default template for Intel MPI is needed
    :return: tuple with MPI command template, and dict with values to complete that template
    :raises EasyBuildError: if the Intel MPI version is required but unknown, if the MPI family is not known,
                            or if the template does not use every key in the (returned) parameters
    """

    # work on a copy, so the dict passed in by the caller is never modified
    params = copy.deepcopy(params)

    mpi_cmd_template = build_option('mpi_cmd_template')
    if mpi_cmd_template:
        _log.info("Using specified template for MPI commands: %s", mpi_cmd_template)
    else:
        # no (non-empty) custom template specified, so fall back to default for the MPI family;
        # note: an empty template value is treated the same as no value at all,
        # rather than being reported later on as a template in which all keys are missing

        # different known mpirun commands
        mpirun_n_cmd = "mpirun -n %(nr_ranks)s %(cmd)s"
        mpi_cmds = {
            toolchain.OPENMPI: mpirun_n_cmd,
            toolchain.QLOGICMPI: "mpirun -H localhost -np %(nr_ranks)s %(cmd)s",
            toolchain.INTELMPI: mpirun_n_cmd,
            toolchain.MVAPICH2: mpirun_n_cmd,
            toolchain.MPICH: mpirun_n_cmd,
            toolchain.MPICH2: mpirun_n_cmd,
        }

        # Intel MPI mpirun needs more work
        if mpi_family == toolchain.INTELMPI:

            if mpi_version is None:
                raise EasyBuildError("Intel MPI version unknown, can't determine MPI command template!")

            # for old versions of Intel MPI, we need to use MPD
            if LooseVersion(mpi_version) <= LooseVersion('4.1'):
                mpi_cmds[toolchain.INTELMPI] = "mpirun %(mpdbf)s %(nodesfile)s -np %(nr_ranks)s %(cmd)s"
                _setup_impi_mpd(params)

        if mpi_family in mpi_cmds:
            mpi_cmd_template = mpi_cmds[mpi_family]
            _log.info("Using template MPI command '%s' for MPI family '%s'", mpi_cmd_template, mpi_family)
        else:
            raise EasyBuildError("Don't know which template MPI command to use for MPI family '%s'", mpi_family)

    # every available parameter must be used in the template
    tmpls = ['%(' + key + ')s' for key in sorted(params)]
    missing = [tmpl for tmpl in tmpls if tmpl not in mpi_cmd_template]
    if missing:
        raise EasyBuildError("Missing templates in mpi-cmd-template value '%s': %s",
                             mpi_cmd_template, ', '.join(missing))

    return mpi_cmd_template, params


class Mpi(Toolchain):
"""General MPI-like class
can't be used without creating new class M(Mpi)
Expand Down Expand Up @@ -191,79 +282,15 @@ def mpi_cmd_for(self, cmd, nr_ranks):
'cmd': cmd,
}

mpi_cmd_template = build_option('mpi_cmd_template')
if mpi_cmd_template:
self.log.info("Using specified template for MPI commands: %s", mpi_cmd_template)
else:
# different known mpirun commands
mpirun_n_cmd = "mpirun -n %(nr_ranks)s %(cmd)s"
mpi_cmds = {
toolchain.OPENMPI: mpirun_n_cmd,
toolchain.QLOGICMPI: "mpirun -H localhost -np %(nr_ranks)s %(cmd)s",
toolchain.INTELMPI: mpirun_n_cmd,
toolchain.MVAPICH2: mpirun_n_cmd,
toolchain.MPICH: mpirun_n_cmd,
toolchain.MPICH2: mpirun_n_cmd,
}

mpi_family = self.mpi_family()

# Intel MPI mpirun needs more work
if mpi_cmd_template is None:

if mpi_family == toolchain.INTELMPI:

# for old versions of Intel MPI, we need to use MPD
impi_ver = self.get_software_version(self.MPI_MODULE_NAME)[0]
if LooseVersion(impi_ver) <= LooseVersion('4.1'):

mpi_cmds[toolchain.INTELMPI] = "mpirun %(mpdbf)s %(nodesfile)s -np %(nr_ranks)s %(cmd)s"

# set temporary dir for MPD
# note: this needs to be kept *short*,
# to avoid mpirun failing with "socket.error: AF_UNIX path too long"
# exact limit is unknown, but ~20 characters seems to be OK
env.setvar('I_MPI_MPD_TMPDIR', tempfile.gettempdir())
mpd_tmpdir = os.environ['I_MPI_MPD_TMPDIR']
if len(mpd_tmpdir) > 20:
self.log.warning("$I_MPI_MPD_TMPDIR should be (very) short to avoid problems: %s", mpd_tmpdir)

# temporary location for mpdboot and nodes files
tmpdir = tempfile.mkdtemp(prefix='mpi_cmd_for-')

# set PBS_ENVIRONMENT, so that --file option for mpdboot isn't stripped away
env.setvar('PBS_ENVIRONMENT', "PBS_BATCH_MPI")

# make sure we're always using mpd as process manager
# only required for/picked up by Intel MPI v4.1 or higher, no harm done for others
env.setvar('I_MPI_PROCESS_MANAGER', 'mpd')

# create mpdboot file
mpdboot = os.path.join(tmpdir, 'mpdboot')
write_file(mpdboot, "localhost ifhn=localhost")

params.update({'mpdbf': "--file=%s" % mpdboot})

# create nodes file
nodes = os.path.join(tmpdir, 'nodes')
write_file(nodes, "localhost\n" * int(nr_ranks))

params.update({'nodesfile': "-machinefile %s" % nodes})

if mpi_family in mpi_cmds.keys():
mpi_cmd_template = mpi_cmds[mpi_family]
self.log.info("Using template MPI command '%s' for MPI family '%s'", mpi_cmd_template, mpi_family)
else:
raise EasyBuildError("Don't know which template MPI command to use for MPI family '%s'", mpi_family)
if mpi_family == toolchain.INTELMPI:
mpi_version = self.get_software_version(self.MPI_MODULE_NAME)[0]
else:
mpi_version = None

missing = []
for key in sorted(params.keys()):
tmpl = '%(' + key + ')s'
if tmpl not in mpi_cmd_template:
missing.append(tmpl)
if missing:
raise EasyBuildError("Missing templates in mpi-cmd-template value '%s': %s",
mpi_cmd_template, ', '.join(missing))
mpi_cmd_template, params = get_mpi_cmd_template(mpi_family, params, mpi_version=mpi_version)
self.log.info("Using MPI command template '%s' (params: %s)", mpi_cmd_template, params)

try:
res = mpi_cmd_template % params
Expand Down
36 changes: 36 additions & 0 deletions test/framework/toolchain.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@
from test.framework.utilities import EnhancedTestCase, TestLoaderFiltered, find_full_path, init_config

import easybuild.tools.modules as modules
import easybuild.tools.toolchain as toolchain
import easybuild.tools.toolchain.compiler
from easybuild.framework.easyconfig.easyconfig import EasyConfig, ActiveMNS
from easybuild.toolchains.system import SystemToolchain
Expand All @@ -49,6 +50,7 @@
from easybuild.tools.filetools import adjust_permissions, copy_dir, find_eb_script, mkdir, read_file, write_file, which
from easybuild.tools.py2vs3 import string_type
from easybuild.tools.run import run_cmd
from easybuild.tools.toolchain.mpi import get_mpi_cmd_template
from easybuild.tools.toolchain.toolchain import env_vars_external_module
from easybuild.tools.toolchain.utilities import get_toolchain, search_toolchain

Expand Down Expand Up @@ -1027,6 +1029,40 @@ def test_mpi_cmd_for(self):
error_pattern = "Failed to complete MPI cmd template .* with .*: KeyError 'foo'"
self.assertErrorRegex(EasyBuildError, error_pattern, tc.mpi_cmd_for, 'test', 1)

def test_get_mpi_cmd_template(self):
    """Check MPI command template (and parameters) returned by get_mpi_cmd_template for known MPI families."""

    # constants like toolchain.OPENMPI are only in place after search_toolchain was called at least once
    search_toolchain('')

    input_params = {'nr_ranks': 123, 'cmd': 'this_is_just_a_test'}

    # these MPI families all share the same plain 'mpirun -n' template, parameters are passed through as is
    plain_mpirun_families = (toolchain.OPENMPI, toolchain.MPICH, toolchain.MPICH2, toolchain.MVAPICH2)
    for family in plain_mpirun_families:
        tmpl, res_params = get_mpi_cmd_template(family, input_params)
        self.assertEqual(tmpl, "mpirun -n %(nr_ranks)s %(cmd)s")
        self.assertEqual(res_params, input_params)

    # for Intel MPI, the MPI version must be known too
    impi = toolchain.INTELMPI
    error_pattern = "Intel MPI version unknown, can't determine MPI command template!"
    self.assertErrorRegex(EasyBuildError, error_pattern, get_mpi_cmd_template, impi, {})

    # old Intel MPI version: template involves mpdboot & nodes files, which get added to the parameters
    tmpl, res_params = get_mpi_cmd_template(toolchain.INTELMPI, input_params, mpi_version='1.0')
    self.assertEqual(tmpl, "mpirun %(mpdbf)s %(nodesfile)s -np %(nr_ranks)s %(cmd)s")
    self.assertEqual(sorted(res_params.keys()), ['cmd', 'mpdbf', 'nodesfile', 'nr_ranks'])
    self.assertEqual(res_params['cmd'], 'this_is_just_a_test')
    self.assertEqual(res_params['nr_ranks'], 123)

    # (parameter name, expected pattern for its value, separator between option and file path)
    file_param_checks = [
        ('mpdbf', '^--file=.*/mpdboot$', '='),
        ('nodesfile', '^-machinefile /.*/nodes$', ' '),
    ]
    for key, pattern, sep in file_param_checks:
        value = res_params[key]
        regex = re.compile(pattern)
        self.assertTrue(regex.match(value), "'%s' should match pattern '%s'" % (value, regex.pattern))
        # file that the option points to must actually have been created
        self.assertTrue(os.path.exists(value.split(sep)[1]))

def test_prepare_deps(self):
"""Test preparing for a toolchain when dependencies are involved."""
tc = self.get_toolchain('GCC', version='6.4.0-2.28')
Expand Down