Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions .github/workflows/hvd-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ concurrency:
jobs:
horovod-tests:
runs-on: ubuntu-latest
timeout-minutes: 60
timeout-minutes: 120
strategy:
matrix:
python-version: ["3.11"]
Expand Down Expand Up @@ -64,15 +64,15 @@ jobs:
#install other dependencies
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
pip install -r requirements-dev.txt

# Install Horovod from source and apply a patch to build with recent pytorch
# We can't use pip install <whatever> as build-env can't find pytorch and
# We can't use pip install <whatever> as build-env can't find pytorch and
# `--no-build-isolation` does not work with horovod setup.py
git clone --recursive https://github.com/horovod/horovod.git /tmp/horovod
cd /tmp/horovod
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" CMakeLists.txt
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" horovod/torch/CMakeLists.txt
HOROVOD_WITH_PYTORCH=1 python setup.py install
sed -i "s/CMAKE_CXX_STANDARD 14/CMAKE_CXX_STANDARD 17/g" horovod/torch/CMakeLists.txt
HOROVOD_WITH_PYTORCH=1 python setup.py install
cd -
# test the installation:
python -c "import horovod.torch as hvd; hvd.mpi_ops.Sum"
Expand All @@ -90,11 +90,11 @@ jobs:
- name: Run Tests
uses: nick-fields/retry@v3
with:
max_attempts: 5
timeout_minutes: 15
max_attempts: 3
timeout_minutes: 40
shell: bash
command: bash tests/run_cpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 bash tests/run_cpu_tests.sh
command: USE_XDIST=0 bash tests/run_cpu_tests.sh
new_command_on_retry: USE_LAST_FAILED=1 USE_XDIST=0 bash tests/run_cpu_tests.sh

- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
Expand Down
2 changes: 1 addition & 1 deletion tests/ignite/distributed/utils/test_horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,7 @@ def test_idist_methods_overhead_hvd(gloo_hvd_executor):
sync_model = False
gloo_hvd_executor(_test_idist_methods_overhead, (ok_factor, sync_model), np=np, do_init=True)

ok_factor = 3.0
ok_factor = 3.5
sync_model = True
gloo_hvd_executor(_test_idist_methods_overhead, (ok_factor, sync_model), np=np, do_init=True)

Expand Down
10 changes: 8 additions & 2 deletions tests/run_cpu_tests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,16 +6,22 @@ skip_distrib_tests=${SKIP_DISTRIB_TESTS:-0}
use_last_failed=${USE_LAST_FAILED:-0}
match_tests_expression=${1:-""}

use_xdist=${USE_XDIST:-1}
core_args="-vvv tests/ignite"
if [ "${use_xdist}" -eq "1" ]; then
core_args="${core_args} --tx 4*popen//python=python"
fi

CUDA_VISIBLE_DEVICES="" run_tests \
--core_args "--tx 4*popen//python=python -vvv tests/ignite" \
--core_args "${core_args}" \
--cache_dir ".cpu-not-distrib" \
--skip_distrib_tests "${skip_distrib_tests}" \
--use_coverage 1 \
--match_tests_expression "${match_tests_expression}" \
--use_last_failed ${use_last_failed}

# https://pubs.opengroup.org/onlinepubs/009695399/utilities/xcu_chap02.html#tag_02_06_02
if [ "${skip_distrib_tests}" -eq "1" ]; then
if [ "${skip_distrib_tests}" -eq "1" ] || [ "${use_xdist}" -eq "0" ]; then
exit 0
fi

Expand Down
Loading