diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2022a-CUDA-11.7.0.eb b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2022a-CUDA-11.7.0.eb index 7f1ab7e67f8..528fb671557 100644 --- a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2022a-CUDA-11.7.0.eb +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1-foss-2022a-CUDA-11.7.0.eb @@ -24,6 +24,8 @@ patches = [ 'PyTorch-1.11.0_increase_c10d_gloo_timeout.patch', 'PyTorch-1.11.0_increase-distributed-test-timeout.patch', 'PyTorch-1.11.0_install-vsx-vec-headers.patch', + 'PyTorch-1.11.1_skip-test_init_from_local_shards.patch', + 'PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch', 'PyTorch-1.12.1_fix-cuda-gcc-version-check.patch', 'PyTorch-1.12.1_fix-skip-decorators.patch', 'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch', @@ -41,7 +43,7 @@ patches = [ 'PyTorch-1.12.1_remove-flaky-test-in-testnn.patch', 'PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch', 'PyTorch-1.12.1_skip-failing-grad-test.patch', - 'PyTorch-1.12.1_skip-test_round_robin_create_destroy.patch', + 'PyTorch-1.12.1_skip-test_round_robin.patch', ] checksums = [ '031c71073db73da732b5d01710220564ce6dd88d812ba053f0cc94296401eccb', # pytorch-v1.12.1.tar.gz @@ -64,6 +66,10 @@ checksums = [ # PyTorch-1.11.0_increase-distributed-test-timeout.patch '087ad20163a1291773ae3457569b80523080eb3731e210946459b2333a919f3f', 'f2e6b9625733d9a471bb75e1ea20e28814cf1380b4f9089aa838ee35ddecf07d', # PyTorch-1.11.0_install-vsx-vec-headers.patch + # PyTorch-1.11.1_skip-test_init_from_local_shards.patch + '4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7', + # PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch + 'd97cd6b0570a167ecc3e631dc4ea884d95ace285cc38aa980566f4fec2c0d089', # PyTorch-1.12.1_fix-cuda-gcc-version-check.patch 'a650f4576f06c749f244cada52ff9c02499fa8f182019129488db3845e0756ab', 'e3ca6e42b2fa592ea095939fb59ab875668a058479407db3f3684cc5c6f4146c', # PyTorch-1.12.1_fix-skip-decorators.patch @@ -91,8 +97,8 @@ checksums = [ # PyTorch-1.12.1_skip-ao-sparsity-test-without-fbgemm.patch 'edd464ec8c37b44c07a72008d732604f6837f2dd61c7810c391a86ba4945ca39', '1c89e7e67287fe6b9a95480a4178d3653b94d0ab2fe68edf227606c8ae548fdc', # PyTorch-1.12.1_skip-failing-grad-test.patch - # PyTorch-1.12.1_skip-test_round_robin_create_destroy.patch - '1435fcac3234edc865479199673b902eb67f6a2bd046af7d731141f03594666d', + # PyTorch-1.12.1_skip-test_round_robin.patch + '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349', ] osdependencies = [OS_PKG_IBVERBS_DEV] @@ -131,6 +137,9 @@ excluded_tests = { 'distributed/test_distributed_spawn', # Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375 'distributions/test_constraints', + # Those 2 abort on some machines. Skip for now + 'distributed/fsdp/test_fsdp_input', + 'distributed/fsdp/test_fsdp_mixed_precision', ] } diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch new file mode 100644 index 00000000000..c82cc060467 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_fix-autograd-thread_shutdown-test.patch @@ -0,0 +1,24 @@ +Fix flaky test_thread_shutdown in test_autograd + +From https://github.com/pytorch/pytorch/pull/86464 + +Backport: Alexander Grund (TU Dresden) + +diff --git a/test/test_autograd.py b/test/test_autograd.py +index da1e859682e..0c0bc4f1a2a 100644 +--- a/test/test_autograd.py ++++ b/test/test_autograd.py +@@ -4320,8 +4320,12 @@ class MyFunction(Function): + def backward(ctx, grad): + return grad + ++# Run on cuda if it is available to ensure that the worker thread ++# is properly initialized by the time we exit. ++device = "cuda" if torch.cuda.is_available() else "cpu" ++ + for shape in [(1,), ()]: +- v = torch.ones(shape, requires_grad=True) ++ v = torch.ones(shape, requires_grad=True, device=device) + MyFunction.apply(v).backward() + """ + s = TestCase.runWithPytorchAPIUsageStderr(code) diff --git a/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_skip-test_round_robin.patch b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_skip-test_round_robin.patch new file mode 100644 index 00000000000..ebeec6b80b5 --- /dev/null +++ b/easybuild/easyconfigs/p/PyTorch/PyTorch-1.12.1_skip-test_round_robin.patch @@ -0,0 +1,35 @@ +test_round_robin & test_round_robin_create_destroy of distributed/test_c10d_gloo may run into timeouts. +So simply skip the on all OS (not only on Windows), the existing skip marker suggest that this is OK. + +Author: Alexander Grund (TU Dresden) + +diff --git a/test/distributed/test_c10d_gloo.py b/test/distributed/test_c10d_gloo.py +index e49d65ea33d..b4fb75a1b11 100644 +--- a/test/distributed/test_c10d_gloo.py ++++ b/test/distributed/test_c10d_gloo.py +@@ -10,6 +10,7 @@ import sys + import tempfile + from functools import reduce + from itertools import groupby ++from unittest import skip + + import torch + import torch.distributed as c10d +@@ -1415,7 +1415,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase): + for i, tensor in enumerate(tensors): + self.assertEqual(torch.full(size, float(i * self.world_size)), tensor) + +- @skip_if_win32() ++ @skip("Occasionally times out") + @requires_gloo() + def test_round_robin(self): + num_process_groups = 2 +@@ -1438,7 +1439,7 @@ class ProcessGroupGlooTest(MultiProcessTestCase): + pg.broadcast(tensor, root=0).wait() + self.assertEqual(torch.full([100, 100], 0.0), tensor) + +- @skip_if_win32() ++ @skip("Occasionally times out") + @requires_gloo() + def test_round_robin_create_destroy(self): + store = c10d.FileStore(self.file_name, self.world_size)