152 changes: 152 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.0.1-foss-2022b.eb
@@ -0,0 +1,152 @@
name = 'PyTorch'
version = '2.0.1'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2022b'}

source_urls = [GITHUB_RELEASE]
sources = ['%(namelower)s-v%(version)s.tar.gz']
patches = [
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.11.1_skip-test_init_from_local_shards.patch',
'PyTorch-1.12.1_add-hypothesis-suppression.patch',
'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch',
'PyTorch-1.12.1_fix-TestTorch.test_to.patch',
'PyTorch-1.12.1_skip-test_round_robin.patch',
'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch',
'PyTorch-1.13.1_fix-protobuf-dependency.patch',
'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch',
'PyTorch-1.13.1_skip-failing-singular-grad-test.patch',
'PyTorch-1.13.1_skip-tests-without-fbgemm.patch',
'PyTorch-2.0.1_add-missing-vsx-vector-shift-functions.patch',
'PyTorch-2.0.1_avoid-test_quantization-failures.patch',
'PyTorch-2.0.1_disable-gcc12-warning.patch',
'PyTorch-2.0.1_disable-test-sharding.patch',
'PyTorch-2.0.1_fix-numpy-compat.patch',
'PyTorch-2.0.1_fix-shift-ops.patch',
'PyTorch-2.0.1_fix-skip-decorators.patch',
'PyTorch-2.0.1_fix-test_memory_profiler.patch',
'PyTorch-2.0.1_fix-test-ops-conf.patch',
'PyTorch-2.0.1_fix-torch.compile-on-ppc.patch',
'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch',
'PyTorch-2.0.1_fix-vsx-loadu.patch',
'PyTorch-2.0.1_ignore_unexpected_success_in_test_torchinductor_opinfo.patch',
'PyTorch-2.0.1_no-cuda-stubs-rpath.patch',
'PyTorch-2.0.1_remove-test-requiring-online-access.patch',
'PyTorch-2.0.1_skip-diff-test-on-ppc.patch',
'PyTorch-2.0.1_skip-failing-gradtest.patch',
'PyTorch-2.0.1_skip-test_baddbmm_cpu_bfloat16.patch',
'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch',
'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch',
]
checksums = [
{'pytorch-v2.0.1.tar.gz': '9c564ca440265c69400ef5fdd48bf15e28af5aa4bed84c95efaad960a6699998'},
{'PyTorch-1.7.0_disable-dev-shm-test.patch': '622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a'},
{'PyTorch-1.11.1_skip-test_init_from_local_shards.patch':
'4aeb1b0bc863d4801b0095cbce69f8794066748f0df27c6aaaf729c5ecba04b7'},
{'PyTorch-1.12.1_add-hypothesis-suppression.patch':
'e71ffb94ebe69f580fa70e0de84017058325fdff944866d6bd03463626edc32c'},
{'PyTorch-1.12.1_fix-test_cpp_extensions_jit.patch':
'1efc9850c431d702e9117d4766277d3f88c5c8b3870997c9974971bce7f2ab83'},
{'PyTorch-1.12.1_fix-TestTorch.test_to.patch': '75f27987c3f25c501e719bd2b1c70a029ae0ee28514a97fe447516aee02b1535'},
{'PyTorch-1.12.1_skip-test_round_robin.patch': '63d4849b78605aa088fdff695637d9473ea60dee603a3ff7f788690d70c55349'},
{'PyTorch-1.13.1_fix-gcc-12-warning-in-fbgemm.patch':
'5c7be91a6096083a0b1315efe0001537499c600f1f569953c6a2c7f4cc1d0910'},
{'PyTorch-1.13.1_fix-protobuf-dependency.patch':
'8bd755a0cab7233a243bc65ca57c9630dfccdc9bf8c9792f0de4e07a644fcb00'},
{'PyTorch-1.13.1_fix-warning-in-test-cpp-api.patch':
'bdde0f2105215c95a54de64ec4b1a4520528510663174fef6d5b900eb1db3937'},
{'PyTorch-1.13.1_skip-failing-singular-grad-test.patch':
'72688a57b2bb617665ad1a1d5e362c5111ae912c10936bb38a089c0204729f48'},
{'PyTorch-1.13.1_skip-tests-without-fbgemm.patch':
'481e595f673baf8ae58b41697a6792b83048b0264aa79b422f48cd8c22948bb7'},
{'PyTorch-2.0.1_add-missing-vsx-vector-shift-functions.patch':
'da44961d6c204403ba0c4b88cedccf06a7a3d24f29c4398545f96efae7a45c95'},
{'PyTorch-2.0.1_avoid-test_quantization-failures.patch':
'02e3f47e4ed1d7d6077e26f1ae50073dc2b20426269930b505f4aefe5d2f33cd'},
{'PyTorch-2.0.1_disable-gcc12-warning.patch': 'f558dfc8f7cdcdc74c4c58ef7e8fe6d67870aec6386ac0d923f1b745d108eec7'},
{'PyTorch-2.0.1_disable-test-sharding.patch': 'a1ed7f21c9a269ea039a07a3d6574f885787b30ca5687143c96e096d31066cca'},
{'PyTorch-2.0.1_fix-numpy-compat.patch': 'f3e5798193e0909a415d824f13772973200965db84476c1737824f2735f2db94'},
{'PyTorch-2.0.1_fix-shift-ops.patch': '5ee655d5dba56d801d5618543b6ca299fa874939a3471f7b5449bfcb7f3f18c7'},
{'PyTorch-2.0.1_fix-skip-decorators.patch': '2039012cef45446065e1a2097839fe20bb29fe3c1dcc926c3695ebf29832e920'},
{'PyTorch-2.0.1_fix-test_memory_profiler.patch':
'fd03117c46f59c1c62227d31c410c4cdd98fd35410976758cb9e7ec947582ddb'},
{'PyTorch-2.0.1_fix-test-ops-conf.patch': '0f995e4f89baf3cbeb8666cbfe694666a2ef2bc53d97d6301f768b3ff9001fa4'},
{'PyTorch-2.0.1_fix-torch.compile-on-ppc.patch':
'20f9172ae696da0c5c7b3bae6f0bf1221192cb1cbac3a44526a415087834bee7'},
{'PyTorch-2.0.1_fix-ub-in-inductor-codegen.patch':
'1b37194f55ae678f3657b8728dfb896c18ffe8babe90987ce468c4fa9274f357'},
{'PyTorch-2.0.1_fix-vsx-loadu.patch': 'a0ffa61da2d47c6acd09aaf6d4791e527d8919a6f4f1aa7ed38454cdcadb1f72'},
{'PyTorch-2.0.1_ignore_unexpected_success_in_test_torchinductor_opinfo.patch':
'57e2985a5b7085c2786e4b0c4a5f0c81f6b2ae9d5804bbd552b06e8b1570f4c4'},
{'PyTorch-2.0.1_no-cuda-stubs-rpath.patch': '8902e58a762240f24cdbf0182e99ccdfc2a93492869352fcb4ca0ec7e407f83a'},
{'PyTorch-2.0.1_remove-test-requiring-online-access.patch':
'721ab0d35ed0ff8a46cb84ced5a98c0fb8ce6143cf6cea80b1360d3d7f64f584'},
{'PyTorch-2.0.1_skip-diff-test-on-ppc.patch': 'f6e39cd774e5663df25507a73d37ad598157c2eadb2f47ca20a537dbe4b3e14f'},
{'PyTorch-2.0.1_skip-failing-gradtest.patch': '8030bdec6ba49b057ab232d19a7f1a5e542e47e2ec340653a246ec9ed59f8bc1'},
{'PyTorch-2.0.1_skip-test_baddbmm_cpu_bfloat16.patch':
'199005bbbb913837e557358dee31535d8e3f63af9ac7cdcece624ab8e572e28a'},
{'PyTorch-2.0.1_skip-test_shuffle_reproducibility.patch':
'7047862abc1abaff62954da59700f36d4f39fcf83167a638183b1b7f8fec78ae'},
{'PyTorch-2.0.1_skip-tests-skipped-in-subprocess.patch':
'166c134573a95230e39b9ea09ece3ad8072f39d370c9a88fb2a1e24f6aaac2b5'},
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.24.3'),
('hypothesis', '6.68.2'),
# For tests
('pytest-rerunfailures', '12.0'),
('pytest-shard', '0.1.2'),
]

dependencies = [
('Ninja', '1.11.1'), # Required for JIT compilation of C++ extensions
('Python', '3.10.8'),
('protobuf', '23.0'),
('protobuf-python', '4.23.0'),
('pybind11', '2.10.3'),
('SciPy-bundle', '2023.02'),
('PyYAML', '6.0'),
('MPFR', '4.2.0'),
('GMP', '6.2.1'),
('numactl', '2.0.16'),
('FFmpeg', '5.1.2'),
('Pillow', '9.4.0'),
('expecttest', '0.1.3'),
('networkx', '3.0'),
('sympy', '1.12'),
]

excluded_tests = {
'': [
# This test seems to take too long on NVIDIA Ampere at least.
'distributed/test_distributed_spawn',
# Broken on CUDA 11.6/11.7: https://github.com/pytorch/pytorch/issues/75375
'distributions/test_constraints',
# no xdoctest
'doctests',
# failing on broadwell
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'test_native_mha',
# intermittent failures on various systems
# See https://github.com/easybuilders/easybuild-easyconfigs/issues/17712
'distributed/rpc/test_tensorpipe_agent',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

# test_quantization in particular has a few corner cases that are triggered by random input values
# and cannot easily be avoided, see https://github.com/pytorch/pytorch/issues/107030.
# So allow a small number of tests to fail, as the tests usually succeed.
max_failed_tests = 3

tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'ai'
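
Note on the sanity check: the `tests = ['PyTorch-check-cpp-extension.py']` line above runs a small post-install check script whose contents are not part of this diff. Below is a minimal, hypothetical sketch of what such a check could do — it is an illustration only, not the actual script shipped with EasyBuild. It assumes `torch.utils.cpp_extension.load_inline` is used to JIT-compile a trivial C++ extension, which exercises pybind11, the C++ toolchain and the Ninja runtime dependency declared above.

# Hypothetical sketch (not the real PyTorch-check-cpp-extension.py):
# JIT-compile a trivial C++ extension and call it once.
import torch
from torch.utils.cpp_extension import load_inline

cpp_source = """
torch::Tensor add_one(torch::Tensor x) {
    return x + 1;
}
"""

# load_inline writes the source to a temporary build directory, compiles it
# (by default with Ninja) and imports the resulting module.
ext = load_inline(
    name="check_cpp_extension",
    cpp_sources=[cpp_source],
    functions=["add_one"],
    verbose=True,
)

result = ext.add_one(torch.zeros(3))
assert torch.equal(result, torch.ones(3)), "C++ extension returned an unexpected result"
print("C++ extension check passed")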
32 changes: 32 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.0.1_disable-gcc12-warning.patch
@@ -0,0 +1,32 @@
GCC 12 emits a false-positive warning when compiling for some architectures, e.g. Intel Sapphire Rapids.
See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=112370

Suppress this warning such that the build doesn't error.

Author: Alexander Grund (TU Dresden)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 471fc8a8d3d..5eb7b432630 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -557,6 +557,7 @@ string(APPEND CMAKE_CUDA_FLAGS " -Xfatbin -compress-all")
if(NOT MSVC)
string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -g -lineinfo --source-in-ptx")
string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -g -lineinfo --source-in-ptx")
+ append_cxx_flag_if_supported("-Wno-free-nonheap-object" CMAKE_CXX_FLAGS)
endif(NOT MSVC)

# Set INTERN_BUILD_MOBILE for all mobile builds. Components that are not
diff --git a/cmake/public/utils.cmake b/cmake/public/utils.cmake
index 60cca5383dd..76c02d7479f 100644
--- a/cmake/public/utils.cmake
+++ b/cmake/public/utils.cmake
@@ -548,6 +548,8 @@ function(torch_update_find_cuda_flags)
endif()
endfunction()

+include(CheckCXXCompilerFlag)
+
##############################################################################
# CHeck if given flag is supported and append it to provided outputvar
# Also define HAS_UPPER_CASE_FLAG_NAME variable
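
The helper used in this patch, append_cxx_flag_if_supported, builds on CMake's CheckCXXCompilerFlag module (included by the second hunk): it compiles a tiny test program with the candidate flag and only appends the flag when the compiler accepts it. A rough, hypothetical Python sketch of the same idea follows — assuming a g++ on PATH; the real logic lives in CMake, not in Python, and is not part of the patch.

# Rough illustration of "append a flag only if the compiler supports it".
import os
import subprocess
import tempfile

def cxx_flag_supported(flag, compiler="g++"):
    """Return True if compiling a trivial file with `flag` succeeds."""
    with tempfile.TemporaryDirectory() as tmpdir:
        src = os.path.join(tmpdir, "check.cpp")
        with open(src, "w") as f:
            f.write("int main() { return 0; }\n")
        # -Werror turns warnings produced by the candidate flag into errors
        result = subprocess.run(
            [compiler, "-Werror", flag, "-c", src, "-o", os.path.join(tmpdir, "check.o")],
            capture_output=True,
        )
        return result.returncode == 0

cxx_flags = []
if cxx_flag_supported("-Wno-free-nonheap-object"):
    cxx_flags.append("-Wno-free-nonheap-object")
print(cxx_flags)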
25 changes: 25 additions & 0 deletions easybuild/easyconfigs/p/PyTorch/PyTorch-2.0.1_skip-test_baddbmm_cpu_bfloat16.patch
@@ -0,0 +1,25 @@
test_baddbmm_cpu_bfloat16 in test_linalg.py fails with
> AssertionError: Tensor-likes are not close!
>
> Mismatched elements: 1387 / 6000 (23.1%)
> Greatest absolute difference: 3.98046875 at index (0, 11, 7) (up to 0.5 allowed)
> Greatest relative difference: 1324.7142857142858 at index (0, 4, 9) (up to 0.016 allowed)

This also happens with the official 2.0.1 pip package and appears to be a known flaky test: https://github.com/pytorch/pytorch/issues/103046
So treat the failure as expected and skip the test.

Author: Alexander Grund (TU Dresden)

diff --git a/test/test_linalg.py b/test/test_linalg.py
index 29a0e482d86..d195ad60add 100644
--- a/test/test_linalg.py
+++ b/test/test_linalg.py
@@ -5871,7 +5871,7 @@ scipy_lobpcg | {:10.2e} | {:10.2e} | {:6} | N/A

@precisionOverride({torch.half: 0.1, torch.bfloat16: 0.5})
@onlyNativeDeviceTypes
- @dtypes(*floating_and_complex_types_and(torch.bfloat16))
+ @dtypes(*floating_and_complex_types())
@tf32_on_and_off(0.05)
def test_baddbmm(self, device, dtype):
if self.device_type == 'cuda' and dtype is torch.bfloat16 and not SM53OrLater:
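
The change above drops torch.bfloat16 from the @dtypes decorator, so the flaky bfloat16 variant of test_baddbmm is no longer generated. For illustration, the precision gap that makes this variant fragile can be reproduced with a few lines of Python — a hedged sketch with made-up shapes and a fixed seed; the actual mismatch in the test depends on the random values drawn and on the test's own tolerance machinery.

# Sketch: compare baddbmm in bfloat16 and float32 against a float64 reference.
# bfloat16 has only 7 mantissa bits (vs 23 for float32), so its error is
# orders of magnitude larger and, with unlucky inputs, can exceed the test's
# 0.5 absolute tolerance.
import torch

torch.manual_seed(0)
inp = torch.randn(1, 12, 10)
batch1 = torch.randn(1, 12, 50)
batch2 = torch.randn(1, 50, 10)

ref = torch.baddbmm(inp.double(), batch1.double(), batch2.double())
bf16 = torch.baddbmm(inp.bfloat16(), batch1.bfloat16(), batch2.bfloat16())
f32 = torch.baddbmm(inp, batch1, batch2)

print("bfloat16 max abs error:", (bf16.double() - ref).abs().max().item())
print("float32  max abs error:", (f32.double() - ref).abs().max().item())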