@@ -0,0 +1,48 @@
easyblock = 'PythonBundle'

name = 'Horovod'
version = '0.23.0'
local_pt_version = '1.10.0'
local_cuda_suffix = '-CUDA-%(cudaver)s'
versionsuffix = local_cuda_suffix + '-PyTorch-%s' % local_pt_version

homepage = 'https://github.com/uber/horovod'
description = "Horovod is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet."

toolchain = {'name': 'foss', 'version': '2021a'}

builddependencies = [
('CMake', '3.20.1'),
('flatbuffers', '2.0.0'),
]
dependencies = [
('Python', '3.9.5'),
('PyYAML', '5.4.1'),
('CUDA', '11.3.1', '', True),
('NCCL', '2.10.3', local_cuda_suffix),
('PyTorch', local_pt_version, local_cuda_suffix),
]

use_pip = True
sanity_pip_check = True

preinstallopts = 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
preinstallopts += 'HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
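# The variables above are picked up by Horovod's setup.py during the pip install:
# MPI support is required, NCCL backs the GPU allreduce and broadcast collectives,
# and only the PyTorch frontend is built (TensorFlow and MXNet are skipped).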

parallel = 1 # Bug in CMake causes a race condition on horovod_cuda_kernels_generated_cuda_kernels.cu.o.NVCC-depend

exts_list = [
('cloudpickle', '2.0.0', {
'checksums': ['5cd02f3b417a783ba84a4ec3e290ff7929009fe51f6405423cfccfadd43ba4a4'],
}),
('horovod', version, {
'checksums': ['72ab3e5f59df6a000473999937e52e6831ad1a5e4e7bd23885a04bcdd4d8691c'],
}),
]

sanity_check_paths = {
'files': ['bin/horovodrun'],
'dirs': ['lib/python%(pyshortver)s/site-packages'],
}

moduleclass = 'tools'
@@ -0,0 +1,119 @@
name = 'PyTorch'
version = '1.10.0'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2021a'}

sources = [{
'filename': '%(name)s-%(version)s.tar.gz',
'git_config': {
'url': 'https://github.com/pytorch',
'repo_name': 'pytorch',
'tag': 'v%(version)s',
'recursive': True,
},
}]
patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.7.1_correctly-pass-jit_opt_level.patch',
'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
'PyTorch-1.8.1_increase-distributed-test-timeout.patch',
'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch',
'PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch',
'PyTorch-1.10.0_fix-test-cond-cpu.patch',
'PyTorch-1.10.0_fix-vnni-detection.patch',
'PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch',
'PyTorch-1.10.0_skip_failing_ops_tests.patch',
'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
]
checksums = [
None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
# PyTorch-1.7.1_correctly-pass-jit_opt_level.patch
'd4d967d47f8a6172fcbf57f0a61835482968850967c4fdb01108b720696a988d',
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.8.1_increase-distributed-test-timeout.patch
'7a6e512274f0b8673f4f207a5bc53387d88be7e79833f42d20365668b2118071',
# PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
# PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch
'426c9ead1a74b656748d4c8bf8afd4303d8b9f2394ad22b21a845d07c8ca1d12',
# PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch
'67152215e4530a9b1d7349fb20864445fd815288f04ab9e96e45c73b2d87827a',
# PyTorch-1.10.0_fix-test-cond-cpu.patch
'51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03',
# PyTorch-1.10.0_fix-vnni-detection.patch
'1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc',
# PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch
'e65afb01786f7f030ccb5faada1eb474bb0c418bcadcf1baaa71a4fa2f3f4240',
# PyTorch-1.10.0_skip_failing_ops_tests.patch
'399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab',
# PyTorch-1.10.0_skip_nan_tests_openblas.patch
'7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf',
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.20.1'),
('hypothesis', '6.13.1'),
]

dependencies = [
('CUDA', '11.3.1', '', True),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.5'),
('protobuf', '3.17.3'),
('protobuf-python', '3.17.3'),
('pybind11', '2.6.2'),
('SciPy-bundle', '2021.05'),
('typing-extensions', '3.10.0.0'),
('PyYAML', '5.4.1'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.3.2'),
('Pillow', '8.2.0'),
('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True),
('magma', '2.6.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']

custom_opts = ["USE_CUPTI_SO=1"]
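# USE_CUPTI_SO=1 links the shared CUPTI library rather than the static one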

excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
        # These tests fail on A10s at the very least; they hang no matter how long the timeout is set.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# 'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
        # Tests from this suite often time out. The process group backend is deprecated anyway.
# 'distributed/rpc/test_process_group_agent',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

sanity_check_commands = ["python -c 'import caffe2.python'"]
tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'devel'
@@ -0,0 +1,31 @@
A reinterpret_cast to an unrelated type is undefined behavior.
This causes real issues due to misoptimizations on at least GCC 10.2 on POWER.
See https://github.com/pytorch/pytorch/issues/58031

Author: Alexander Grund (TU Dresden)

Adapted for PyTorch 1.10, where this code now lives in vec_base.h and is templated.
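
For illustration, a minimal standalone sketch (made-up names, not PyTorch code) of why the old pattern is undefined and what the patch does instead: reading T storage through an unrelated intmax_t lvalue violates strict aliasing, while copying the bytes into a properly typed buffer first (as Vectorized<T>::store does in the patched code) is well-defined.

#include <cstdint>
#include <cstring>

// UB: float storage is read through an unrelated intmax_t lvalue,
// so the compiler is free to misoptimize the load.
std::intmax_t bad_read(const float* data) {
    const std::intmax_t* p = reinterpret_cast<const std::intmax_t*>(data);
    return p[0];  // strict-aliasing violation
}

// Well-defined: the bytes are copied into a properly typed buffer first
// (assumes data points to at least sizeof(std::intmax_t) bytes).
std::intmax_t good_read(const float* data) {
    std::intmax_t buffer;
    std::memcpy(&buffer, data, sizeof(buffer));  // memcpy may alias anything
    return buffer;
}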

diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
index 697996ab8e..1663ae239a 100644
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@@ -701,12 +701,14 @@ inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {

template<class T, typename Op>
static inline Vectorized<T> bitwise_binary_op(const Vectorized<T> &a, const Vectorized<T> &b, Op op) {
- static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t);
+ constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t);
+ __at_align__ intmax_t buffer_a[element_no];
+ __at_align__ intmax_t buffer_b[element_no];
__at_align__ intmax_t buffer[element_no];
- const intmax_t *a_ptr = reinterpret_cast<const intmax_t*>((const T*) a);
- const intmax_t *b_ptr = reinterpret_cast<const intmax_t*>((const T*) b);
+ a.store(buffer_a);
+ b.store(buffer_b);
for (uint32_t i = 0U; i < element_no; ++ i) {
- buffer[i] = op(a_ptr[i], b_ptr[i]);
+ buffer[i] = op(buffer_a[i], buffer_b[i]);
}
return Vectorized<T>::loadu(buffer);
}
@@ -0,0 +1,188 @@
From: Alexander Grund <[email protected]>
Date: Tue, 18 May 2021 15:08:41 +0200
Subject: [PATCH 1/2] Fix usage of TORCH_INTERNAL_ASSERT with message

Using only a string as the argument to TORCH_INTERNAL_ASSERT will never
trigger a failure, because a string is always a truthy value.
This hides actual bugs and makes users and devs think everything worked
when it did not.
Change these call sites to TORCH_INTERNAL_ASSERT(false, "msg").
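
For illustration, a minimal sketch with a hypothetical MY_ASSERT macro (not the real TORCH_INTERNAL_ASSERT definition, though it treats its first argument the same way): the first argument is the condition, and a string literal decays to a non-null pointer, so an assert given only a message can never fire.

#include <cstdio>
#include <cstdlib>

// First argument is the condition; any extra arguments are ignored here.
#define MY_ASSERT(cond, ...)                                    \
    do {                                                        \
        if (!(cond)) {                                          \
            std::fprintf(stderr, "assert failed: %s\n", #cond); \
            std::abort();                                       \
        }                                                       \
    } while (0)

int main() {
    MY_ASSERT("unsupported dtype");         // never fires: the string is truthy
    MY_ASSERT(false, "unsupported dtype");  // fires as intended
    return 0;
}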

Subject: [PATCH 2/2] Add missing skip decorator for
test_preserve_bundled_inputs_methods

This test uses optimize_for_mobile, which requires NNPACK to work.

diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp
index c4edadb03e..e889cd03a8 100644
--- a/aten/src/ATen/native/BinaryOps.cpp
+++ b/aten/src/ATen/native/BinaryOps.cpp
@@ -106,6 +106,7 @@ Tensor& add_relu_impl(
max_val = std::numeric_limits<double>::max();
} else {
TORCH_INTERNAL_ASSERT(
+ false,
"Unsupported datatype for add_relu:", self.dtype().name());
}

diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
index 050fdce2ca..7e72263917 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
@@ -780,6 +780,7 @@ class QEmbeddingBag final {
include_last_offset);
} else {
TORCH_INTERNAL_ASSERT(
+ false,
"Currently only support 8-bit embedding_bag quantization");
}
}
@@ -808,6 +809,7 @@ class QEmbedding final {

} else {
TORCH_INTERNAL_ASSERT(
+ false,
"Currently only support 8-bit embedding quantization");
}
return output;
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
index 6de646acfe..66341c959d 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
@@ -131,6 +131,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {

if (conv_p.per_channel && conv_p.ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) {
TORCH_INTERNAL_ASSERT(
+ false,
"Per channel quantized weights are not supported for XZP kernels");
}

@@ -140,6 +141,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
static_cast<pytorch_qnnp_operator_t>(calloc(1, sizeof(struct pytorch_qnnp_operator)));
if (convolution == nullptr) {
TORCH_INTERNAL_ASSERT(
+ false,
"failed to allocate %zu bytes for pytorch_qnnp_operator structure",
sizeof(struct pytorch_qnnp_operator));
}
@@ -406,7 +408,7 @@ std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
128);
}
} else {
- TORCH_INTERNAL_ASSERT("Unsupported quantization scheme.");
+ TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
}
at::Tensor weight_scales =
at::empty(
@@ -423,7 +425,7 @@ std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
weight_contig.q_per_channel_scales()[i].item<float>();
}
} else {
- TORCH_INTERNAL_ASSERT("Unsupported quantization scheme.");
+ TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
}
for (int i = num_output_channels; i < num_output_channels_padded; ++i) {
weight_scales_data[i] = 1.f;
diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py
index 11ef019a26..7b5ac1a239 100644
--- a/test/test_mobile_optimizer.py
+++ b/test/test_mobile_optimizer.py
@@ -269,6 +269,9 @@ class TestOptimizer(TestCase):
bi_module_lint_list = generate_mobile_module_lints(bi_module)
self.assertEqual(len(bi_module_lint_list), 0)

+ @unittest.skipUnless(torch.backends.xnnpack.enabled,
+ " XNNPACK must be enabled for these tests."
+ " Please build with USE_XNNPACK=1.")
def test_preserve_bundled_inputs_methods(self):
class MyBundledInputModule(torch.nn.Module):
def __init__(self):
diff --git a/torch/csrc/jit/api/module.cpp b/torch/csrc/jit/api/module.cpp
index 38592b80b9..8f9508321b 100644
--- a/torch/csrc/jit/api/module.cpp
+++ b/torch/csrc/jit/api/module.cpp
@@ -305,7 +305,7 @@ void Module::train(bool on) {
if (auto slot = m._ivalue()->type()->findAttributeSlot("training")) {
m._ivalue()->setSlot(*slot, on);
} else {
- TORCH_INTERNAL_ASSERT("'training' attribute not found");
+ TORCH_INTERNAL_ASSERT(false, "'training' attribute not found");
}
}
}
diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
index 53a13b6cf1..93c2b5a7da 100644
--- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
+++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
@@ -304,6 +304,7 @@ Node* insertEmbeddingBagOps(Node* observer, const std::string& op_name) {
quant_fn = "quantized::embedding_bag_byte_rowwise_offsets";
} else {
TORCH_INTERNAL_ASSERT(
+ false,
"Graph Mode Quantization currently supports 4-bit and 8-bit embedding bag quantization.");
}

diff --git a/torch/csrc/jit/passes/xnnpack_rewrite.cpp b/torch/csrc/jit/passes/xnnpack_rewrite.cpp
index 3be480068c..2289f028ae 100644
--- a/torch/csrc/jit/passes/xnnpack_rewrite.cpp
+++ b/torch/csrc/jit/passes/xnnpack_rewrite.cpp
@@ -405,21 +405,25 @@ script::Module optimizeForMobile(

void insertPrePackedOps(std::shared_ptr<Graph>& graph) {
TORCH_INTERNAL_ASSERT(
+ false,
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
}

void insertPrePackedOps(script::Module& module) {
TORCH_INTERNAL_ASSERT(
+ false,
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
}

void fusePrePackedLinearConvWithClamp(script::Module& module) {
TORCH_INTERNAL_ASSERT(
+ false,
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
}

void FoldPrePackingOps(script::Module& m) {
TORCH_INTERNAL_ASSERT(
+ false,
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
}

@@ -428,6 +432,7 @@ script::Module optimizeForMobile(
const std::set<MobileOptimizerType>& blocklist,
const std::vector<std::string>& preserved_methods) {
TORCH_INTERNAL_ASSERT(
+ false,
"Mobile optimization only available with XNNPACK at the moment. "
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
return module;
diff --git a/torch/csrc/jit/runtime/register_ops_utils.cpp b/torch/csrc/jit/runtime/register_ops_utils.cpp
index 537716e1ad..3bcff0af55 100644
--- a/torch/csrc/jit/runtime/register_ops_utils.cpp
+++ b/torch/csrc/jit/runtime/register_ops_utils.cpp
@@ -182,7 +182,7 @@ IValue tensorToListRecursive(
} else if (inner_result.isBool()) {
result.emplace_back(inner_result.toBool());
} else {
- TORCH_INTERNAL_ASSERT("Unknown return type for tensorToListRecursive");
+ TORCH_INTERNAL_ASSERT(false, "Unknown return type for tensorToListRecursive");
}

data += strides[cur_dim] * element_size;
diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp
index 7909bfa7c9..9e2a51f291 100644
--- a/torch/csrc/distributed/c10d/ProcessGroup.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp
@@ -43,7 +43,7 @@ std::string opTypeToString(OpType opType) {
case OpType::UNKNOWN:
return "UNKNOWN";
default:
- TORCH_INTERNAL_ASSERT("Unknown op type!");
+ TORCH_INTERNAL_ASSERT(false, "Unknown op type!");
}
return "UNKNOWN";
}