@@ -0,0 +1,48 @@
easyblock = 'PythonBundle'

name = 'Horovod'
version = '0.23.0'
local_pt_version = '1.10.0'
local_cuda_suffix = '-CUDA-%(cudaver)s'
versionsuffix = local_cuda_suffix + '-PyTorch-%s' % local_pt_version

homepage = 'https://github.com/uber/horovod'
description = "Horovod is a distributed training framework for TensorFlow, Keras, PyTorch, and MXNet."

toolchain = {'name': 'foss', 'version': '2021a'}

builddependencies = [
('CMake', '3.20.1'),
('flatbuffers', '2.0.0'),
]
dependencies = [
('Python', '3.9.5'),
('PyYAML', '5.4.1'),
('CUDA', '11.3.1', '', True),
('NCCL', '2.10.3', local_cuda_suffix),
('PyTorch', local_pt_version, local_cuda_suffix),
]

use_pip = True
sanity_pip_check = True

preinstallopts = 'HOROVOD_WITH_MPI=1 HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_GPU_BROADCAST=NCCL '
preinstallopts += 'HOROVOD_WITHOUT_TENSORFLOW=1 HOROVOD_WITH_PYTORCH=1 HOROVOD_WITHOUT_MXNET=1 '
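# The variables above are picked up by Horovod's setup.py during the pip install:
# MPI support is required, NCCL backs the GPU allreduce and broadcast collectives,
# and only the PyTorch frontend is built (TensorFlow and MXNet are skipped).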

parallel = 1 # Bug in CMake causes a race condition on horovod_cuda_kernels_generated_cuda_kernels.cu.o.NVCC-depend

exts_list = [
('cloudpickle', '2.0.0', {
'checksums': ['5cd02f3b417a783ba84a4ec3e290ff7929009fe51f6405423cfccfadd43ba4a4'],
}),
('horovod', version, {
'checksums': ['72ab3e5f59df6a000473999937e52e6831ad1a5e4e7bd23885a04bcdd4d8691c'],
}),
]

sanity_check_paths = {
'files': ['bin/horovodrun'],
'dirs': ['lib/python%(pyshortver)s/site-packages'],
}

moduleclass = 'tools'
@@ -0,0 +1,119 @@
name = 'PyTorch'
version = '1.10.0'
versionsuffix = '-CUDA-%(cudaver)s'

homepage = 'https://pytorch.org/'
description = """Tensors and Dynamic neural networks in Python with strong GPU acceleration.
PyTorch is a deep learning framework that puts Python first."""

toolchain = {'name': 'foss', 'version': '2021a'}

sources = [{
'filename': '%(name)s-%(version)s.tar.gz',
'git_config': {
'url': 'https://github.com/pytorch',
'repo_name': 'pytorch',
'tag': 'v%(version)s',
'recursive': True,
},
}]
patches = [
'PyTorch-1.7.0_avoid-nan-in-test-torch.patch',
'PyTorch-1.7.0_disable-dev-shm-test.patch',
'PyTorch-1.7.1_correctly-pass-jit_opt_level.patch',
'PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch',
'PyTorch-1.8.1_increase-distributed-test-timeout.patch',
'PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch',
'PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch',
'PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch',
'PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch',
'PyTorch-1.10.0_fix-test-cond-cpu.patch',
'PyTorch-1.10.0_fix-vnni-detection.patch',
'PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch',
'PyTorch-1.10.0_skip_failing_ops_tests.patch',
'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
]
checksums = [
None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
# PyTorch-1.7.1_correctly-pass-jit_opt_level.patch
'd4d967d47f8a6172fcbf57f0a61835482968850967c4fdb01108b720696a988d',
'89ac7a8e9e7df2e64cf8404fe3a279f5e9b759fee41c9de3aaff9c22f385c2c6', # PyTorch-1.8.1_dont-use-gpu-ccc-in-test.patch
# PyTorch-1.8.1_increase-distributed-test-timeout.patch
'7a6e512274f0b8673f4f207a5bc53387d88be7e79833f42d20365668b2118071',
# PyTorch-1.9.0_limit-world-size-for-zero-redundancy-opt-test.patch
'ff573660913ce055e24cfd194ce747ba5685091c631cfd443eae2a99d56b57ea',
# PyTorch-1.10.0_fix-test-dataloader-fixed-affinity.patch
'313dca681f45ce3bc7c4557fdcdcbe0b77216d2c708fa30a2ec0e22c44876707',
# PyTorch-1.10.0_fix-alias-violation-in-bitwise-ops.patch
'426c9ead1a74b656748d4c8bf8afd4303d8b9f2394ad22b21a845d07c8ca1d12',
# PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch
'67152215e4530a9b1d7349fb20864445fd815288f04ab9e96e45c73b2d87827a',
# PyTorch-1.10.0_fix-test-cond-cpu.patch
'51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03',
# PyTorch-1.10.0_fix-vnni-detection.patch
'1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc',
# PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch
'e65afb01786f7f030ccb5faada1eb474bb0c418bcadcf1baaa71a4fa2f3f4240',
# PyTorch-1.10.0_skip_failing_ops_tests.patch
'399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab',
# PyTorch-1.10.0_skip_nan_tests_openblas.patch
'7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf',
]

osdependencies = [OS_PKG_IBVERBS_DEV]

builddependencies = [
('CMake', '3.20.1'),
('hypothesis', '6.13.1'),
]

dependencies = [
('CUDA', '11.3.1', '', True),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.5'),
('protobuf', '3.17.3'),
('protobuf-python', '3.17.3'),
('pybind11', '2.6.2'),
('SciPy-bundle', '2021.05'),
('typing-extensions', '3.10.0.0'),
('PyYAML', '5.4.1'),
('MPFR', '4.1.0'),
('GMP', '6.2.1'),
('numactl', '2.0.14'),
('FFmpeg', '4.3.2'),
('Pillow', '8.2.0'),
('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True),
('magma', '2.6.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
('expecttest', '0.1.3'),
]

# default CUDA compute capabilities to use (override via --cuda-compute-capabilities)
cuda_compute_capabilities = ['3.5', '3.7', '5.2', '6.0', '6.1', '7.0', '7.2', '7.5', '8.0', '8.6']

custom_opts = ["USE_CUPTI_SO=1"]
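# USE_CUPTI_SO=1 links the shared CUPTI library rather than the static one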

excluded_tests = {
'': [
# Bad tests: https://github.com/pytorch/pytorch/issues/60260
'distributed/elastic/utils/distributed_test',
'distributed/elastic/multiprocessing/api_test',
        # These tests fail on A10s at the very least; they hang no matter how long the timeout is set.
# Possibly related to NCCL 2.8.3: https://docs.nvidia.com/deeplearning/nccl/release-notes/rel_2-8-3.html
# 'distributed/test_distributed_fork',
'distributed/test_distributed_spawn',
# Fails on A10s: https://github.com/pytorch/pytorch/issues/63079
'test_optim',
        # Tests from this suite often time out. The process group backend is deprecated anyway.
# 'distributed/rpc/test_process_group_agent',
]
}

runtest = 'cd test && PYTHONUNBUFFERED=1 %(python)s run_test.py --continue-through-error --verbose %(excluded_tests)s'

sanity_check_commands = ["python -c 'import caffe2.python'"]
tests = ['PyTorch-check-cpp-extension.py']

moduleclass = 'devel'
@@ -0,0 +1,31 @@
A reinterpret_cast to an unrelated type is undefined behavior.
This causes real issues due to misoptimizations on at least GCC 10.2 on POWER.
See https://github.com/pytorch/pytorch/issues/58031

Author: Alexander Grund (TU Dresden)

Adapted for PyTorch 1.10, where this code now lives in vec_base.h and is templated.
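
For illustration, a minimal standalone sketch (made-up names, not PyTorch code) of why the old pattern is undefined and what the patch does instead: reading T storage through an unrelated intmax_t lvalue violates strict aliasing, while copying the bytes into a properly typed buffer first (as Vectorized<T>::store does in the patched code) is well-defined.

#include <cstdint>
#include <cstring>

// UB: float storage is read through an unrelated intmax_t lvalue,
// so the compiler is free to misoptimize the load.
std::intmax_t bad_read(const float* data) {
    const std::intmax_t* p = reinterpret_cast<const std::intmax_t*>(data);
    return p[0];  // strict-aliasing violation
}

// Well-defined: the bytes are copied into a properly typed buffer first
// (assumes data points to at least sizeof(std::intmax_t) bytes).
std::intmax_t good_read(const float* data) {
    std::intmax_t buffer;
    std::memcpy(&buffer, data, sizeof(buffer));  // memcpy may alias anything
    return buffer;
}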

diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h
index 697996ab8e..1663ae239a 100644
--- a/aten/src/ATen/cpu/vec/vec_base.h
+++ b/aten/src/ATen/cpu/vec/vec_base.h
@@ -701,12 +701,14 @@ inline Vectorized<T> operator^(const Vectorized<T>& a, const Vectorized<T>& b) {

template<class T, typename Op>
static inline Vectorized<T> bitwise_binary_op(const Vectorized<T> &a, const Vectorized<T> &b, Op op) {
- static constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t);
+ constexpr uint32_t element_no = VECTOR_WIDTH / sizeof(intmax_t);
+ __at_align__ intmax_t buffer_a[element_no];
+ __at_align__ intmax_t buffer_b[element_no];
__at_align__ intmax_t buffer[element_no];
- const intmax_t *a_ptr = reinterpret_cast<const intmax_t*>((const T*) a);
- const intmax_t *b_ptr = reinterpret_cast<const intmax_t*>((const T*) b);
+ a.store(buffer_a);
+ b.store(buffer_b);
for (uint32_t i = 0U; i < element_no; ++ i) {
- buffer[i] = op(a_ptr[i], b_ptr[i]);
+ buffer[i] = op(buffer_a[i], buffer_b[i]);
}
return Vectorized<T>::loadu(buffer);
}
@@ -0,0 +1,188 @@
From: Alexander Grund <[email protected]>
Date: Tue, 18 May 2021 15:08:41 +0200
Subject: [PATCH 1/2] Fix usage of TORCH_INTERNAL_ASSERT with message

Using only a string as the argument to TORCH_INTERNAL_ASSERT will never
trigger a failure, because a string is always a truthy value.
This hides actual bugs and makes users and devs think everything worked
when it did not.
Change these call sites to TORCH_INTERNAL_ASSERT(false, "msg").
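
For illustration, a minimal sketch with a hypothetical MY_ASSERT macro (not the real TORCH_INTERNAL_ASSERT definition, though it treats its first argument the same way): the first argument is the condition, and a string literal decays to a non-null pointer, so an assert given only a message can never fire.

#include <cstdio>
#include <cstdlib>

// First argument is the condition; any extra arguments are ignored here.
#define MY_ASSERT(cond, ...)                                    \
    do {                                                        \
        if (!(cond)) {                                          \
            std::fprintf(stderr, "assert failed: %s\n", #cond); \
            std::abort();                                       \
        }                                                       \
    } while (0)

int main() {
    MY_ASSERT("unsupported dtype");         // never fires: the string is truthy
    MY_ASSERT(false, "unsupported dtype");  // fires as intended
    return 0;
}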

Subject: [PATCH 2/2] Add missing skip decorator for
test_preserve_bundled_inputs_methods

This test uses optimize_for_mobile, which requires NNPACK to work.

diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp
index c4edadb03e..e889cd03a8 100644
--- a/aten/src/ATen/native/BinaryOps.cpp
+++ b/aten/src/ATen/native/BinaryOps.cpp
@@ -106,6 +106,7 @@ Tensor& add_relu_impl(
max_val = std::numeric_limits<double>::max();
} else {
TORCH_INTERNAL_ASSERT(
+ false,
"Unsupported datatype for add_relu:", self.dtype().name());
}

diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
index 050fdce2ca..7e72263917 100644
--- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
+++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp
@@ -780,6 +780,7 @@ class QEmbeddingBag final {
include_last_offset);
} else {
TORCH_INTERNAL_ASSERT(
+ false,
"Currently only support 8-bit embedding_bag quantization");
}
}
@@ -808,6 +809,7 @@ class QEmbedding final {

} else {
TORCH_INTERNAL_ASSERT(
+ false,
"Currently only support 8-bit embedding quantization");
}
return output;
diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
index 6de646acfe..66341c959d 100644
--- a/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
+++ b/aten/src/ATen/native/quantized/cpu/qnnpack_utils.h
@@ -131,6 +131,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {

if (conv_p.per_channel && conv_p.ukernel_type == pytorch_qnnp_ukernel_type_xzp_gemm) {
TORCH_INTERNAL_ASSERT(
+ false,
"Per channel quantized weights are not supported for XZP kernels");
}

@@ -140,6 +141,7 @@ struct PackedConvWeightsQnnp : public ConvPackedParamsBase<kSpatialDim> {
static_cast<pytorch_qnnp_operator_t>(calloc(1, sizeof(struct pytorch_qnnp_operator)));
if (convolution == nullptr) {
TORCH_INTERNAL_ASSERT(
+ false,
"failed to allocate %zu bytes for pytorch_qnnp_operator structure",
sizeof(struct pytorch_qnnp_operator));
}
@@ -406,7 +408,7 @@ std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
128);
}
} else {
- TORCH_INTERNAL_ASSERT("Unsupported quantization scheme.");
+ TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
}
at::Tensor weight_scales =
at::empty(
@@ -423,7 +425,7 @@ std::pair<std::vector<uint8_t>, at::Tensor> make_zero_points_and_scales_tensor(
weight_contig.q_per_channel_scales()[i].item<float>();
}
} else {
- TORCH_INTERNAL_ASSERT("Unsupported quantization scheme.");
+ TORCH_INTERNAL_ASSERT(false, "Unsupported quantization scheme.");
}
for (int i = num_output_channels; i < num_output_channels_padded; ++i) {
weight_scales_data[i] = 1.f;
diff --git a/test/test_mobile_optimizer.py b/test/test_mobile_optimizer.py
index 11ef019a26..7b5ac1a239 100644
--- a/test/test_mobile_optimizer.py
+++ b/test/test_mobile_optimizer.py
@@ -269,6 +269,9 @@ class TestOptimizer(TestCase):
bi_module_lint_list = generate_mobile_module_lints(bi_module)
self.assertEqual(len(bi_module_lint_list), 0)

+ @unittest.skipUnless(torch.backends.xnnpack.enabled,
+ " XNNPACK must be enabled for these tests."
+ " Please build with USE_XNNPACK=1.")
def test_preserve_bundled_inputs_methods(self):
class MyBundledInputModule(torch.nn.Module):
def __init__(self):
diff --git a/torch/csrc/jit/api/module.cpp b/torch/csrc/jit/api/module.cpp
index 38592b80b9..8f9508321b 100644
--- a/torch/csrc/jit/api/module.cpp
+++ b/torch/csrc/jit/api/module.cpp
@@ -305,7 +305,7 @@ void Module::train(bool on) {
if (auto slot = m._ivalue()->type()->findAttributeSlot("training")) {
m._ivalue()->setSlot(*slot, on);
} else {
- TORCH_INTERNAL_ASSERT("'training' attribute not found");
+ TORCH_INTERNAL_ASSERT(false, "'training' attribute not found");
}
}
}
diff --git a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
index 53a13b6cf1..93c2b5a7da 100644
--- a/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
+++ b/torch/csrc/jit/passes/quantization/insert_quant_dequant.cpp
@@ -304,6 +304,7 @@ Node* insertEmbeddingBagOps(Node* observer, const std::string& op_name) {
quant_fn = "quantized::embedding_bag_byte_rowwise_offsets";
} else {
TORCH_INTERNAL_ASSERT(
+ false,
"Graph Mode Quantization currently supports 4-bit and 8-bit embedding bag quantization.");
}

diff --git a/torch/csrc/jit/passes/xnnpack_rewrite.cpp b/torch/csrc/jit/passes/xnnpack_rewrite.cpp
index 3be480068c..2289f028ae 100644
--- a/torch/csrc/jit/passes/xnnpack_rewrite.cpp
+++ b/torch/csrc/jit/passes/xnnpack_rewrite.cpp
@@ -405,21 +405,25 @@ script::Module optimizeForMobile(

void insertPrePackedOps(std::shared_ptr<Graph>& graph) {
TORCH_INTERNAL_ASSERT(
+ false,
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
}

void insertPrePackedOps(script::Module& module) {
TORCH_INTERNAL_ASSERT(
+ false,
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
}

void fusePrePackedLinearConvWithClamp(script::Module& module) {
TORCH_INTERNAL_ASSERT(
+ false,
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
}

void FoldPrePackingOps(script::Module& m) {
TORCH_INTERNAL_ASSERT(
+ false,
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
}

@@ -428,6 +432,7 @@ script::Module optimizeForMobile(
const std::set<MobileOptimizerType>& blocklist,
const std::vector<std::string>& preserved_methods) {
TORCH_INTERNAL_ASSERT(
+ false,
"Mobile optimization only available with XNNPACK at the moment. "
"XNNPACK is not enabled. Please build with USE_XNNPACK=1");
return module;
diff --git a/torch/csrc/jit/runtime/register_ops_utils.cpp b/torch/csrc/jit/runtime/register_ops_utils.cpp
index 537716e1ad..3bcff0af55 100644
--- a/torch/csrc/jit/runtime/register_ops_utils.cpp
+++ b/torch/csrc/jit/runtime/register_ops_utils.cpp
@@ -182,7 +182,7 @@ IValue tensorToListRecursive(
} else if (inner_result.isBool()) {
result.emplace_back(inner_result.toBool());
} else {
- TORCH_INTERNAL_ASSERT("Unknown return type for tensorToListRecursive");
+ TORCH_INTERNAL_ASSERT(false, "Unknown return type for tensorToListRecursive");
}

data += strides[cur_dim] * element_size;
diff --git a/torch/csrc/distributed/c10d/ProcessGroup.cpp b/torch/csrc/distributed/c10d/ProcessGroup.cpp
index 7909bfa7c9..9e2a51f291 100644
--- a/torch/csrc/distributed/c10d/ProcessGroup.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroup.cpp
@@ -43,7 +43,7 @@ std::string opTypeToString(OpType opType) {
case OpType::UNKNOWN:
return "UNKNOWN";
default:
- TORCH_INTERNAL_ASSERT("Unknown op type!");
+ TORCH_INTERNAL_ASSERT(false, "Unknown op type!");
}
return "UNKNOWN";
}