Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,10 @@ patches = [
'PyTorch-1.10.0_skip_failing_ops_tests.patch',
'PyTorch-1.10.0_skip_nan_tests_openblas.patch',
'PyTorch-1.10.0_skip_cmake_rpath.patch',
'PyTorch-1.10.0_detect_ucx_cuda.patch',
]
checksums = [
None, # can't add proper SHA256 checksum, because source tarball is created locally after recursive 'git clone'
'd15a2702256e6739cb77c8e58e060c0eecb7340c654e40cbf280c87791ab5dd0', # PyTorch-1.10.0.tar.gz
'b899aa94d9e60f11ee75a706563312ccefa9cf432756c470caa8e623991c8f18', # PyTorch-1.7.0_avoid-nan-in-test-torch.patch
'622cb1eaeadc06e13128a862d9946bcc1f1edd3d02b259c56a9aecc4d5406b8a', # PyTorch-1.7.0_disable-dev-shm-test.patch
# PyTorch-1.7.1_correctly-pass-jit_opt_level.patch
Expand All @@ -51,18 +52,14 @@ checksums = [
'426c9ead1a74b656748d4c8bf8afd4303d8b9f2394ad22b21a845d07c8ca1d12',
# PyTorch-1.10.0_fix-faulty-asserts-and-skip-test.patch
'67152215e4530a9b1d7349fb20864445fd815288f04ab9e96e45c73b2d87827a',
# PyTorch-1.10.0_fix-test-cond-cpu.patch
'51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03',
# PyTorch-1.10.0_fix-vnni-detection.patch
'1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc',
'51f83f5d5ef69656ef35b73f17e0671e70113798421be11ea4c7b56ffcc4da03', # PyTorch-1.10.0_fix-test-cond-cpu.patch
'1f3664c0febfa2a3fc4c0cd3bae185f289716ac0b6c3d7e8fa1cee19ba62b7cc', # PyTorch-1.10.0_fix-vnni-detection.patch
# PyTorch-1.10.0_increase_zero_optimizer_test_tolerance.patch
'e65afb01786f7f030ccb5faada1eb474bb0c418bcadcf1baaa71a4fa2f3f4240',
# PyTorch-1.10.0_skip_failing_ops_tests.patch
'399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab',
# PyTorch-1.10.0_skip_nan_tests_openblas.patch
'7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf',
# PyTorch-1.10.0_skip_cmake_rpath.patch
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448',
'399af94ffcef4a6db5226552c46f11e9b0f0f371b2d7924b9e5764d2281581ab', # PyTorch-1.10.0_skip_failing_ops_tests.patch
'7d3f83e3056d9e47a460790313238f28708beb596cafaa7ae55e374d368bbedf', # PyTorch-1.10.0_skip_nan_tests_openblas.patch
'ac05943bb205623f91ef140aa00869efc5fe844184bd666bebf5405808610448', # PyTorch-1.10.0_skip_cmake_rpath.patch
'aa9d2dc7b090ec40011ad37b884f91fade20d49af7c4a090c1d8a270806f0ae1', # PyTorch-1.10.0_detect_ucx_cuda.patch
]

osdependencies = [OS_PKG_IBVERBS_DEV]
Expand All @@ -74,6 +71,7 @@ builddependencies = [

dependencies = [
('CUDA', '11.3.1', '', True),
('UCX-CUDA', '1.10.0', versionsuffix),
('Ninja', '1.10.2'), # Required for JIT compilation of C++ extensions
('Python', '3.9.5'),
('protobuf', '3.17.3'),
Expand All @@ -87,9 +85,9 @@ dependencies = [
('numactl', '2.0.14'),
('FFmpeg', '4.3.2'),
('Pillow', '8.2.0'),
('cuDNN', '8.2.1.32', '-CUDA-%(cudaver)s', True),
('magma', '2.6.1', '-CUDA-%(cudaver)s'),
('NCCL', '2.10.3', '-CUDA-%(cudaver)s'),
('cuDNN', '8.2.1.32', versionsuffix, True),
('magma', '2.6.1', versionsuffix),
('NCCL', '2.10.3', versionsuffix),
('expecttest', '0.1.3'),
]

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
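Make PyTorch's detection of CUDA-aware OpenMPI also recognise UCX-based builds.
The existing detection relies on the pre-UCX smcuda BTL being listed by ompi_info.
If smcuda is not listed, fall back to checking the output of 'ucx_info -d' for a CUDA-enabled UCX.
Note that 'ucx_info -d' only shows cuda_cpy when run on a node with a GPU available,
so the UCX-based detection only succeeds when CMake is configured on such a node.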

Åke Sandgren, 2021-12-03
diff -ru pytorch.orig/cmake/Dependencies.cmake pytorch/cmake/Dependencies.cmake
--- pytorch.orig/cmake/Dependencies.cmake 2021-11-29 09:13:59.000000000 +0100
+++ pytorch/cmake/Dependencies.cmake 2021-12-03 13:37:47.646865417 +0100
@@ -1072,16 +1072,29 @@
find_program(OMPI_INFO
NAMES ompi_info
HINTS ${MPI_CXX_LIBRARIES}/../bin)
+ set(CUDA_MPI_FOUND False)
if(OMPI_INFO)
execute_process(COMMAND ${OMPI_INFO}
OUTPUT_VARIABLE _output)
if(_output MATCHES "smcuda")
- message(STATUS "Found OpenMPI with CUDA support built.")
+ set(CUDA_MPI_FOUND True)
else()
- message(WARNING "OpenMPI found, but it is not built with CUDA support.")
- set(CAFFE2_FORCE_FALLBACK_CUDA_MPI 1)
+ find_program(UCX_INFO NAMES ucx_info)
+ if(UCX_INFO)
+ execute_process(COMMAND ${UCX_INFO} -d
+ OUTPUT_VARIABLE _output)
+ if(_output MATCHES "cuda_cpy")
+ set(CUDA_MPI_FOUND True)
+ endif()
+ endif()
endif()
endif()
+ if (CUDA_MPI_FOUND)
+ message(STATUS "Found OpenMPI with CUDA support built.")
+ else()
+ message(WARNING "OpenMPI found, but it is not built with CUDA support.")
+ set(CAFFE2_FORCE_FALLBACK_CUDA_MPI 1)
+ endif()
else()
message(WARNING "Not compiling with MPI. Suppress this warning with -DUSE_MPI=OFF")
caffe2_update_option(USE_MPI OFF)