Skip to content

Commit 9c3cef4

Browse files
d4l3k authored and pytorchmergebot committed
gloo: support ibverbs in cmake (pytorch#153425)
This updates the gloo submodule in PyTorch to a version that supports the new ibverbs backend that can be used with PyTorch.

Test plan:

```
sudo dnf install rdma-core-devel
USE_GLOO_IBVERBS=ON python setup.py develop
torchrun --nproc_per_node 2 ~/scripts/gloo_ibverbs_test.py
```

```py
"""
run with:

torchrun --nproc_per_node 2 ~/scripts/gloo_ibverbs_test.py
"""

import os

os.environ["GLOO_DEVICE_TRANSPORT"] = "IBVERBS"

import torch
import torch.distributed as dist

dist.init_process_group("gloo")

rank = dist.get_rank()

if rank == 0:
    device = "cpu"
else:
    device = "cuda"

print(device)

t = torch.full((10, 100), fill_value=(rank+1), device=device)
target = torch.full((10, 100), fill_value=3, device=device)

dist.all_reduce(t)

torch.testing.assert_close(t, target)

t = torch.full((10, 100), fill_value=(rank+1), device=device)

if rank == 0:
    dist.send(t, dst=1)
else:
    dist.recv(t, src=0)

torch.testing.assert_close(t, torch.full_like(t, 1))
```

Pull Request resolved: pytorch#153425
Approved by: https://github.com/fduwjj
1 parent dde7058 commit 9c3cef4

File tree

5 files changed

+13
-2
lines changed

5 files changed

+13
-2
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -331,6 +331,9 @@ cmake_dependent_option(
331331
cmake_dependent_option(
332332
USE_GLOO_WITH_OPENSSL "Use Gloo with OpenSSL. Only available if USE_GLOO is on." OFF
333333
"USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
334+
cmake_dependent_option(
335+
USE_GLOO_IBVERBS "Use Gloo with ibverbs backend. Only available if USE_GLOO is on." OFF
336+
"USE_GLOO AND LINUX AND NOT INTERN_BUILD_MOBILE" OFF)
334337
cmake_dependent_option(
335338
USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
336339
cmake_dependent_option(

cmake/Dependencies.cmake

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1217,6 +1217,10 @@ if(USE_GLOO)
12171217
set(GLOO_INSTALL OFF CACHE BOOL "" FORCE)
12181218
set(GLOO_STATIC_OR_SHARED STATIC CACHE STRING "" FORCE)
12191219

1220+
if(USE_GLOO_IBVERBS)
1221+
set(USE_IBVERBS ON)
1222+
endif()
1223+
12201224
# Temporarily override variables to avoid building Gloo tests/benchmarks
12211225
set(__BUILD_TEST ${BUILD_TEST})
12221226
set(__BUILD_BENCHMARK ${BUILD_BENCHMARK})

cmake/Summary.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,6 +188,7 @@ function(caffe2_print_configuration_summary)
188188
message(STATUS " USE_MPI : ${USE_MPI}")
189189
message(STATUS " USE_GLOO : ${USE_GLOO}")
190190
message(STATUS " USE_GLOO_WITH_OPENSSL : ${USE_GLOO_WITH_OPENSSL}")
191+
message(STATUS " USE_GLOO_IBVERBS : ${USE_GLOO_IBVERBS}")
191192
message(STATUS " USE_TENSORPIPE : ${USE_TENSORPIPE}")
192193
endif()
193194
if(NOT "${SELECTED_OP_LIST}" STREQUAL "")

torch/csrc/distributed/c10d/GlooDeviceFactory.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -151,7 +151,10 @@ static std::shared_ptr<::gloo::transport::Device> makeIBVerbsDevice(
151151
const std::string& interface,
152152
const std::string& hostname,
153153
bool lazyInit) {
154-
TORCH_CHECK(hostname.empty(), "ibverbs transport does not support hostname");
154+
if (!hostname.empty()) {
155+
TORCH_WARN(
156+
"ibverbs transport does not support hostname, defaulting to any");
157+
}
155158

156159
TORCH_CHECK(!lazyInit, "transport does not support lazy init");
157160

0 commit comments

Comments
 (0)