Skip to content

Commit f29fb39

Browse files
authored
dygraph: NCCL init supports host domain names (#28107)
* NCCL init supports both hostnames and IP addresses; test=develop
1 parent 5cd97a1 commit f29fb39

File tree

3 files changed

+15
-2
lines changed

3 files changed

+15
-2
lines changed

paddle/fluid/imperative/nccl_context.cc

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,19 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
100100
serv_addr.sin_family = AF_INET;
101101
serv_addr.sin_port = htons(port);
102102

103-
if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) {
103+
char *ip = NULL;
104+
struct hostent *hp;
105+
if ((hp = gethostbyname(host.c_str())) == NULL) {
106+
PADDLE_THROW(platform::errors::InvalidArgument(
107+
"Fail to get host by name %s.", host));
108+
}
109+
int i = 0;
110+
while (hp->h_addr_list[i] != NULL) {
111+
ip = inet_ntoa(*(struct in_addr *)hp->h_addr_list[i]);
112+
VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip;
113+
break;
114+
}
115+
if (inet_pton(AF_INET, ip, &serv_addr.sin_addr) <= 0) {
104116
PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep));
105117
}
106118

paddle/fluid/imperative/nccl_context.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
// network header files
1717
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
1818
#include <arpa/inet.h>
19+
#include <netdb.h>
1920
#include <netinet/in.h>
2021
#include <stdlib.h>
2122
#include <sys/socket.h>

paddle/fluid/imperative/tests/nccl_context_test.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ namespace imperative = paddle::imperative;
2020
namespace platform = paddle::platform;
2121

2222
imperative::ParallelStrategy GetStrategy(int local_rank) {
23-
std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"};
23+
std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"};
2424
imperative::ParallelStrategy strategy;
2525
strategy.trainer_endpoints_ = eps;
2626
strategy.current_endpoint_ = eps[local_rank];

0 commit comments

Comments
 (0)