Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 11 additions & 1 deletion paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ limitations under the License. */
#include "paddle/fluid/platform/hccl_helper.h"
#endif

// Time budget, in seconds, for the gethostbyname() retry loop in ConnectAddr
// below. The flag is defined in paddle/fluid/platform/flags.cc (default 120).
DECLARE_int32(get_host_by_name_time);

namespace paddle {
namespace operators {

Expand Down Expand Up @@ -226,7 +228,15 @@ static int ConnectAddr(const std::string& ep, const char* head) {

char* ip = NULL;
struct hostent* hp = NULL;
hp = gethostbyname(host.c_str());
// Retry gethostbyname() every 2 seconds until it succeeds or roughly
// FLAGS_get_host_by_name_time seconds have been spent waiting.
for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) {
hp = gethostbyname(host.c_str());
if (hp != NULL) {
break;
}
std::this_thread::sleep_for(std::chrono::seconds(2));
LOG(WARNING) << "gethostbyname " << host.c_str() << " error!";
}
PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));

Expand Down
14 changes: 14 additions & 0 deletions paddle/fluid/platform/flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -606,3 +606,17 @@ DEFINE_bool(check_kernel_launch, false,
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
DEFINE_bool(conv2d_disable_cudnn, false, "Disable cudnn in conv2d");
#endif

/**
* Distributed related FLAG
* Name: FLAGS_get_host_by_name_time
* Since Version: 2.2.0
* Value Range: int32, default=120
* Example: FLAGS_get_host_by_name_time=60 limits the hostname-lookup retries
*          in the ConnectAddr helpers to roughly 60 seconds.
* Note: Approximate upper bound, in seconds, on how long ConnectAddr keeps
*       retrying gethostbyname() for an endpoint's host. The callers retry
*       once every 2 seconds while 2 * attempt < this value, so a value <= 0
*       means no lookup is attempted and the subsequent null check fails.
*       The flag is listed in read_env_flags in python/paddle/fluid/__init__.py,
*       so it can be supplied through the environment.
*/
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_XPU) || \
defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_HIP)
DEFINE_int32(get_host_by_name_time, 120,
"The maximum time for get host by name time");
#endif
13 changes: 12 additions & 1 deletion paddle/fluid/platform/gen_comm_id_helper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@ limitations under the License. */
#include "paddle/fluid/platform/collective_helper.h"
#endif

// Time budget, in seconds, for the gethostbyname() retry loop in ConnectAddr
// below. The flag is defined in paddle/fluid/platform/flags.cc (default 120).
DECLARE_int32(get_host_by_name_time);

namespace paddle {
namespace platform {

Expand Down Expand Up @@ -236,7 +238,16 @@ static int ConnectAddr(const std::string& ep, const CommHead head) {

char* ip = NULL;
struct hostent* hp = NULL;
hp = gethostbyname(host.c_str());

// Retry gethostbyname() every 2 seconds until it succeeds or roughly
// FLAGS_get_host_by_name_time seconds have been spent waiting.
for (int i = 0; 2 * i < FLAGS_get_host_by_name_time; i++) {
hp = gethostbyname(host.c_str());
if (hp != NULL) {
break;
}
std::this_thread::sleep_for(std::chrono::seconds(2));
LOG(WARNING) << "gethostbyname " << host.c_str() << " error!";
}
PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument(
"Fail to get host by name %s.", host));

Expand Down
2 changes: 2 additions & 0 deletions python/paddle/fluid/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ def __bootstrap__():
'local_exe_sub_scope_limit',
'gpu_memory_limit_mb',
'conv2d_disable_cudnn',
'get_host_by_name_time',
]

if core.is_compiled_with_npu():
Expand All @@ -246,6 +247,7 @@ def __bootstrap__():
'reallocate_gpu_memory_in_mb',
'gpu_memory_limit_mb',
'npu_config_path',
'get_host_by_name_time',
]

core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
Expand Down