From 2ced8d21dc07db07a2eb8333d98e6fdc3ae5ee19 Mon Sep 17 00:00:00 2001 From: shenliang03 Date: Fri, 16 Jul 2021 15:41:21 +0800 Subject: [PATCH 1/3] add wait_server_ready --- python/paddle/distributed/parallel.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index efe747408428a6..8676dbaa6c26ba 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -193,6 +193,12 @@ def _check_var_exists(var_name): elif core.is_compiled_with_xpu(): parallel_helper._set_parallel_ctx( core.BKCLParallelContext(strategy, place)) + + other_endpoints = strategy.trainer_endpoints[:].remove( + strategy.current_endpoint) + if strategy.local_rank == 0: + wait_server_ready(other_endpoints) + parallel_helper._init_parallel_ctx() # 5: init gloo context (step 2: gloo init) From 87e1f16a8239907d856402b0225881c522f5d3fd Mon Sep 17 00:00:00 2001 From: shenliang03 Date: Fri, 16 Jul 2021 15:50:08 +0800 Subject: [PATCH 2/3] fix remove bug --- python/paddle/distributed/parallel.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index 8676dbaa6c26ba..1a4fe21afa577f 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -194,8 +194,8 @@ def _check_var_exists(var_name): parallel_helper._set_parallel_ctx( core.BKCLParallelContext(strategy, place)) - other_endpoints = strategy.trainer_endpoints[:].remove( - strategy.current_endpoint) + other_endpoints = strategy.trainer_endpoints[:] + other_endpoints.remove(strategy.current_endpoint) if strategy.local_rank == 0: wait_server_ready(other_endpoints) From 39dab20c89075e16cc35322996cd1f21ef223c1b Mon Sep 17 00:00:00 2001 From: shenliang03 Date: Fri, 16 Jul 2021 16:12:19 +0800 Subject: [PATCH 3/3] add comment --- paddle/fluid/operators/lookup_table_v2_op_npu.cc | 1 + 1 file changed, 1 
insertion(+) diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index c75ea537216f35..020dbad53076d2 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -40,6 +40,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel { platform::errors::InvalidArgument("npu only accept LoDTensor")); output_t->mutable_data(ctx.GetPlace()); + // Copy ids_t into a vector first to ensure that ids_t is prepared. std::vector ids; TensorToVector(*ids_t, ctx.device_context(), &ids);