From fe72cf7ed27cc83f48c0021fc423e4200b4660ad Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 16:15:52 +0800 Subject: [PATCH 1/9] add --- python/paddle/distributed/fleet/__init__.py | 2 ++ python/paddle/distributed/fleet/base/fleet_base.py | 6 ++++++ python/paddle/distributed/fleet/base/role_maker.py | 12 ++++++++++++ python/paddle/distributed/fleet/launch_utils.py | 11 ++++++++++- .../fleet/meta_optimizers/ascend/ascend_optimizer.py | 2 +- 5 files changed, 31 insertions(+), 2 deletions(-) diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 970a932fcc0bce..51ac5f1523caa7 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -44,6 +44,8 @@ rank=fleet.worker_index nranks=fleet.worker_num world_size=fleet.worker_num +current_worker_device_id=fleet.current_worker_device_id +worker_device_ids=fleet.worker_device_ids rank_in_node=fleet.rank_in_node is_worker = fleet.is_worker worker_endpoints = fleet.worker_endpoints diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 547fe76063cdc4..9b204107f6ceb0 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -294,6 +294,12 @@ def node_num(self): def rank_in_node(self): return self._role_maker._get_rank_in_node() + def current_worker_device_id(self): + return self._role_maker._get_current_worker_device_id() + + def worker_device_ids(self): + return self._role_maker._get_worker_device_ids() + def is_worker(self): """ Check whether the node is an instance of worker. diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 3da3e546352162..5c2877173d12db 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -627,6 +627,16 @@ def _get_rank_in_node(self): self._generate_role() return self._rank_in_node + def _get_current_worker_device_id(self): + if not self._role_is_generated: + self._generate_role() + return self._current_worker_device_id + + def _get_worker_device_ids(self): + if not self._role_is_generated: + self._generate_role() + return self._worker_device_ids + def _get_trainer_endpoints(self): """ get endpoint of all trainers @@ -788,6 +798,8 @@ def _collective_env(self): self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) self._rank_in_node = os.getenv("PADDLE_RANK_IN_NODE") + self._current_worker_device_id=os.getenv("PADDLE_CURRENT_WORKER_DEVICE_ID") + self._worker_device_ids=os.getenv("PADDLE_WORKER_DEVICE_IDS") def _gloo_init(self): # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 93e39249059323..b6224508320e6e 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -98,6 +98,13 @@ def trainers_endpoints(self): r.append(t.endpoint) return r + def worker_device_ids(self): + r = [] + for pod in self.pods: + for t in pod.trainers: + r.append(t.accelerators) + return r + def pods_endpoints(self): r = [] for pod in self.pods: @@ -459,7 +466,9 @@ def start_local_trainers(cluster, "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": 
",".join(cluster.trainers_endpoints()), - "PADDLE_RANK_IN_NODE": str(idx) + "PADDLE_RANK_IN_NODE": str(idx), + "PADDLE_CURRENT_WORKER_DEVICE_ID":",".join(t.accelerators), + "PADDLE_WORKER_DEVICE_IDS":",".join(cluster.worker_device_ids()), } if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU: diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index d6ad1b2f2d0cd7..f0dd98f1794fb6 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -150,7 +150,7 @@ def minimize(self, # Config about Graph Engine can be found in https://support.huaweicloud.com/ config = { - "ge.exec.deviceId": str(fleet.rank_in_node()), + "ge.exec.deviceId": str(fleet.current_worker_device_id()), "ge.graphRunMode": "1", "ge.exec.precision_mode": "must_keep_origin_dtype", # if multi mode From 0e7b7558d453291892b97c9c2a721e8a5ee55a8e Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 16:50:45 +0800 Subject: [PATCH 2/9] fix device --- .../tests/unittests/ascend_multi_process_collective.py | 6 ++++-- .../fluid/tests/unittests/test_fleet_launch_ascend.sh | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py index 435af572973329..ab2957c6a7c11b 100644 --- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py +++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py @@ -23,9 +23,11 @@ def train(prefix): current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env trainers_num = len(worker_endpoints.split(',')) + device_ids=os.getenv("PADDLE_WORKER_DEVICE_IDS") + current_device_id=os.getenv("PADDLE_CURRENT_WORKER_DEVICE_ID") - details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\ - .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id) + details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ + .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id) print(details) with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), "w") as f: diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh index 233fe7f7f25b3b..d0a80b3426ec3c 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -30,8 +30,8 @@ export TRAINER_PORTS_NUM=2 distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend -str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0" -str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1" +str1="selected_accelerators:0 
worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0" +str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:0" file_0="multi_process_fleetlaunchascend.check_0.log" file_1="multi_process_fleetlaunchascend.check_1.log" From 46bc662732e8e73cab62b8159867360ce4de07b7 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 17:05:27 +0800 Subject: [PATCH 3/9] fix device --- python/paddle/distributed/fleet/__init__.py | 4 ++-- python/paddle/distributed/fleet/base/fleet_base.py | 8 ++++---- python/paddle/distributed/fleet/base/role_maker.py | 12 ++++++------ python/paddle/distributed/fleet/launch_utils.py | 6 +++--- .../fleet/meta_optimizers/ascend/ascend_optimizer.py | 2 +- .../unittests/ascend_multi_process_collective.py | 4 ++-- 6 files changed, 18 insertions(+), 18 deletions(-) diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 51ac5f1523caa7..2afc1faec9c229 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -44,8 +44,8 @@ rank=fleet.worker_index nranks=fleet.worker_num world_size=fleet.worker_num -current_worker_device_id=fleet.current_worker_device_id -worker_device_ids=fleet.worker_device_ids +current_worker_accelerator_id=fleet.current_worker_accelerator_id +worker_accelerator_ids=fleet.worker_accelerator_ids rank_in_node=fleet.rank_in_node is_worker = fleet.is_worker worker_endpoints = fleet.worker_endpoints diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 9b204107f6ceb0..27eff3a626f73b 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -294,11 +294,11 @@ def node_num(self): def rank_in_node(self): return self._role_maker._get_rank_in_node() - def current_worker_device_id(self): - return self._role_maker._get_current_worker_device_id() + def current_worker_accelerator_id(self): + return self._role_maker._get_current_worker_accelerator_id() - def worker_device_ids(self): - return self._role_maker._get_worker_device_ids() + def worker_accelerator_ids(self): + return self._role_maker._get_worker_accelerator_ids() def is_worker(self): """ diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index 5c2877173d12db..cd4ee873aca774 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -627,15 +627,15 @@ def _get_rank_in_node(self): self._generate_role() return self._rank_in_node - def _get_current_worker_device_id(self): + def _get_current_worker_accelerator_id(self): if not self._role_is_generated: self._generate_role() - return self._current_worker_device_id + return self._current_worker_accelerator_id - def _get_worker_device_ids(self): + def _get_worker_accelerator_ids(self): if not self._role_is_generated: self._generate_role() - return self._worker_device_ids + return self._worker_accelerator_ids def _get_trainer_endpoints(self): """ @@ -798,8 +798,8 @@ def _collective_env(self): self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) self._rank_in_node = os.getenv("PADDLE_RANK_IN_NODE") - 
self._current_worker_device_id=os.getenv("PADDLE_CURRENT_WORKER_DEVICE_ID") - self._worker_device_ids=os.getenv("PADDLE_WORKER_DEVICE_IDS") + self._current_worker_accelerator_id=os.getenv("PADDLE_CURRENT_WORK_ACCLERATOR_ID") + self._worker_accelerator_ids=os.getenv("PADDLE_WORK_ACCLERATOR_IDS") def _gloo_init(self): # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index b6224508320e6e..5cdcd6e614dd12 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -98,7 +98,7 @@ def trainers_endpoints(self): r.append(t.endpoint) return r - def worker_device_ids(self): + def worker_accelerator_ids(self): r = [] for pod in self.pods: for t in pod.trainers: @@ -467,8 +467,8 @@ def start_local_trainers(cluster, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "PADDLE_RANK_IN_NODE": str(idx), - "PADDLE_CURRENT_WORKER_DEVICE_ID":",".join(t.accelerators), - "PADDLE_WORKER_DEVICE_IDS":",".join(cluster.worker_device_ids()), + "PADDLE_CURRENT_WORK_ACCLERATOR_ID":",".join(t.accelerators), + "PADDLE_WORK_ACCLERATOR_IDS":",".join(cluster.worker_accelerator_ids()), } if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU: diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index f0dd98f1794fb6..3f93ec41d6503e 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -150,7 +150,7 @@ def minimize(self, # Config about Graph Engine can be found in https://support.huaweicloud.com/ config = { - "ge.exec.deviceId": str(fleet.current_worker_device_id()), + "ge.exec.deviceId": str(fleet.current_worker_accelerator_id()), "ge.graphRunMode": "1", "ge.exec.precision_mode": "must_keep_origin_dtype", # if multi mode diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py index ab2957c6a7c11b..28422efad91d75 100644 --- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py +++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py @@ -23,8 +23,8 @@ def train(prefix): current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env trainers_num = len(worker_endpoints.split(',')) - device_ids=os.getenv("PADDLE_WORKER_DEVICE_IDS") - current_device_id=os.getenv("PADDLE_CURRENT_WORKER_DEVICE_ID") + device_ids=os.getenv("PADDLE_WORK_ACCLERATOR_IDS") + current_device_id=os.getenv("PADDLE_CURRENT_WORK_ACCLERATOR_ID") details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id) From c3f172161a1cb84e6ff5c33daa1f54b56074bf58 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 17:14:57 +0800 Subject: [PATCH 4/9] add envs --- .../paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh index 
d0a80b3426ec3c..bd389e30f77cd8 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -30,8 +30,8 @@ export TRAINER_PORTS_NUM=2 distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend -str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0" -str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:0" +str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1,0,1,0,1 device_id:0" +str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1,0,1,0,1 device_id:1" file_0="multi_process_fleetlaunchascend.check_0.log" file_1="multi_process_fleetlaunchascend.check_1.log" From 6e662a23ff21c5d142cd9e3eb9aca995ea0ed8db Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 17:38:53 +0800 Subject: [PATCH 5/9] add envs --- python/paddle/distributed/fleet/launch_utils.py | 4 +++- .../paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 5cdcd6e614dd12..3fce264aac49bc 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -461,6 +461,8 @@ def start_local_trainers(cluster, procs = [] for idx, t in enumerate(pod.trainers): + ids=cluster.worker_accelerator_ids() + res = [':'.join(ele) for ele in ids] proc_env = { "PADDLE_TRAINER_ID": "%d" % t.rank, "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, @@ -468,7 +470,7 @@ def start_local_trainers(cluster, "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "PADDLE_RANK_IN_NODE": str(idx), "PADDLE_CURRENT_WORK_ACCLERATOR_ID":",".join(t.accelerators), - "PADDLE_WORK_ACCLERATOR_IDS":",".join(cluster.worker_accelerator_ids()), + "PADDLE_WORK_ACCLERATOR_IDS":",".join(res), } if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU: diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh index bd389e30f77cd8..0960083abf28ec 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_ascend.sh @@ -30,8 +30,8 @@ export TRAINER_PORTS_NUM=2 distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog" python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend -str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1,0,1,0,1 device_id:0" -str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 
current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1,0,1,0,1 device_id:1" +str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0" +str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1" file_0="multi_process_fleetlaunchascend.check_0.log" file_1="multi_process_fleetlaunchascend.check_1.log" From 7b80babb3d0c9671c732b5cdd831a0c1d8cff41a Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 17:46:26 +0800 Subject: [PATCH 6/9] add envs --- python/paddle/distributed/fleet/launch_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 3fce264aac49bc..67c1bf979ed6ae 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -459,17 +459,17 @@ def start_local_trainers(cluster, current_env.pop("http_proxy", None) current_env.pop("https_proxy", None) + ids=cluster.worker_accelerator_ids() + res = [':'.join(ele) for ele in ids] procs = [] for idx, t in enumerate(pod.trainers): - ids=cluster.worker_accelerator_ids() - res = [':'.join(ele) for ele in ids] proc_env = { "PADDLE_TRAINER_ID": "%d" % t.rank, "PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "PADDLE_RANK_IN_NODE": str(idx), - "PADDLE_CURRENT_WORK_ACCLERATOR_ID":",".join(t.accelerators), + "PADDLE_CURRENT_WORK_ACCLERATOR_ID":":".join(t.accelerators), "PADDLE_WORK_ACCLERATOR_IDS":",".join(res), } From aab0cdb5d365535b601676906bdc9b3a47842f1e Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 17:57:12 +0800 Subject: [PATCH 7/9] add envs --- python/paddle/distributed/fleet/launch_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 67c1bf979ed6ae..69382ec26e4d07 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -469,7 +469,7 @@ def start_local_trainers(cluster, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "PADDLE_RANK_IN_NODE": str(idx), - "PADDLE_CURRENT_WORK_ACCLERATOR_ID":":".join(t.accelerators), + "PADDLE_CURRENT_WORK_ACCLERATOR_ID":",".join(t.accelerators), "PADDLE_WORK_ACCLERATOR_IDS":",".join(res), } From 2fc0cf560357b72310a039826f55ea97f7237132 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 19:55:48 +0800 Subject: [PATCH 8/9] fix device --- python/paddle/distributed/fleet/__init__.py | 10 +++++++--- .../distributed/fleet/base/fleet_base.py | 12 ++++++------ .../distributed/fleet/base/role_maker.py | 18 +++++++++--------- .../paddle/distributed/fleet/launch_utils.py | 4 ++-- .../meta_optimizers/ascend/ascend_optimizer.py | 2 +- 5 files changed, 25 insertions(+), 21 deletions(-) diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 2afc1faec9c229..fff89b07535864 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ 
b/python/paddle/distributed/fleet/__init__.py @@ -44,9 +44,13 @@ rank=fleet.worker_index nranks=fleet.worker_num world_size=fleet.worker_num -current_worker_accelerator_id=fleet.current_worker_accelerator_id -worker_accelerator_ids=fleet.worker_accelerator_ids -rank_in_node=fleet.rank_in_node +# device id in current trainer +local_device_id=fleet.local_device_id +# device ids in world +world_device_ids=fleet.world_device_ids +# rank in node +local_rank=fleet.local_rank +rank_in_node=local_rank is_worker = fleet.is_worker worker_endpoints = fleet.worker_endpoints server_num = fleet.server_num diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 27eff3a626f73b..17d9907213b4f1 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -291,14 +291,14 @@ def worker_num(self): def node_num(self): return self._role_maker._get_node_num() - def rank_in_node(self): - return self._role_maker._get_rank_in_node() + def local_rank(self): + return self._role_maker._get_local_rank() - def current_worker_accelerator_id(self): - return self._role_maker._get_current_worker_accelerator_id() + def local_device_id(self): + return self._role_maker._get_local_device_id() - def worker_accelerator_ids(self): - return self._role_maker._get_worker_accelerator_ids() + def world_device_ids(self): + return self._role_maker._get_world_device_ids() def is_worker(self): """ diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index cd4ee873aca774..fa891cb4f08e31 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -622,20 +622,20 @@ def _get_node_num(self): self._generate_role() return self._nodes_num - def _get_rank_in_node(self): + def _get_local_rank(self): if not self._role_is_generated: self._generate_role() - return self._rank_in_node + return self._local_rank - def _get_current_worker_accelerator_id(self): + def _get_local_device_id(self): if not self._role_is_generated: self._generate_role() - return self._current_worker_accelerator_id + return self._local_device_id - def _get_worker_accelerator_ids(self): + def _get_world_device_ids(self): if not self._role_is_generated: self._generate_role() - return self._worker_accelerator_ids + return self._world_device_ids def _get_trainer_endpoints(self): """ @@ -797,9 +797,9 @@ def _collective_env(self): self._trainers_num = len(self._worker_endpoints) self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) - self._rank_in_node = os.getenv("PADDLE_RANK_IN_NODE") - self._current_worker_accelerator_id=os.getenv("PADDLE_CURRENT_WORK_ACCLERATOR_ID") - self._worker_accelerator_ids=os.getenv("PADDLE_WORK_ACCLERATOR_IDS") + self._local_rank = os.getenv("PADDLE_RANK_IN_NODE") + self._local_device_id=os.getenv("PADDLE_CURRENT_WORK_ACCLERATOR_ID") + self._world_device_ids=os.getenv("PADDLE_WORK_ACCLERATOR_IDS") def _gloo_init(self): # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 69382ec26e4d07..30e101ef0f47b2 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -98,7 +98,7 @@ def trainers_endpoints(self): r.append(t.endpoint) return r - def worker_accelerator_ids(self): + def world_device_ids(self): r = [] for 
pod in self.pods: for t in pod.trainers: @@ -459,7 +459,7 @@ def start_local_trainers(cluster, current_env.pop("http_proxy", None) current_env.pop("https_proxy", None) - ids=cluster.worker_accelerator_ids() + ids=cluster.world_device_ids() res = [':'.join(ele) for ele in ids] procs = [] for idx, t in enumerate(pod.trainers): diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index 3f93ec41d6503e..086fc63d54e5b4 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -150,7 +150,7 @@ def minimize(self, # Config about Graph Engine can be found in https://support.huaweicloud.com/ config = { - "ge.exec.deviceId": str(fleet.current_worker_accelerator_id()), + "ge.exec.deviceId": str(fleet.local_device_id()), "ge.graphRunMode": "1", "ge.exec.precision_mode": "must_keep_origin_dtype", # if multi mode From 76950c290ce7012b3a4641a1c32c77f0bd1d86a8 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 21 Jan 2021 19:59:42 +0800 Subject: [PATCH 9/9] fix device --- python/paddle/distributed/fleet/__init__.py | 2 +- python/paddle/distributed/fleet/base/fleet_base.py | 4 ++-- python/paddle/distributed/fleet/base/role_maker.py | 8 ++++---- python/paddle/distributed/fleet/launch_utils.py | 4 ++-- .../fleet/meta_optimizers/ascend/ascend_optimizer.py | 2 +- .../tests/unittests/ascend_multi_process_collective.py | 4 ++-- 6 files changed, 12 insertions(+), 12 deletions(-) diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index fff89b07535864..51537b9306ffcd 100644 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -45,7 +45,7 @@ nranks=fleet.worker_num world_size=fleet.worker_num # device id in current trainer -local_device_id=fleet.local_device_id +local_device_ids=fleet.local_device_ids # device ids in world world_device_ids=fleet.world_device_ids # rank in node diff --git a/python/paddle/distributed/fleet/base/fleet_base.py b/python/paddle/distributed/fleet/base/fleet_base.py index 17d9907213b4f1..5ae4475ecce6ca 100644 --- a/python/paddle/distributed/fleet/base/fleet_base.py +++ b/python/paddle/distributed/fleet/base/fleet_base.py @@ -294,8 +294,8 @@ def node_num(self): def local_rank(self): return self._role_maker._get_local_rank() - def local_device_id(self): - return self._role_maker._get_local_device_id() + def local_device_ids(self): + return self._role_maker._get_local_device_ids() def world_device_ids(self): return self._role_maker._get_world_device_ids() diff --git a/python/paddle/distributed/fleet/base/role_maker.py b/python/paddle/distributed/fleet/base/role_maker.py index fa891cb4f08e31..d17741decfbcd8 100644 --- a/python/paddle/distributed/fleet/base/role_maker.py +++ b/python/paddle/distributed/fleet/base/role_maker.py @@ -627,10 +627,10 @@ def _get_local_rank(self): self._generate_role() return self._local_rank - def _get_local_device_id(self): + def _get_local_device_ids(self): if not self._role_is_generated: self._generate_role() - return self._local_device_id + return self._local_device_ids def _get_world_device_ids(self): if not self._role_is_generated: @@ -798,8 +798,8 @@ def _collective_env(self): self._nodes_num = len( set([x.split(':')[0] for x in self._worker_endpoints])) self._local_rank = os.getenv("PADDLE_RANK_IN_NODE") - 
self._local_device_id=os.getenv("PADDLE_CURRENT_WORK_ACCLERATOR_ID") - self._world_device_ids=os.getenv("PADDLE_WORK_ACCLERATOR_IDS") + self._local_device_ids=os.getenv("PADDLE_LOCAL_DEVICE_IDS") + self._world_device_ids=os.getenv("PADDLE_WORLD_DEVICE_IDS") def _gloo_init(self): # PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 30e101ef0f47b2..f39e2284a5805f 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -469,8 +469,8 @@ def start_local_trainers(cluster, "PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(), "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()), "PADDLE_RANK_IN_NODE": str(idx), - "PADDLE_CURRENT_WORK_ACCLERATOR_ID":",".join(t.accelerators), - "PADDLE_WORK_ACCLERATOR_IDS":",".join(res), + "PADDLE_LOCAL_DEVICE_IDS":",".join(t.accelerators), + "PADDLE_WORLD_DEVICE_IDS":",".join(res), } if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU: diff --git a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py index 086fc63d54e5b4..8e8447ad7eab0a 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py @@ -150,7 +150,7 @@ def minimize(self, # Config about Graph Engine can be found in https://support.huaweicloud.com/ config = { - "ge.exec.deviceId": str(fleet.local_device_id()), + "ge.exec.deviceId": str(fleet.local_device_ids()), "ge.graphRunMode": "1", "ge.exec.precision_mode": "must_keep_origin_dtype", # if multi mode diff --git a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py index 28422efad91d75..3f12ba91b227e4 100644 --- a/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py +++ b/python/paddle/fluid/tests/unittests/ascend_multi_process_collective.py @@ -23,8 +23,8 @@ def train(prefix): current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT") worker_endpoints = worker_endpoints_env trainers_num = len(worker_endpoints.split(',')) - device_ids=os.getenv("PADDLE_WORK_ACCLERATOR_IDS") - current_device_id=os.getenv("PADDLE_CURRENT_WORK_ACCLERATOR_ID") + device_ids=os.getenv("PADDLE_WORLD_DEVICE_IDS") + current_device_id=os.getenv("PADDLE_LOCAL_DEVICE_IDS") details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\ .format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)
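
A minimal sketch of how a training script might consume the API this series ends up with in PATCH 9/9 (local_rank, local_device_ids, world_device_ids). The fleet.init call and the split-based parsing below are assumptions added for illustration; per role_maker.py, the getters return the raw environment strings exported by launch_utils.py, not parsed lists:

    from paddle.distributed import fleet

    # Collective init, as in other fleet examples (assumed boilerplate).
    fleet.init(is_collective=True)

    # Values wired through by launch_utils.py in this series. The role maker
    # returns the raw environment strings, so they are None when the process
    # was not started via paddle.distributed.fleet.launch.
    local_rank = fleet.local_rank()        # PADDLE_RANK_IN_NODE, e.g. "0"
    local_ids = fleet.local_device_ids()   # PADDLE_LOCAL_DEVICE_IDS, e.g. "0"
    world_ids = fleet.world_device_ids()   # PADDLE_WORLD_DEVICE_IDS, e.g. "0,1,0,1"

    # Hypothetical parsing, mirroring how launch_utils.py composes the value:
    # one ','-separated entry per trainer, with a trainer's devices ':'-joined.
    my_devices = local_ids.split(",") if local_ids else []
    per_trainer = [g.split(":") for g in world_ids.split(",")] if world_ids else []

    print(local_rank, my_devices, per_trainer)

This matches the expectation in test_fleet_launch_ascend.sh: with one NPU per trainer, world_device_ids yields "0,1,0,1" (one entry per trainer joined with ','); a trainer holding several accelerators would contribute a ':'-joined group such as "0:1".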