Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion python/paddle/distributed/fleet/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,13 @@
rank=fleet.worker_index
nranks=fleet.worker_num
world_size=fleet.worker_num
rank_in_node=fleet.rank_in_node
# device id in current trainer
local_device_ids=fleet.local_device_ids
# device ids in world
world_device_ids=fleet.world_device_ids
# rank in node
local_rank=fleet.local_rank
rank_in_node=local_rank
is_worker = fleet.is_worker
worker_endpoints = fleet.worker_endpoints
server_num = fleet.server_num
Expand Down
10 changes: 8 additions & 2 deletions python/paddle/distributed/fleet/base/fleet_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,8 +291,14 @@ def worker_num(self):
def node_num(self):
return self._role_maker._get_node_num()

def rank_in_node(self):
return self._role_maker._get_rank_in_node()
def local_rank(self):
return self._role_maker._get_local_rank()

def local_device_ids(self):
return self._role_maker._get_local_device_ids()

def world_device_ids(self):
return self._role_maker._get_world_device_ids()

def is_worker(self):
"""
Expand Down
18 changes: 15 additions & 3 deletions python/paddle/distributed/fleet/base/role_maker.py
Original file line number Diff line number Diff line change
Expand Up @@ -622,10 +622,20 @@ def _get_node_num(self):
self._generate_role()
return self._nodes_num

def _get_rank_in_node(self):
def _get_local_rank(self):
if not self._role_is_generated:
self._generate_role()
return self._rank_in_node
return self._local_rank

def _get_local_device_ids(self):
if not self._role_is_generated:
self._generate_role()
return self._local_device_ids

def _get_world_device_ids(self):
if not self._role_is_generated:
self._generate_role()
return self._world_device_ids

def _get_trainer_endpoints(self):
"""
Expand Down Expand Up @@ -787,7 +797,9 @@ def _collective_env(self):
self._trainers_num = len(self._worker_endpoints)
self._nodes_num = len(
set([x.split(':')[0] for x in self._worker_endpoints]))
self._rank_in_node = os.getenv("PADDLE_RANK_IN_NODE")
self._local_rank = os.getenv("PADDLE_RANK_IN_NODE")
self._local_device_ids=os.getenv("PADDLE_LOCAL_DEVICE_IDS")
self._world_device_ids=os.getenv("PADDLE_WORLD_DEVICE_IDS")

def _gloo_init(self):
# PADDLE_WITH_GLOO 1: trainer barrier, 2: all barrier
Expand Down
13 changes: 12 additions & 1 deletion python/paddle/distributed/fleet/launch_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,13 @@ def trainers_endpoints(self):
r.append(t.endpoint)
return r

def world_device_ids(self):
r = []
for pod in self.pods:
for t in pod.trainers:
r.append(t.accelerators)
return r

def pods_endpoints(self):
r = []
for pod in self.pods:
Expand Down Expand Up @@ -452,14 +459,18 @@ def start_local_trainers(cluster,
current_env.pop("http_proxy", None)
current_env.pop("https_proxy", None)

ids=cluster.world_device_ids()
res = [':'.join(ele) for ele in ids]
procs = []
for idx, t in enumerate(pod.trainers):
proc_env = {
"PADDLE_TRAINER_ID": "%d" % t.rank,
"PADDLE_CURRENT_ENDPOINT": "%s" % t.endpoint,
"PADDLE_TRAINERS_NUM": "%d" % cluster.trainers_nranks(),
"PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints()),
"PADDLE_RANK_IN_NODE": str(idx)
"PADDLE_RANK_IN_NODE": str(idx),
"PADDLE_LOCAL_DEVICE_IDS":",".join(t.accelerators),
"PADDLE_WORLD_DEVICE_IDS":",".join(res),
}

if len(t.accelerators) > 0 and pod.device_mode==DeviceMode.GPU:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ def minimize(self,

# Config about Graph Engine can be found in https://support.huaweicloud.com/
config = {
"ge.exec.deviceId": str(fleet.rank_in_node()),
"ge.exec.deviceId": str(fleet.local_device_ids()),
"ge.graphRunMode": "1",
"ge.exec.precision_mode": "must_keep_origin_dtype",
# if multi mode
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,11 @@ def train(prefix):
current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
worker_endpoints = worker_endpoints_env
trainers_num = len(worker_endpoints.split(','))
device_ids=os.getenv("PADDLE_WORLD_DEVICE_IDS")
current_device_id=os.getenv("PADDLE_LOCAL_DEVICE_IDS")

details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{}"\
.format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id)
details = "selected_accelerators:{} worker_endpoints:{} trainers_num:{} current_endpoint:{} trainer_id:{} device_ids:{} device_id:{}"\
.format(selected_accelerators, worker_endpoints, trainers_num, current_endpoint,trainer_id,device_ids, current_device_id)

print(details)
with open("multi_process_{}.check_{}.log".format(prefix, trainer_id), "w") as f:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ export TRAINER_PORTS_NUM=2
distributed_args="--ips=${cluster_node_ips} --ascend_npus=0,1 --log_dir=testlog"
python -m paddle.distributed.fleet.launch ${distributed_args} ascend_multi_process_collective.py fleetlaunchascend

str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1"
str1="selected_accelerators:0 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35789 trainer_id:0 device_ids:0,1,0,1 device_id:0"
str2="selected_accelerators:1 worker_endpoints:127.0.0.1:35789,127.0.0.1:35790,127.0.0.2:35789,127.0.0.2:35790 trainers_num:4 current_endpoint:127.0.0.1:35790 trainer_id:1 device_ids:0,1,0,1 device_id:1"
file_0="multi_process_fleetlaunchascend.check_0.log"
file_1="multi_process_fleetlaunchascend.check_1.log"

Expand Down