Skip to content

Commit 4873c20

Browse files
author
lilong12
authored
modify ut cmakefile (#28140)
* modify ut cmakefile, test=develop
1 parent e8db441 commit 4873c20

File tree

2 files changed

+10
-34
lines changed

2 files changed

+10
-34
lines changed

python/paddle/fluid/tests/unittests/CMakeLists.txt

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,6 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding)
1515
list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer)
1616
list(APPEND DIST_TEST_OPS test_listen_and_serv_op)
1717
list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer)
18-
list(APPEND DIST_TEST_OPS test_collective_reduce_api)
19-
list(APPEND DIST_TEST_OPS test_collective_scatter_api)
20-
list(APPEND DIST_TEST_OPS test_collective_barrier_api)
21-
list(APPEND DIST_TEST_OPS test_collective_allreduce_api)
22-
list(APPEND DIST_TEST_OPS test_collective_broadcast_api)
23-
list(APPEND DIST_TEST_OPS test_collective_allgather_api)
2418
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
2519
#remove distribute unittests.
2620
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
@@ -70,6 +64,12 @@ if(NOT WITH_GPU OR WIN32)
7064
LIST(REMOVE_ITEM TEST_OPS test_collective_scatter)
7165
LIST(REMOVE_ITEM TEST_OPS test_reducescatter)
7266
LIST(REMOVE_ITEM TEST_OPS test_reducescatter_api)
67+
LIST(REMOVE_ITEM TEST_OPS test_collective_reduce_api)
68+
LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api)
69+
LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api)
70+
LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api)
71+
LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api)
72+
LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api)
7373
endif()
7474

7575
#TODO(sunxiaolong01): Fix this unittest, which fails on GCC8.

python/paddle/fluid/tests/unittests/test_collective_api_base.py

Lines changed: 4 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -37,30 +37,6 @@ def get_model(self, train_prog, startup_prog, rank):
3737
raise NotImplementedError(
3838
"get model should be implemented by child class.")
3939

40-
def wait_server_ready(self, endpoints):
41-
assert not isinstance(endpoints, string_types)
42-
while True:
43-
all_ok = True
44-
not_ready_endpoints = []
45-
for ep in endpoints:
46-
ip_port = ep.split(":")
47-
with closing(
48-
socket.socket(socket.AF_INET,
49-
socket.SOCK_STREAM)) as sock:
50-
sock.settimeout(2)
51-
result = sock.connect_ex((ip_port[0], int(ip_port[1])))
52-
if result != 0:
53-
all_ok = False
54-
not_ready_endpoints.append(ep)
55-
if not all_ok:
56-
sys.stderr.write("server not ready, wait 3 sec to retry...\n")
57-
sys.stderr.write("not ready endpoints:" + str(
58-
not_ready_endpoints) + "\n")
59-
sys.stderr.flush()
60-
time.sleep(3)
61-
else:
62-
break
63-
6440
def run_trainer(self, args):
6541
train_prog = fluid.Program()
6642
startup_prog = fluid.Program()
@@ -157,8 +133,8 @@ def _run_cluster(self, model_file, envs):
157133
tr_cmd = "%s %s"
158134
tr0_cmd = tr_cmd % (self._python_interp, model_file)
159135
tr1_cmd = tr_cmd % (self._python_interp, model_file)
160-
tr0_pipe = open("/tmp/tr0_err.log", "w")
161-
tr1_pipe = open("/tmp/tr1_err.log", "w")
136+
tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w")
137+
tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w")
162138
#print(tr0_cmd)
163139
tr0_proc = subprocess.Popen(
164140
tr0_cmd.strip().split(),
@@ -179,9 +155,9 @@ def _run_cluster(self, model_file, envs):
179155
# close both trainer log files
180156
tr0_pipe.close()
181157
tr1_pipe.close()
182-
with open("/tmp/tr0_err.log", "r") as f:
158+
with open("/tmp/tr0_err_%d.log" % os.getpid(), "r") as f:
183159
sys.stderr.write('trainer 0 stderr file: %s\n' % f.read())
184-
with open("/tmp/tr1_err.log", "r") as f:
160+
with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f:
185161
sys.stderr.write('trainer 1 stderr file: %s\n' % f.read())
186162
return pickle.loads(tr0_out), pickle.loads(
187163
tr1_out), tr0_proc.pid, tr1_proc.pid

0 commit comments

Comments
 (0)