@@ -37,30 +37,6 @@ def get_model(self, train_prog, startup_prog, rank):
3737 raise NotImplementedError (
3838 "get model should be implemented by child class." )
3939
def wait_server_ready(self, endpoints):
    """Block until every endpoint accepts a TCP connection.

    Each endpoint is probed with a short-lived IPv4 TCP socket. While any
    endpoint refuses the connection, the list of pending endpoints is
    written to stderr and the probe is repeated after a 3 second pause.

    Args:
        endpoints: iterable of "host:port" strings. A single bare string
            is rejected because iterating it would yield characters.

    Returns:
        None, once all endpoints have accepted a connection.
    """
    assert not isinstance(endpoints, string_types)
    while True:
        not_ready_endpoints = []
        for ep in endpoints:
            # Split on the last colon only, so the port is always the
            # final field even if the host part contains a colon.
            host, port = ep.rsplit(":", 1)
            with closing(
                    socket.socket(socket.AF_INET,
                                  socket.SOCK_STREAM)) as sock:
                sock.settimeout(2)
                try:
                    result = sock.connect_ex((host, int(port)))
                except socket.error:
                    # connect_ex only reports C-level connect() errors as
                    # a return code; resolution failures still raise.
                    # Treat them as "not ready" so the loop keeps retrying.
                    result = -1
                if result != 0:
                    not_ready_endpoints.append(ep)
        if not not_ready_endpoints:
            break
        sys.stderr.write("server not ready, wait 3 sec to retry...\n")
        sys.stderr.write("not ready endpoints:" + str(
            not_ready_endpoints) + "\n")
        sys.stderr.flush()
        time.sleep(3)
63-
6440 def run_trainer (self , args ):
6541 train_prog = fluid .Program ()
6642 startup_prog = fluid .Program ()
@@ -157,8 +133,8 @@ def _run_cluster(self, model_file, envs):
157133 tr_cmd = "%s %s"
158134 tr0_cmd = tr_cmd % (self ._python_interp , model_file )
159135 tr1_cmd = tr_cmd % (self ._python_interp , model_file )
160- tr0_pipe = open ("/tmp/tr0_err .log" , "w" )
161- tr1_pipe = open ("/tmp/tr1_err .log" , "w" )
136+ tr0_pipe = open ("/tmp/tr0_err_%d .log" % os . getpid () , "w" )
137+ tr1_pipe = open ("/tmp/tr1_err_%d .log" % os . getpid () , "w" )
162138 #print(tr0_cmd)
163139 tr0_proc = subprocess .Popen (
164140 tr0_cmd .strip ().split (),
@@ -179,9 +155,9 @@ def _run_cluster(self, model_file, envs):
179155 # close trainer file
180156 tr0_pipe .close ()
181157 tr1_pipe .close ()
182- with open ("/tmp/tr0_err .log" , "r" ) as f :
158+ with open ("/tmp/tr0_err_%d .log" % os . getpid () , "r" ) as f :
183159 sys .stderr .write ('trainer 0 stderr file: %s\n ' % f .read ())
184- with open ("/tmp/tr1_err .log" , "r" ) as f :
160+ with open ("/tmp/tr1_err_%d .log" % os . getpid () , "r" ) as f :
185161 sys .stderr .write ('trainer 1 stderr file: %s\n ' % f .read ())
186162 return pickle .loads (tr0_out ), pickle .loads (
187163 tr1_out ), tr0_proc .pid , tr1_proc .pid
0 commit comments