2727paddle .enable_static ()
2828
2929
30- class CollectiveCPUBarrierText (unittest .TestCase ):
30+ class CollectiveCPUBarrierWithGlooTest (unittest .TestCase ):
3131 def find_free_port (self ):
3232 def _free_port ():
3333 with closing (socket .socket (socket .AF_INET ,
@@ -42,42 +42,50 @@ def _free_port():
4242 return port
4343
def barrier_func(self, id, rank_num, server_endpoint, out_dict, sleep_time):
    """Time a gloo barrier for one rank and store the result in ``out_dict``.

    The rank initializes the gloo parallel environment, passes a first
    barrier, then measures how long it takes to get through a second
    barrier while rank 0 deliberately sleeps before entering it.

    Args:
        id: Rank of the calling process; also the key written in
            ``out_dict``.
        rank_num: Total number of ranks taking part in the barrier.
        server_endpoint: Endpoint passed through to
            ``gloo_init_parallel_env``.
        out_dict: Mapping that receives ``out_dict[id]``: the elapsed
            seconds around the second barrier, or ``0`` if anything
            raised.  (Presumably a dict shared with the parent process --
            confirm against the caller.)
        sleep_time: Seconds rank 0 sleeps before the second barrier.
    """
    try:
        paddle.distributed.gloo_init_parallel_env(id, rank_num,
                                                  server_endpoint)
        # 1st barrier
        # Run barrier to synchronize processes after starting
        paddle.distributed.gloo_barrier()
        # 2nd barrier
        # Let rank 0 sleep for `sleep_time` seconds and check that all
        # processes saw that artificial delay through the barrier
        start = time.time()
        if id == 0:
            time.sleep(sleep_time)
        paddle.distributed.gloo_barrier()
        end = time.time()
        out_dict[id] = end - start
        # Release
        paddle.distributed.gloo_release()
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate, and print the
        # traceback so the failure is diagnosable instead of silent.
        import traceback
        traceback.print_exc()
        # 0 marks this rank as failed (presumably the parent's timing
        # check then rejects it -- confirm against the test assertions).
        out_dict[id] = 0
6064
def barrier_op(self, id, rank_num, server_endpoint, out_dict, sleep_time):
    """Time a barrier run as a static-graph op and store it in ``out_dict``.

    A program containing a single ``paddle.distributed.barrier()`` op is
    built and run on a CPU executor: once to pass a first barrier, then a
    second time under a timer while rank 0 deliberately sleeps before
    running it.

    Args:
        id: Rank of the calling process; also the key written in
            ``out_dict``.
        rank_num: Total number of ranks taking part in the barrier.
        server_endpoint: Endpoint passed through to
            ``gloo_init_parallel_env``.
        out_dict: Mapping that receives ``out_dict[id]``: the elapsed
            seconds around the second barrier run, or ``0`` if anything
            raised.  (Presumably a dict shared with the parent process --
            confirm against the caller.)
        sleep_time: Seconds rank 0 sleeps before the second barrier run.
    """
    try:
        main_prog = fluid.Program()
        startup_prog = fluid.Program()
        paddle.distributed.gloo_init_parallel_env(id, rank_num,
                                                  server_endpoint)
        place = fluid.CPUPlace()
        with fluid.program_guard(main_prog, startup_prog):
            paddle.distributed.barrier()
        exe = fluid.Executor(place)
        # Run barrier to synchronize processes after starting
        exe.run(main_prog)
        # Let rank 0 sleep for `sleep_time` seconds and check that all
        # processes saw that artificial delay through the barrier
        start = time.time()
        if id == 0:
            time.sleep(sleep_time)
        exe.run(main_prog)
        end = time.time()
        out_dict[id] = end - start
        # Release
        paddle.distributed.gloo_release()
    except Exception:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt/SystemExit still propagate, and print the
        # traceback so the failure is diagnosable instead of silent.
        import traceback
        traceback.print_exc()
        # 0 marks this rank as failed (presumably the parent's timing
        # check then rejects it -- confirm against the test assertions).
        out_dict[id] = 0
8189
8290 def test_barrier_func_with_multiprocess (self ):
8391 num_of_ranks = 4
@@ -90,7 +98,7 @@ def test_barrier_func_with_multiprocess(self):
9098 jobs = []
9199 for id in range (num_of_ranks ):
92100 p = multiprocessing .Process (
93- target = self .barrier_op ,
101+ target = self .barrier_func ,
94102 args = (id , num_of_ranks , ep_str , procs_out_dict , sleep_time ))
95103 jobs .append (p )
96104 p .start ()
0 commit comments