 import paddle
 import paddle.distributed as dist
 import paddle.fluid as fluid
-from paddle.fluid.framework import _test_eager_guard
 from paddle.nn import Linear

 paddle.seed(1024)
@@ -69,58 +68,57 @@ def forward(self, x):
 class TestDistTraning(unittest.TestCase):
     def test_multiple_xpus(self):
         self.trainer_id = dist.get_rank()
-        with _test_eager_guard():
-            self.pg = dist.init_parallel_env()
+        self.pg = dist.init_parallel_env()

-            model_a = SimpleNet(self.trainer_id)
-            model_b = SimpleNet(self.trainer_id)
+        model_a = SimpleNet(self.trainer_id)
+        model_b = SimpleNet(self.trainer_id)

-            state_dict = model_a.state_dict()
-            model_b.set_state_dict(state_dict)
+        state_dict = model_a.state_dict()
+        model_b.set_state_dict(state_dict)

-            model_a = paddle.DataParallel(
-                model_a, find_unused_parameters=True, group=self.pg
+        model_a = paddle.DataParallel(
+            model_a, find_unused_parameters=True, group=self.pg
+        )
+        model_b = paddle.DataParallel(
+            model_b, find_unused_parameters=True, group=self.pg
+        )
+
+        ones_input = paddle.ones(shape=(batch, in_dim))
+        ones_input.stop_gradient = True
+
+        w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')
+        w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')
+
+        for step_id in range(5):
+            random_input = paddle.rand(shape=(batch, in_dim))
+            random_input.stop_gradient = True
+
+            if step_id % 2 == 0:
+                out_a = model_a(random_input)
+                out_b = model_b(random_input)
+            else:
+                out_a = model_a(ones_input)
+                out_b = model_b(ones_input)
+
+            out_a.sum().backward()
+            out_b.sum().backward()
+
+            self.check_gradient(model_a.parameters())
+            self.check_gradient(model_b.parameters())
+
+            # test acc gradient
+            w1_grad_sum = self.check_acc(
+                model_a._layers.w1.grad,
+                w1_grad_sum,
+                model_b._layers.w1.grad,
             )
-            model_b = paddle.DataParallel(
-                model_b, find_unused_parameters=True, group=self.pg
+            w2_grad_sum = self.check_acc(
+                model_a._layers.w2.grad,
+                w2_grad_sum,
+                model_b._layers.w2.grad,
             )

-            ones_input = paddle.ones(shape=(batch, in_dim))
-            ones_input.stop_gradient = True
-
-            w1_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')
-            w2_grad_sum = np.zeros((in_dim, out_dim), dtype='float32')
-
-            for step_id in range(5):
-                random_input = paddle.rand(shape=(batch, in_dim))
-                random_input.stop_gradient = True
-
-                if step_id % 2 == 0:
-                    out_a = model_a(random_input)
-                    out_b = model_b(random_input)
-                else:
-                    out_a = model_a(ones_input)
-                    out_b = model_b(ones_input)
-
-                out_a.sum().backward()
-                out_b.sum().backward()
-
-                self.check_gradient(model_a.parameters())
-                self.check_gradient(model_b.parameters())
-
-                # test acc gradient
-                w1_grad_sum = self.check_acc(
-                    model_a._layers.w1.grad,
-                    w1_grad_sum,
-                    model_b._layers.w1.grad,
-                )
-                w2_grad_sum = self.check_acc(
-                    model_a._layers.w2.grad,
-                    w2_grad_sum,
-                    model_b._layers.w2.grad,
-                )
-
-                model_a.clear_gradients()
+            model_a.clear_gradients()

     def check_acc(self, grad, grad_sum, acc_grad):
         if grad is not None:
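Note on the accumulation check the loop relies on: only model_a.clear_gradients() is called each step, so model_b keeps accumulating gradients while model_a is reset. Below is a minimal, hypothetical sketch of that pattern, not the actual check_acc body (which is truncated above); the helper name, the rtol value, and calling .numpy() on the gradient tensors are all assumptions for illustration.

import numpy as np

def accumulate_and_compare(step_grad, running_sum, accumulated_grad):
    # step_grad:        this step's gradient from the model that is cleared each iteration
    # running_sum:      numpy running total maintained by the caller
    # accumulated_grad: gradient from the model that is never cleared
    if step_grad is not None:
        running_sum = running_sum + step_grad.numpy()  # assumes the grad exposes .numpy()
        np.testing.assert_allclose(running_sum, accumulated_grad.numpy(), rtol=1e-6)
    return running_sum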