@@ -221,10 +221,11 @@ def _allreduce_fusion_program(self):
221221
222222 # find all grad params
223223 for idx , op in enumerate (block .ops ):
224+ if first_backward_idx == - 1 and \
225+ is_backward_op (op ):
226+ first_backward_idx = idx
224227 if is_backward_op (op ) and \
225228 OP_ROLE_VAR_KEY in op .attr_names :
226- if first_backward_idx == - 1 :
227- first_backward_idx = idx
228229 op_role_var = op .attr (OP_ROLE_VAR_KEY )
229230 if len (op_role_var ) == 0 :
230231 continue
@@ -239,14 +240,25 @@ def _allreduce_fusion_program(self):
239240 continue
240241 param_grads .append ((param , grad ))
241242
242- # find the index of the op which generates the grad
243- grads_to_idx = {}
244- for param , grad in param_grads :
245- for idx in range (first_backward_idx , len (block .ops )):
246- op = block .ops [idx ]
247- if grad .name in op .output_arg_names :
248- grads_to_idx [grad ] = idx
249- break
243+ # Each item of outputs_name_to_idx is a pair of idx
244+ # The first entry of this pair is the idx of the first op that generates the grad
245+ # which is used to indicate the position to insert coalesce op
246+ # The second entry of this pair is the idx of the last op that generates the grad
247+ # which is used to indicate the position to insert sync and allreduce op
248+ outputs_name_to_idx = {}
249+ for idx in range (first_backward_idx , len (block .ops )):
250+ op = block .ops [idx ]
251+ if is_optimizer_op (op ):
252+ break
253+ for name in op .output_arg_names :
254+ var = block .var (name )
255+ if not outputs_name_to_idx .get (var ):
256+ # if the grad is only generated by one op
257+ # the first idx and the last idx are identical
258+ outputs_name_to_idx [var ] = (idx , idx )
259+ else :
260+ outputs_name_to_idx [var ] = (outputs_name_to_idx [var ][0 ],
261+ idx )
250262
251263 # structure of grad_param_segments is
252264 # [([grad0, grad1], [param0, param1]), ([grad2, grad3], [param2, param3])]
@@ -279,8 +291,8 @@ def _allreduce_fusion_program(self):
279291 dtype = grad_segment [0 ].dtype ,
280292 persistable = False ,
281293 stop_gradient = True )
282- before_idx = grads_to_idx [grad_segment [0 ]]
283- after_idx = grads_to_idx [grad_segment [- 1 ]]
294+ before_idx = outputs_name_to_idx [grad_segment [0 ]][ 0 ]
295+ after_idx = outputs_name_to_idx [grad_segment [- 1 ]][ 1 ]
284296 offset = 1
285297 for j in range (i + 1 , len (grad_param_segments )):
286298 # Find the offset of the sync op and allreduce op
@@ -289,7 +301,12 @@ def _allreduce_fusion_program(self):
289301 # the first grad in next segment are from the same op, it means
290302 # a coalesce op has already been inserted before this op.
291303 # Therefore, we have to insert the sync/allreduce op with offset.
292- if after_idx == grads_to_idx [grad_param_segments [j ][0 ][0 ]]:
304+ # The j is to get the ([grad0, grad1], [param0, param1]) tuple
305+ # The first 0 is to get [grad0, grad1] list
306+ # The second 0 is to get grad0 entry
307+ # The 1 is to get the idx of the last op that generates the grad
308+ if after_idx == outputs_name_to_idx [grad_param_segments [j ][0 ][
309+ 0 ]][1 ]:
293310 offset += 1
294311 block ._insert_op_without_sync (
295312 after_idx + offset ,
0 commit comments