@@ -240,25 +240,8 @@ def _allreduce_fusion_program(self):
                         continue
                     param_grads.append((param, grad))

-        # Each item of outputs_name_to_idx is a pair of idx
-        # The first entry of this pair is the idx of the first op that generates the grad,
-        # which is used to indicate the position to insert the coalesce op
-        # The second entry of this pair is the idx of the last op that generates the grad,
-        # which is used to indicate the position to insert the sync and allreduce ops
-        outputs_name_to_idx = {}
-        for idx in range(first_backward_idx, len(block.ops)):
-            op = block.ops[idx]
-            if is_optimizer_op(op):
-                break
-            for name in op.output_arg_names:
-                var = block.var(name)
-                if not outputs_name_to_idx.get(var):
-                    # if the grad is generated by only one op,
-                    # the first idx and the last idx are identical
-                    outputs_name_to_idx[var] = (idx, idx)
-                else:
-                    outputs_name_to_idx[var] = (outputs_name_to_idx[var][0],
-                                                idx)
+        outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx,
+                                                            block)

         # structure of grad_param_segments is
         # [([grad0, grad1], [param0, param1]), ([grad2, grad3], [param2, param3])]
@@ -280,6 +263,7 @@ def _allreduce_fusion_program(self):
         if len(grad_param_segments) == 0:
             return

+        fused_vars = [None] * len(grad_param_segments)
         for i in range(len(grad_param_segments) - 1, -1, -1):
             # traverse the grad_param_segments in reverse order
             # (do not use reversed() here since the absolute index value is needed)
@@ -291,25 +275,10 @@ def _allreduce_fusion_program(self):
                 dtype=grad_segment[0].dtype,
                 persistable=False,
                 stop_gradient=True)
-            before_idx = outputs_name_to_idx[grad_segment[0]][0]
+            fused_vars[i] = fused_var
             after_idx = outputs_name_to_idx[grad_segment[-1]][1]
-            offset = 1
-            for j in range(i + 1, len(grad_param_segments)):
-                # Find the offset for the sync op and the allreduce op.
-                # Some ops may have multiple grad_param pairs, and these grads might
-                # be split into different segments. If the last grad in this segment
-                # and the first grad in the next segment are from the same op, it
-                # means a coalesce op has already been inserted before this op.
-                # Therefore, we have to insert the sync/allreduce op with an offset.
-                # The j is to get the ([grad0, grad1], [param0, param1]) tuple.
-                # The first 0 is to get the [grad0, grad1] list.
-                # The second 0 is to get the grad0 entry.
-                # The 1 is to get the idx of the last op that generates the grad.
-                if after_idx == outputs_name_to_idx[grad_param_segments[j][0][
-                        0]][1]:
-                    offset += 1
             block._insert_op_without_sync(
-                after_idx + offset,
+                after_idx + 1,
                 type='c_allreduce_sum',
                 inputs={'X': fused_var},
                 outputs={'Out': fused_var},
@@ -320,11 +289,35 @@ def _allreduce_fusion_program(self):
                 })
             if not self.calc_comm_same_stream:
                 block._insert_op_without_sync(
-                    after_idx + offset,
+                    after_idx + 1,
                     type='c_sync_calc_stream',
                     inputs={'X': fused_var},
                     outputs={'Out': fused_var},
                     attrs={OP_ROLE_KEY: OpRole.Backward})
+
+        # update the outputs_name_to_idx after the insertion of the sync/allreduce ops
+        outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx,
+                                                            block)
+        # the before_idx values are not guaranteed to be sorted, so we have to
+        # sort them to find the order in which to insert the coalesce ops
+        pos_for_coalesce = {}
+        for i in range(len(grad_param_segments) - 1, -1, -1):
+            # We separate the insertion of the coalesce op from the insertion of
+            # the sync/allreduce ops, since the coalesce op's insertion may
+            # invalidate the outputs_name_to_idx
+            grad_segment, param_segment = grad_param_segments[i]
+            before_idx = len(block.ops)
+            for grad in grad_segment:
+                before_idx = min(before_idx, outputs_name_to_idx[grad][0])
+            pos_for_coalesce[i] = before_idx
+
+        # insert the coalesce ops at descending before_idx, so that an earlier
+        # insertion never shifts a position that is still pending
+        pos_for_coalesce = sorted(
+            pos_for_coalesce.items(), key=lambda kv: (kv[1], kv[0]), reverse=True)
+        for i, before_idx in pos_for_coalesce:
+            grad_segment, param_segment = grad_param_segments[i]
+            fused_var = fused_vars[i]
             block._insert_op_without_sync(
                 before_idx,
                 type="coalesce_tensor",
@@ -354,3 +347,25 @@ def _allreduce_fusion_program(self):
                            OP_ROLE_KEY: OpRole.Backward})
                 break
         block._sync_with_cpp()
+
+    def __get_ouputs_name_to_idx(self, first_backward_idx, block):
+        # Each item of outputs_name_to_idx is a pair of idx.
+        # The first entry of this pair is the idx of the first op that generates
+        # the grad, which is used to indicate the position to insert the coalesce op.
+        # The second entry of this pair is the idx of the last op that generates
+        # the grad, which is used to indicate the position to insert the sync and
+        # allreduce ops.
+        outputs_name_to_idx = {}
+        for idx in range(first_backward_idx, len(block.ops)):
+            op = block.ops[idx]
+            if is_optimizer_op(op):
+                break
+            for name in op.output_arg_names:
+                var = block.var(name)
+                if not outputs_name_to_idx.get(var):
+                    # if the grad is generated by only one op,
+                    # the first idx and the last idx are identical
+                    outputs_name_to_idx[var] = (idx, idx)
+                else:
+                    outputs_name_to_idx[var] = (outputs_name_to_idx[var][0],
+                                                idx)
+        return outputs_name_to_idx
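
For readers tracing the bookkeeping above, the patch rests on two ideas: each grad is mapped to the pair (idx of its first producing op, idx of its last producing op), and the coalesce ops are then inserted at descending positions so that an earlier insertion never shifts a position that is still pending. Below is a minimal standalone sketch of that logic under toy assumptions; `Op`, `ops`, and the segment layout are made up for illustration and are not PaddlePaddle APIs.

```python
from collections import namedtuple

# Toy stand-in for a block's op list; not a PaddlePaddle type.
Op = namedtuple('Op', ['name', 'outputs'])

ops = [
    Op('backward_a', ['grad0']),
    Op('backward_b', ['grad0', 'grad1']),  # grad0 has two producers
    Op('backward_c', ['grad2']),
]

# Mirror of __get_ouputs_name_to_idx: map each output name to
# (idx of first producer, idx of last producer).
outputs_name_to_idx = {}
for idx, op in enumerate(ops):
    for name in op.outputs:
        first, _ = outputs_name_to_idx.get(name, (idx, idx))
        outputs_name_to_idx[name] = (first, idx)

assert outputs_name_to_idx['grad0'] == (0, 1)

# Each coalesce op must land before the first producer of any grad in
# its segment (the hypothetical segments below are for illustration).
segments = {0: ['grad0', 'grad1'], 1: ['grad2']}
pos_for_coalesce = {
    i: min(outputs_name_to_idx[g][0] for g in grads)
    for i, grads in segments.items()
}

# Insert from the largest position down, like the sorted(..., reverse=True)
# in the patch, so the positions not yet used remain valid.
for i, before_idx in sorted(
        pos_for_coalesce.items(), key=lambda kv: (kv[1], kv[0]),
        reverse=True):
    ops.insert(before_idx, Op('coalesce_segment_%d' % i, []))

print([op.name for op in ops])
# ['coalesce_segment_0', 'backward_a', 'backward_b',
#  'coalesce_segment_1', 'backward_c']
```

Inserting in ascending order instead would require re-offsetting every later position after each insertion, which is essentially the `offset` dance the old code performed and this change removes.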