Merged
Changes from all commits
134 commits
b985745
add auto_parallel dir
Jun 28, 2021
b79e749
mv to paddle.distributed
Jun 28, 2021
1671850
add shard_xx api
Jul 1, 2021
ec55a43
add distributed attrs for var
Jul 8, 2021
25abc00
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jul 9, 2021
bf24fb7
add ut, test=develop
Jul 9, 2021
8ea9363
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jul 18, 2021
9e4b3d8
add dist
Jul 21, 2021
e65f77e
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Jul 22, 2021
8b95c1e
update
Jul 26, 2021
ccae6ae
update
Jul 26, 2021
d107751
update
Jul 27, 2021
f7e70ea
update
Jul 27, 2021
3111159
update
Jul 27, 2021
70cdb69
update, test=develop
Jul 27, 2021
9e5b0f0
update, test=develop
Jul 27, 2021
59936ef
update, test=develop
Jul 27, 2021
27ee413
update, test=develop
Jul 27, 2021
3a8ceef
update, test=develop
Jul 27, 2021
d11f317
update, test=develop
Jul 28, 2021
f5ef245
update, test=develop
Jul 28, 2021
7293b4f
update
Jul 28, 2021
1240edc
update
Jul 28, 2021
05455fb
update
Jul 28, 2021
3e1b3a0
update
Jul 28, 2021
8950c35
update
Jul 28, 2021
b94a9f2
update, test=develop
Jul 28, 2021
e121349
update, test=develop
Jul 28, 2021
fe51aa3
update
Jul 28, 2021
4563d42
update
Jul 28, 2021
192580d
Merge branch 'develop' into auto_parallel_basic
Jul 28, 2021
2e69980
delete unused proto
Jul 28, 2021
608dd3f
resotre op_desc
Jul 28, 2021
cb9b6bf
restore type_defs
Jul 28, 2021
8e6559e
update var_desc
Jul 28, 2021
00f5f4d
remove dimss_mapping for proto_pybind
Jul 28, 2021
1aa94da
update interface.py
Jul 28, 2021
97a446c
update framework.py
Jul 28, 2021
c586fc6
update
Jul 28, 2021
fc6cde9
update
Jul 29, 2021
9d1a664
add auto_parallel dir
Jun 28, 2021
5d1b472
mv to paddle.distributed
Jun 28, 2021
d1aabad
add shard_xx api
Jul 1, 2021
e6ba855
add distributed attrs for var
Jul 8, 2021
3bf613c
add ut, test=develop
Jul 9, 2021
8942a99
[WIP] Add the auto completion feature and related codes
aoyulong Jul 16, 2021
6916cf2
[WIP] Improve the auto completion and related codes
aoyulong Jul 18, 2021
cafdd18
[WIP] Make the auto completion to support data-parallel
aoyulong Jul 19, 2021
4d6dd52
[WIP] Make the completion support mp and dp+mp
aoyulong Jul 19, 2021
3f05d09
[WIP] Refactor auto completion unit test for MLP
aoyulong Jul 20, 2021
2c56e12
[WIP] Refactor the implementation of DistributedOperatorImpl
aoyulong Jul 21, 2021
a83e9cd
[WIP] Improve dims_mapping update rule and fix a bug
aoyulong Jul 21, 2021
203ea14
[WIP] Support auto completion for one transformer decoder layer
aoyulong Jul 21, 2021
bbc2c39
[WIP] Add a minor change
aoyulong Jul 21, 2021
2b6f992
[WIP] Fix a bug within the uint test
aoyulong Jul 22, 2021
921c53d
Shard XShape tensor, add embedding completion and refactor code
aoyulong Jul 27, 2021
a03d503
Add the distributed_operators dir to setup.py.in
aoyulong Jul 28, 2021
3770f13
Improve the completion process and add the unittest for gpt
aoyulong Jul 29, 2021
967d0e7
fix process_mesh ut
Jul 29, 2021
cd1e390
fix process_mesh ut
Jul 29, 2021
f48ec91
update
Jul 29, 2021
b07affa
update, test=develop
Jul 30, 2021
f304b47
Add support for automatically completing distributed attrs of special…
aoyulong Jul 30, 2021
a00fe9e
update
Jul 30, 2021
da9fe30
update
Aug 2, 2021
3daecf2
update
Aug 2, 2021
5640879
fix doc sample codes, test=develop
Aug 2, 2021
05b0f82
improve coverage, test=develop
Aug 2, 2021
fe93d0e
add static_mode check, test=develop
Aug 2, 2021
033c541
Model the cluster for cost model and physical mapping
aoyulong Aug 4, 2021
9856d47
update, test=develop
Aug 4, 2021
890c70c
add set_placement, test=develop
Aug 5, 2021
6291697
Add the check to make sure the candidate tensors' size is great than …
aoyulong Aug 5, 2021
4b90b03
update doc, test=develop
Aug 5, 2021
c395b84
update doc, test=develop
Aug 5, 2021
8390e01
update doc, test=develop
Aug 5, 2021
f7d5631
update doc, test=develop
Aug 6, 2021
3a2666e
update, test=develop
Aug 6, 2021
fa98e39
Auto mark dist attrs annotated by user
aoyulong Aug 9, 2021
b5b8b9b
Merge branch 'PaddlePaddle:develop' into develop
aoyulong Aug 9, 2021
70bc589
Merge branch 'PaddlePaddle:develop' into develop
aoyulong Aug 9, 2021
b9bd421
Merge PR#33804
aoyulong Aug 9, 2021
b59bc33
Merge branch 'PaddlePaddle:develop' into develop
aoyulong Aug 9, 2021
773516b
update ndarray to nested list, test=develop
Aug 10, 2021
685504f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
Aug 10, 2021
632eeac
Merge branch 'PaddlePaddle:develop' into develop
aoyulong Aug 10, 2021
87abb4b
Merge branch 'pr_33804' into auto_parallel
aoyulong Aug 10, 2021
7ac6299
update, test=develop
Aug 10, 2021
c724593
Add auto-completion module for auto-parallel (based on PR#33804)
aoyulong Aug 11, 2021
63e66bc
Merge branch 'pr_33804' into auto_parallel
aoyulong Aug 11, 2021
7087b1e
Merge branch 'PaddlePaddle:develop' into develop
aoyulong Aug 11, 2021
1908acf
Merge branch 'develop' of https://github.com/aoyulong/Paddle into aut…
aoyulong Aug 11, 2021
86ccd47
Remove unnecessary files
aoyulong Aug 11, 2021
3f7dca2
Remove unrelated files for the auto completion pr
aoyulong Aug 11, 2021
ed02152
Update the unit test to improve the coverage
aoyulong Aug 12, 2021
88e9e23
Modify codes based on reviews
aoyulong Aug 16, 2021
63a6ec6
Minor changes for CI
aoyulong Aug 17, 2021
6b77bc8
Improve some codes based on new comments
aoyulong Aug 17, 2021
411507d
Merge branch 'auto_parallel_completion' of https://github.com/aoyulon…
aoyulong Aug 19, 2021
2ef97ba
support shard reader
JZ-LIANG Aug 24, 2021
5993a30
support shard reader
JZ-LIANG Aug 24, 2021
a8a26de
add parallel mode
JZ-LIANG Aug 24, 2021
93348eb
update process mesh
JZ-LIANG Aug 24, 2021
e957043
add method to compute comm_group
JZ-LIANG Aug 24, 2021
291d1b7
implement dist_embedding forward func
JZ-LIANG Aug 24, 2021
c74ee5a
implement dist matmul forward func
JZ-LIANG Aug 24, 2021
4c00571
implement dist reshape forward func
JZ-LIANG Aug 24, 2021
b75ceca
add transpiler framework
JZ-LIANG Aug 24, 2021
67abec3
add transpiler forward
JZ-LIANG Aug 24, 2021
cd2526c
implement transpiler forward
JZ-LIANG Aug 24, 2021
d7d3b74
implement transpiler backward & update
JZ-LIANG Aug 24, 2021
52d054c
add process
JZ-LIANG Aug 24, 2021
1b8ddfb
add unitest
JZ-LIANG Aug 24, 2021
ae3e506
chmod
JZ-LIANG Aug 24, 2021
e2fa7cd
chmod
JZ-LIANG Aug 24, 2021
de53039
chmod
JZ-LIANG Aug 24, 2021
fbe3356
update unitest
JZ-LIANG Aug 24, 2021
d0798cb
add unitest for gpt
JZ-LIANG Aug 25, 2021
fbc42d6
remove unused print
JZ-LIANG Aug 25, 2021
f0f58dc
rename transpiler --> partitioner
JZ-LIANG Aug 25, 2021
f5cd926
rename transpiler --> partitioner
JZ-LIANG Aug 25, 2021
2ebece8
chmod
JZ-LIANG Aug 25, 2021
b22ea19
chmod
JZ-LIANG Aug 25, 2021
cc694b1
bug fixed
JZ-LIANG Aug 25, 2021
1cc96ca
remove amp function
JZ-LIANG Aug 26, 2021
4fb30ef
update case for dp mode
JZ-LIANG Aug 27, 2021
56ff62e
update case for dp mode
JZ-LIANG Aug 27, 2021
4ec9f80
Merge branch 'pr_35117' into auto_parallel_integration
aoyulong Aug 29, 2021
0cb34e2
[Auto Parallel] Integrate all parts with the newest code
aoyulong Aug 29, 2021
89b467f
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
aoyulong Sep 2, 2021
d7286cb
Integrate all parts of auto parallel and improve codes
aoyulong Sep 6, 2021
4fb96e6
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
aoyulong Sep 6, 2021
00d699e
Modify distributed_strategy.proto to conform the main stream
aoyulong Sep 6, 2021
79f1025
Restore parts of distributed_strategy to conform the develop branch
aoyulong Sep 6, 2021
1 change: 1 addition & 0 deletions paddle/fluid/framework/distributed_strategy.proto
@@ -202,6 +202,7 @@ message DistributedStrategy {
optional bool calc_comm_same_stream = 32 [ default = false ];
optional bool asp = 33 [ default = false ];
optional bool fuse_grad_merge = 34 [ default = false ];
optional bool semi_auto = 35 [ default = false ];

optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
Expand Down
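The one-line proto change above adds a `semi_auto` switch to `DistributedStrategy`, following the same pattern as the existing boolean feature flags (off by default, enabled explicitly by the user). A minimal Python stand-in sketches the intended usage pattern; the dataclass below is a hypothetical mirror of the proto message, not Paddle's actual `DistributedStrategy` class:

```python
from dataclasses import dataclass

@dataclass
class DistributedStrategy:
    # Mirrors the proto defaults: every feature flag is off unless enabled.
    calc_comm_same_stream: bool = False
    asp: bool = False
    fuse_grad_merge: bool = False
    semi_auto: bool = False  # new field 35: enable semi-automatic parallelism

strategy = DistributedStrategy()
# The user annotates a few tensors/ops; completion fills in the rest.
strategy.semi_auto = True
```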
170 changes: 143 additions & 27 deletions python/paddle/distributed/auto_parallel/completion.py
@@ -253,6 +253,9 @@ def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
if (not tensor_node.is_var()) or (tensor_node.var() is None):
return False
tensor_desc = tensor_node.var()
# Skip reader tensor
if tensor_desc.type() == core.VarDesc.VarType.READER:
return False
tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
tensor_node)
assert tensor_dist_attr is not None
@@ -263,6 +266,10 @@ def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
dims_mapping_list = []
for pred_op_node in tensor_node.inputs:
if pred_op_node.op() is not None:
if pred_op_node.op().type() == "create_py_reader" \
or pred_op_node.op().type() == "create_double_buffer_reader" \
or pred_op_node.op().type() == "read":
continue
op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
pred_op_node)
op_dims_mapping = op_dist_attr.get_output_dims_mapping(
@@ -279,6 +286,10 @@ def update_tensor_node_dims_mapping(dist_context, tensor_node, fwd=True):
dims_mapping_list = []
for succ_op_node in tensor_node.outputs:
if succ_op_node.op() is not None:
if succ_op_node.op().type() == "create_py_reader" \
or succ_op_node.op().type() == "create_double_buffer_reader" \
or succ_op_node.op().type() == "read":
continue
op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
succ_op_node)
op_dims_mapping = op_dist_attr.get_input_dims_mapping(
@@ -298,11 +309,18 @@ def update_op_node_dims_mapping(dist_context, op_node, fwd=True):
changed = False
if (not op_node.is_op()) or (op_node.op() is None):
return False
# Skip reader op
op_desc = op_node.op()
if op_desc.type() == "create_py_reader" \
or op_desc.type() == "create_double_buffer_reader" \
or op_desc.type() == "read":
return False
op_dist_attr = dist_context.get_op_distributed_attr_for_graph(op_node)
if fwd:
for tensor_node in op_node.inputs:
if tensor_node.var() is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER:
continue
tensor_desc = tensor_node.var()
if op_dist_attr.is_annotated_input_dims_mapping(
tensor_desc.name()):
@@ -344,6 +362,8 @@ def update_op_node_dims_mapping(dist_context, op_node, fwd=True):
else:
for tensor_node in op_node.outputs:
if tensor_node.var() is not None:
if tensor_node.var().type() == core.VarDesc.VarType.READER:
continue
tensor_desc = tensor_node.var()
if op_dist_attr.is_annotated_output_dims_mapping(
tensor_desc.name()):
@@ -400,47 +420,143 @@ def complete_annotation(program, dist_context=None):
if dist_context is None:
dist_context = get_default_distributed_context()

    # Initialize distributed attributes for all var and op node in program
dist_context.initialize_distributed_attr_for_program(program)
# print_program_with_distributed_attr(program, dist_context)

# Convert program to graph
graph = framework.IrGraph(core.Graph(program.desc))

# Initialize distributed attributes for all var and op node in graph
dist_context.initialize_distributed_attr_for_graph(graph)

    # Complete process mesh for each node
all_nodes = list(graph.all_nodes())

def sort_key_fun(node):
first = -1
if node.is_op():
first = 0
else:
first = 1
second = -1
if node.is_op() and node.op() is not None:
second = node.op().id()
if node.is_var() and node.var() is not None:
second = node.var().id()
return (first, second)

all_nodes.sort(key=sort_key_fun)

reach_fix_point = False
while not reach_fix_point:
changed = False
for node in all_nodes:
if node.is_var() and node.var() is not None:
tensor_changed = update_tensor_node_process_mesh(
dist_context, node, fwd=True)
if tensor_changed:
changed = True
if node.is_op() and node.op() is not None:
op_changed = update_op_node_process_mesh(
dist_context, node, fwd=True)
if op_changed:
changed = True
for node in reversed(all_nodes):
if node.is_var() and node.var() is not None:
tensor_changed = update_tensor_node_process_mesh(
dist_context, node, fwd=False)
if tensor_changed:
changed = True
if node.is_op() and node.op() is not None:
op_changed = update_op_node_process_mesh(
dist_context, node, fwd=False)
if op_changed:
changed = True
if changed:
total_changed = False
reach_fwd_fix_point = False
reach_bwd_fix_point = False
while not reach_fwd_fix_point:
changed = False
for node in all_nodes:
if node.is_var() and node.var() is not None:
tensor_changed = update_tensor_node_process_mesh(
dist_context, node, fwd=True)
if tensor_changed:
changed = True
if node.is_op() and node.op() is not None:
op_changed = update_op_node_process_mesh(
dist_context, node, fwd=True)
if op_changed:
changed = True
if changed:
reach_fwd_fix_point = False
total_changed = True
else:
reach_fwd_fix_point = True
while not reach_bwd_fix_point:
changed = False
for node in all_nodes:
if node.is_var() and node.var() is not None:
tensor_changed = update_tensor_node_process_mesh(
dist_context, node, fwd=False)
if tensor_changed:
changed = True
if node.is_op() and node.op() is not None:
op_changed = update_op_node_process_mesh(
dist_context, node, fwd=False)
if op_changed:
changed = True
if changed:
reach_bwd_fix_point = False
total_changed = True
else:
reach_bwd_fix_point = True
if total_changed:
reach_fix_point = False
else:
reach_fix_point = True
    # Validate the completed process meshes; this check should be moved to a proper location
is_wrong = False
for node in all_nodes:
if node.is_var() and node.var() is not None:
tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
node)
if tensor_dist_attr.get_process_mesh() is None:
msg_str = ""
for op_node in node.inputs:
if op_node.op() is not None:
op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
op_node)
msg_str += "{} [{}], ".format(
op_node.op().type(),
op_dist_attr.get_process_mesh())
else:
msg_str += "{} [{}], ".format(op_node.name(),
None)
for op_node in node.outputs:
if op_node.op() is not None:
op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
op_node)
msg_str += "{} [{}], ".format(
op_node.op().type(),
op_dist_attr.get_process_mesh())
else:
msg_str += "{} [{}], ".format(op_node.name(),
None)
msg_str = "Cannot decide ProcessMesh of {} among {}. Please use shard_tensor api explicitly to annotate it".format(
node.var().name(), msg_str[:-2])
is_wrong = True
print(msg_str)
if node.is_op() and node.op() is not None:
op_dist_attr = dist_context.get_op_distributed_attr_for_graph(
node)
if op_dist_attr.get_process_mesh() is None:
msg_str = ""
for tensor_node in node.inputs:
if tensor_node.var() is not None:
tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
tensor_node)
msg_str += "{} [{}], ".format(
tensor_node.var().name(),
tensor_dist_attr.get_process_mesh())
else:
msg_str += "{} [{}], ".format(
tensor_node.name(), None)
for tensor_node in node.outputs:
if tensor_node.var() is not None:
tensor_dist_attr = dist_context.get_tensor_distributed_attr_for_graph(
tensor_node)
msg_str += "{} [{}], ".format(
tensor_node.var().name(),
tensor_dist_attr.get_process_mesh())
else:
msg_str += "{} [{}], ".format(
tensor_node.name(), None)
msg_str = "Cannot decide ProcessMesh of {} among {}. Please use shard_op api explicitly to annotate it".format(
node.op().type(), msg_str[:-2])
is_wrong = True
print(msg_str)
if node.is_op() and node.op() is None:
print("op op is None", node.name())
if is_wrong:
assert False, "Cannot complete process_meshes of the program."

# Complete dims_mapping for each node
reach_fix_point = False
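The rewritten loop in `complete_annotation` above runs the forward sweep to its own fixed point, then the backward sweep to its own fixed point, and repeats both until neither sweep changes anything, while the new guards skip reader ops (`create_py_reader`, `create_double_buffer_reader`, `read`) entirely. A toy sketch of that propagation scheme on a linear op graph — `propagate`/`complete` and the list-of-names graph are hypothetical stand-ins for the real `IrGraph` traversal, not Paddle's API:

```python
# Reader ops are skipped by the completion pass, exactly as in the diff above.
READER_OPS = {"create_py_reader", "create_double_buffer_reader", "read"}

def propagate(nodes, meshes, reverse=False):
    """One sweep over the graph; returns True if any node's mesh changed."""
    changed = False
    prev = None
    for name in (reversed(nodes) if reverse else nodes):
        if name in READER_OPS:       # never propagate through a reader op
            prev = None
            continue
        if meshes.get(name) is None and prev is not None:
            meshes[name] = meshes[prev]   # inherit the neighbor's mesh
            changed = True
        prev = name
    return changed

def complete(nodes, meshes):
    while True:
        fwd = propagate(nodes, meshes, reverse=False)
        bwd = propagate(nodes, meshes, reverse=True)
        if not (fwd or bwd):         # fixed point: nothing changed either way
            break
    # Validation step: every non-reader node must end up with a mesh,
    # mirroring the "Cannot decide ProcessMesh" check in the diff.
    missing = [n for n in nodes if n not in READER_OPS and meshes.get(n) is None]
    assert not missing, "Cannot decide ProcessMesh of {}".format(missing)
    return meshes

nodes = ["read", "x", "matmul", "y", "softmax", "z"]
meshes = {n: None for n in nodes}
meshes["matmul"] = [0, 1]            # user-annotated, e.g. via shard_op
complete(nodes, meshes)              # x gets its mesh from the backward sweep
```

Note why both directions are needed: in the forward sweep `x` has no annotated predecessor (the reader is skipped), so only the backward sweep from `matmul` can fill it in.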
45 changes: 30 additions & 15 deletions python/paddle/distributed/auto_parallel/context.py
@@ -142,12 +142,15 @@ def initialize_distributed_attr_for_program(self, program):
tensor.desc, tensor_dist_attr)
self.set_tensor_distributed_attr_for_program(
tensor, tensor_dist_attr)
tensor_dist_attr.set_shape(tensor.desc.shape())
if tensor.type == core.VarDesc.VarType.READER:
tensor_dist_attr.set_shape([])
else:
tensor_dist_attr.set_shape(tensor.desc.shape())
if tensor_dist_attr.get_process_mesh() is not None:
tensor_dist_attr.mark_as_annotated("process_mesh")
if tensor_dist_attr.get_dims_mapping() is None:
tensor_dims_mapping = [
-1 for _ in range(len(tensor.desc.shape()))
-1 for _ in range(len(tensor_dist_attr.get_shape()))
]
tensor_dist_attr.set_dims_mapping(tensor_dims_mapping)
else:
@@ -168,12 +171,18 @@ def initialize_distributed_attr_for_program(self, program):
op_dist_attr.mark_as_annotated("process_mesh")
for tensor_name in op.input_arg_names:
# There may be a better way to find the tensor by name
tensor = op.block._var_recursive(tensor_name)
op_dist_attr.set_input_shape(tensor_name,
tensor.desc.shape())
if op.type == "create_py_reader" \
or tensor.type == core.VarDesc.VarType.READER:
op_dist_attr.set_input_shape(tensor_name, [])
else:
tensor = op.block._var_recursive(tensor_name)
op_dist_attr.set_input_shape(tensor_name,
tensor.desc.shape())
if op_dist_attr.get_input_dims_mapping(tensor_name) is None:
tensor_dims_mapping = [
-1 for _ in range(len(tensor.desc.shape()))
-1
for _ in range(
len(op_dist_attr.get_input_shape(tensor_name)))
]
op_dist_attr.set_input_dims_mapping(tensor_name,
tensor_dims_mapping)
@@ -184,12 +193,18 @@ def initialize_distributed_attr_for_program(self, program):
op_dist_attr.mark_as_parameter(tensor_name)
for tensor_name in op.output_arg_names:
tensor = op.block._var_recursive(tensor_name)
op_dist_attr.set_output_shape(tensor_name,
tensor.desc.shape())
if tensor.type == core.VarDesc.VarType.READER:
op_dist_attr.set_output_shape(tensor_name, [])
else:
op_dist_attr.set_output_shape(tensor_name,
tensor.desc.shape())
if op_dist_attr.get_output_dims_mapping(
tensor_name) is None:
tensor_dims_mapping = [
-1 for _ in range(len(tensor.desc.shape()))
-1
for _ in range(
len(
op_dist_attr.get_output_shape(tensor_name)))
]
op_dist_attr.set_output_dims_mapping(
tensor_name, tensor_dims_mapping)
@@ -378,8 +393,8 @@ def amend_distributed_attr_for_program(self):
# If the dimension of tensor is less than the sharding dimension of process mesh,
# we just amend the dimension mapping to -1. (Is this really OK?)
for i in range(len(tensor_shape)):
if dims_mapping[i] != -1 and process_mesh_shape[dims_mapping[
i]] > tensor_shape[i]:
if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
dims_mapping[i] = -1

for attr in self._op_distributed_attr_map_for_program.values():
@@ -392,8 +407,8 @@ def amend_distributed_attr_for_program(self):
# If the dimension of tensor is less than the sharding dimension of process mesh,
# we just amend the dimension mapping to -1. (Is this really OK?)
for i in range(len(tensor_shape)):
if dims_mapping[i] != -1 and process_mesh_shape[
dims_mapping[i]] > tensor_shape[i]:
if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
dims_mapping[i] = -1

for arg_name in attr.get_owner_op().desc.output_arg_names():
@@ -403,8 +418,8 @@ def amend_distributed_attr_for_program(self):
# If the dimension of tensor is less than the sharding dimension of process mesh,
# we just amend the dimension mapping to -1. (Is this really OK?)
for i in range(len(tensor_shape)):
if dims_mapping[i] != -1 and process_mesh_shape[
dims_mapping[i]] > tensor_shape[i]:
if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
dims_mapping[i] = -1

def _get_data_parallel_info(self):
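The three parallel hunks in `amend_distributed_attr_for_program` above all apply the same amended rule: a sharded dimension is reset to -1 (replicated) when the process-mesh dimension it maps to has more processes than the tensor dimension has elements, and the new `tensor_shape[i] > 0` guard leaves dynamic/unknown dimensions untouched. A self-contained sketch of that rule, extracted from the diff into a hypothetical free function:

```python
def amend_dims_mapping(dims_mapping, tensor_shape, process_mesh_shape):
    """Reset over-sharded dims to -1 (replicated), as in the diff above.

    dims_mapping[i] == -1 means dim i is replicated; otherwise it names the
    process-mesh axis that dim i is sharded over.
    """
    for i in range(len(tensor_shape)):
        # The tensor_shape[i] > 0 guard is the fix in this PR: dynamic or
        # unknown dims (shape -1 or 0) keep whatever mapping they were given.
        if dims_mapping[i] != -1 and tensor_shape[i] > 0 \
                and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]:
            dims_mapping[i] = -1
    return dims_mapping

# dim 0 is sharded over 8 processes but holds only 4 elements -> replicate it;
# dim 1 has a dynamic shape (-1), so the new guard preserves its mapping.
print(amend_dims_mapping([1, 0], [4, -1], [2, 8]))  # -> [-1, 0]
```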