[pir+auto parallel] add reshard op for input when needed (#63072)

zhiqiu · web-flow · commit 70cc34736b62 · 2024-03-29T10:36:33.000+08:00
* add reshard op for input when needed

* fix unary grad inferspmd
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_api.cc
@@ -59,5 +59,12 @@ pir::Value reshard(const pir::Value& x,
   return reshard_op.result(0);
 }
 
+// Builds a ReShardOp from `x` and `tensor_dist_attr`; returns its result 0.
+pir::Value reshard(const pir::Value& x,
+                   const TensorDistAttribute& tensor_dist_attr) {
+  const auto& builder = ApiBuilder::Instance().GetBuilder();
+  return builder->Build<ReShardOp>(x, tensor_dist_attr).result(0);
+}
+
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_api.h b/paddle/fluid/pir/dialect/distributed/ir/dist_api.h
@@ -16,6 +16,7 @@
 
 #include <vector>
 
+#include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h"
 #include "paddle/phi/common/data_type.h"
 #include "paddle/phi/common/place.h"
 #include "paddle/phi/core/distributed/auto_parallel/process_mesh.h"
@@ -31,5 +32,9 @@ pir::Value shard_tensor(const pir::Value& x,
 pir::Value reshard(const pir::Value& x,
                    const phi::distributed::ProcessMesh& process_mesh,
                    const std::vector<int64_t>& dims_mapping);
+
+pir::Value reshard(const pir::Value& x,
+                   const TensorDistAttribute& tensor_dist_attr);
+
 }  // namespace dialect
 }  // namespace paddle
diff --git a/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc b/paddle/fluid/pir/dialect/distributed/ir/dist_type.cc
@@ -43,11 +43,13 @@ common::DDim InferLocalDDim(const common::DDim& global_ddim,
                             TensorDistAttribute dist_attr) {
   auto& mesh_dim = dist_attr.process_mesh_attr().shape();
   auto& dim_mapping = dist_attr.dims_mapping();
-  PADDLE_ENFORCE_EQ(
-      global_ddim.size(),
-      dim_mapping.size(),
-      ::common::errors::PreconditionNotMet(
-          "The global ddim size must equal to dim_mapping's size!"));
+  PADDLE_ENFORCE_EQ(global_ddim.size(),
+                    dim_mapping.size(),
+                    ::common::errors::PreconditionNotMet(
+                        "The global ddim size must be equal to dim_mapping's "
+                        "size, but got %d vs %d",
+                        global_ddim.size(),
+                        dim_mapping.size()));
   common::DDim local_ddim(global_ddim);
   for (size_t i = 0; i < dim_mapping.size(); ++i) {
     if (dim_mapping[i] != -1) {
diff --git a/paddle/fluid/pybind/dist_api.cc b/paddle/fluid/pybind/dist_api.cc
@@ -15,6 +15,7 @@
 #include <Python.h>
 #include "pybind11/stl.h"
 
+#include "paddle/fluid/pir/dialect/distributed/ir/dist_api.h"
 #include "paddle/fluid/pir/dialect/distributed/ir/dist_attribute.h"
 #include "paddle/fluid/pybind/dist_api.h"
 #include "paddle/fluid/pybind/dist_static_op_function.h"
@@ -60,6 +61,10 @@ void BindTensorDistAttribute(py::module *m) {
              print_stream << self;
              return print_stream.str();
            })
+      .def("__eq__",
+           [](TensorDistAttribute &self, const TensorDistAttribute &other) {
+             return self == other;
+           })
       .def_property_readonly("process_mesh",
                              [](TensorDistAttribute &self) {
                                return self.process_mesh_attr().process_mesh();
@@ -86,12 +91,20 @@ void BindDistOpsAPI(pybind11::module *module) {
   }
 }
 
+void BindOpsFunction(py::module *m) {
+  // Python-visible `reshard_v2` forwards to the dist-attr `reshard` overload.
+  m->def("reshard_v2", [](const pir::Value &v, const TensorDistAttribute &a) {
+    return reshard(v, a);
+  });
+}
+
 void BindDistApi(pybind11::module *module) {
   auto ir_module = module->def_submodule("pir");
   BindOperationDistAttribute(&ir_module);
   BindTensorDistAttribute(&ir_module);
   auto ops_modules = ir_module.def_submodule("ops");
   BindDistOpsAPI(&ops_modules);
+  BindOpsFunction(&ops_modules);
 }
 
 }  // namespace pybind
diff --git a/paddle/fluid/pybind/dist_static_op_function.h b/paddle/fluid/pybind/dist_static_op_function.h
@@ -89,7 +89,6 @@ static PyMethodDef DistOpsAPI[] = {
      (PyCFunction)(void (*)(void))static_api_reshard,
      METH_VARARGS | METH_KEYWORDS,
      "C++ interface function for reshard."},
-
     {nullptr, nullptr, 0, nullptr}};
 
 }  // namespace pybind
diff --git a/paddle/phi/infermeta/spmd_rules/elementwise.cc b/paddle/phi/infermeta/spmd_rules/elementwise.cc
@@ -365,14 +365,17 @@ SpmdInfo ElementwiseBinaryInferSpmdReverse(const DistMetaTensor& x,
 
 SpmdInfo ElementwiseUnaryGradInferSpmd(const DistMetaTensor& x,
                                        const DistMetaTensor& out_grad) {
-  return {{out_grad.dist_attr(), out_grad.dist_attr()}, {out_grad.dist_attr()}};
+  auto dist_attr = CopyTensorDistAttrForOutput(out_grad.dist_attr());
+  dist_attr.set_dims_mapping(out_grad.dist_attr().dims_mapping());
+  return {{dist_attr, dist_attr}, {dist_attr}};
 }
 
 SpmdInfo ElementwiseUnaryGradInferSpmd(const DistMetaTensor& x,
                                        const DistMetaTensor& out,
                                        const DistMetaTensor& out_grad) {
-  return {{out_grad.dist_attr(), out_grad.dist_attr(), out_grad.dist_attr()},
-          {out_grad.dist_attr()}};
+  auto dist_attr = CopyTensorDistAttrForOutput(out_grad.dist_attr());
+  dist_attr.set_dims_mapping(out_grad.dist_attr().dims_mapping());
+  return {{dist_attr, dist_attr, dist_attr}, {dist_attr}};
 }
 
 bool DimsNotEqualOrHasBroadcastDim(const DistMetaTensor& x,
diff --git a/python/paddle/distributed/auto_parallel/static/engine.py b/python/paddle/distributed/auto_parallel/static/engine.py
@@ -54,6 +54,7 @@
 from .dist_saver import DistributedSaver
 from .helper import ProgramHelper
 from .parallelizer_v2 import Parallelizer
+from .pir_pass import apply_partition_pass
 from .planner_v2 import Planner
 from .process_group import get_all_process_groups, new_process_group
 
@@ -675,7 +676,7 @@ def _parallel_pir(self, mode):
         # TODO(JZ-LIANG) Step 3.1: Partition Pass
         #   insert reshard op if operand tensor's placements if different from what the cumsumer op need.
         #   Partition the computation graph into different pipeline stage if need.
-        # dist_program = apply_partition_pass(dist_program)
+        dist_program = apply_partition_pass(dist_program)
 
         # TODO(hitywt) Step 3.2: Reshard Pass
         #   resolute the reshard op into special collective operation.
diff --git a/python/paddle/distributed/auto_parallel/static/pir_pass.py b/python/paddle/distributed/auto_parallel/static/pir_pass.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+
+
+def apply_partition_pass(program):
+    """Clone `program`; add reshard ops where operand dist_attrs differ.
+
+    Only ops of the clone are visited and rewired; the clone is returned.
+    """
+    partitioned = program.clone()
+    with paddle.static.program_guard(partitioned):
+        for op in partitioned.global_block().ops:
+            pairs = zip(op.operands(), op.dist_attr().operand_dist_attrs())
+            for operand, wanted_attr in pairs:
+                value = operand.source()
+                if not value.is_dist_dense_tensor_type():
+                    continue
+                if value.dist_attr() != wanted_attr:
+                    # Set the insertion point at `op` before creating the op.
+                    paddle.pir.set_insertion_point(op)
+                    resharded = paddle._pir_ops.reshard_v2(value, wanted_attr)
+                    operand.set_source(resharded)
+    return partitioned
+
+
+def apply_reshard_pass(program):
+    """Placeholder with no effect yet: `program` is unused, returns None."""
diff --git a/test/auto_parallel/pir/test_to_static_pir_program.py b/test/auto_parallel/pir/test_to_static_pir_program.py
@@ -66,6 +66,7 @@ def __init__(self, mesh):
         )
 
     def forward(self, x):
+        x.stop_gradient = False
         out = self.relu_0(x)  # triggle backward partial allreduce
         out = self.linear_0(out)
         out = self.relu_1(out)
@@ -138,6 +139,8 @@ def test_to_static_program(self):
         backward_op_list = [
             "pd_op.sgd_",
             "pd_op.sgd_",
+            "pd_op.relu_grad",
+            "dist_op.reshard",
             "pd_op.matmul_grad",
             "pd_op.relu_grad",
             "pd_op.matmul_grad",
@@ -225,10 +228,10 @@ def test_to_static_program(self):
                         tensor._local_shape, [BATCH_SIZE, CLASS_NUM]
                     )
                 elif matmul_grad_idx == 1:
-                    self.assertEqual(tensor.dist_attr().dims_mapping, [-1, 0])
-                    self.assertEqual(tensor.dist_attr().partial_dims, set())
+                    self.assertEqual(tensor.dist_attr().dims_mapping, [-1, -1])
+                    self.assertEqual(tensor.dist_attr().partial_dims, {0})
                     self.assertEqual(
-                        tensor._local_shape, [BATCH_SIZE, IMAGE_SIZE // 2]
+                        tensor._local_shape, [BATCH_SIZE, IMAGE_SIZE]
                     )
                 matmul_grad_idx += 1
             if op.name() == 'pd_op.sgd_':