@@ -18,6 +18,33 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
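+// Resolve the target shape for expand_v2 from the highest-priority source
+// that is present: the "Shape" input tensor, then the "expand_shapes_tensor"
+// list of scalar tensors, and finally the "shape" attribute.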
+inline std::vector<int> get_expand_shape_npu(
+    const framework::ExecutionContext& ctx) {
+  std::vector<int> vec_expand_shape;
+  auto list_expand_shapes_tensor =
+      ctx.MultiInput<framework::Tensor>("expand_shapes_tensor");
+  if (ctx.HasInput("Shape")) {
+    auto* shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
+    std::vector<int> out_data;
+    TensorToVector(*shape_tensor, ctx.device_context(), &out_data);
+    for (int i = 0; i < static_cast<int>(out_data.size()); ++i) {
+      vec_expand_shape.push_back(out_data[i]);
+    }
+    return vec_expand_shape;
+  } else if (list_expand_shapes_tensor.size() > 0) {
+    // gather one value from each scalar tensor in the list
+    for (size_t i = 0; i < list_expand_shapes_tensor.size(); ++i) {
+      auto tensor = list_expand_shapes_tensor[i];
+      std::vector<int> out_data;
+      TensorToVector(*tensor, ctx.device_context(), &out_data);
+      vec_expand_shape.push_back(out_data[0]);
+    }
+    return vec_expand_shape;
+  } else {
+    return ctx.Attr<std::vector<int>>("shape");
+  }
+}
+
 using Tensor = framework::Tensor;
 template <typename DeviceContext, typename T>
 class ExpandV2NPUKernel : public framework::OpKernel<T> {
@@ -26,27 +53,7 @@ class ExpandV2NPUKernel : public framework::OpKernel<T> {
     auto* X = ctx.Input<framework::Tensor>("X");
     auto* Out = ctx.Output<framework::Tensor>("Out");
 
-    std::vector<int> expand_shape;
-    auto list_expand_shapes_tensor =
-        ctx.MultiInput<framework::Tensor>("expand_shapes_tensor");
-    if (ctx.HasInput("Shape")) {
-      auto* shape_tensor = ctx.Input<framework::LoDTensor>("Shape");
-      std::vector<int> out_data;
-      TensorToVector(*shape_tensor, ctx.device_context(), &out_data);
-      for (int i = 0; i < static_cast<int>(out_data.size()); ++i) {
-        expand_shape.push_back(out_data[i]);
-      }
-    } else if (list_expand_shapes_tensor.size() > 0) {
-      // get tensor from
-      for (size_t i = 0; i < list_expand_shapes_tensor.size(); ++i) {
-        auto tensor = list_expand_shapes_tensor[i];
-        std::vector<int> out_data;
-        TensorToVector(*tensor, ctx.device_context(), &out_data);
-        expand_shape.push_back(out_data[0]);
-      }
-    } else {
-      expand_shape = ctx.Attr<std::vector<int>>("shape");
-    }
+    std::vector<int> expand_shape = get_expand_shape_npu(ctx);
 
     framework::NPUAttributeMap attr_input = {{"shape", expand_shape}};
 
@@ -97,6 +104,62 @@ class ExpandV2NPUKernel : public framework::OpKernel<T> {
     runner.Run(stream);
   }
 };
+
+template <typename DeviceContext, typename T>
+class ExpandV2NPUGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
+    dx->mutable_data<T>(ctx.GetPlace());
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
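+    // Worked example of the reductions below: expanding X [5] -> Out [2, 5]
+    // yields dout [2, 5]; case 1 sums away the leading axis 0 to recover
+    // dx [5]. Expanding X [1, 5] -> Out [4, 5] keeps the rank, so case 2
+    // instead sums axis 0 with keep_dims=true to produce dx [1, 5].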
+    // case 1: reduce dout dims to dx dims
+    // For example: [2, 120] --> [120]
+    auto reduce_ndim = dout->dims().size() - dx->dims().size();
+    std::vector<int> axes;
+    for (auto i = 0; i < reduce_ndim; ++i) {
+      axes.push_back(i);
+    }
+    Tensor* tmp_dout = const_cast<Tensor*>(dout);
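+    // tmp_dout aliases dout until case 1 replaces it with the rank-reduced
+    // copy; reduced_dout only allocates storage when that reduction runs.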
+    Tensor reduced_dout(dx->type());
+    if (axes.size() != 0) {
+      std::vector<int64_t> reduced_dout_dims;
+      for (auto i = reduce_ndim; i < dout->dims().size(); ++i) {
+        reduced_dout_dims.push_back(dout->dims()[i]);
+      }
+      reduced_dout.Resize(framework::make_ddim(reduced_dout_dims));
+      reduced_dout.mutable_data<T>(ctx.GetPlace());
+      const auto& runner = NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout},
+                                       {{"axes", axes}, {"keep_dims", false}});
+      runner.Run(stream);
+      tmp_dout = &reduced_dout;
+    }
+
+    // case 2: reduce the axes of dout where the corresponding dx dim is 1
+    // For example: [12, 140] --> [1, 140]
+
+    // case 3: copy dout to dx when the shapes match and no dx dim is 1
+    // For example: [2, 10, 5] --> [2, 10, 5]
+    axes.clear();
+    for (auto i = 0; i < dx->dims().size(); ++i) {
+      if (dx->dims()[i] == 1) {
+        axes.push_back(i);
+      }
+    }
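+    // Sum over the size-1 axes with keep_dims=true so dx keeps its rank
+    // (case 2); if there are none, the shapes already match and a plain
+    // copy suffices (case 3).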
+    if (axes.size() != 0) {
+      const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx},
+                                       {{"axes", axes}, {"keep_dims", true}});
+      runner.Run(stream);
+    } else {
+      framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx);
+    }
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
@@ -107,3 +170,8 @@ REGISTER_OP_NPU_KERNEL(
     ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext,
                            paddle::platform::float16>,
     ops::ExpandV2NPUKernel<paddle::platform::NPUDeviceContext, int>);
+
+REGISTER_OP_NPU_KERNEL(
+    expand_v2_grad,
+    ops::ExpandV2NPUGradKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::ExpandV2NPUGradKernel<paddle::platform::NPUDeviceContext, int>);