PaddlePaddle
diff --git a/‎paddle/fluid/operators/stack_op_npu.cc‎
Lines changed: 42 additions & 53 deletions b/‎paddle/fluid/operators/stack_op_npu.cc‎
Lines changed: 42 additions & 53 deletions
diff --git a/‎paddle/fluid/operators/unstack_op_npu.cc‎
Lines changed: 85 additions & 0 deletions b/‎paddle/fluid/operators/unstack_op_npu.cc‎
Lines changed: 85 additions & 0 deletions
@@ -12,15 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifdef PADDLE_WITH_ASCEND_CL
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/operators/activation_op.h"
-#include "paddle/fluid/operators/npu_op_runner.h"
 #include "paddle/fluid/operators/stack_op.h"
-#include "paddle/fluid/operators/unsqueeze_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
 
 namespace paddle {
 namespace operators {
@@ -32,64 +25,56 @@ class StackNPUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    // Stacks every tensor of input "X" along attribute "axis" into output "Y"
+    // by running the "Pack" NPU operator on the device stream.
     auto x = ctx.MultiInput<Tensor>("X");
-    int32_t N = x.size();
+    auto* y = ctx.Output<Tensor>("Y");
+    int num = static_cast<int>(x.size());
 
-    PADDLE_ENFORCE_GT(
-        N, 0, platform::errors::InvalidArgument("number of input Tensor <= 0"));
+    // Validate the input count first: x[0] is dereferenced below, which would
+    // be an out-of-bounds read on an empty input list.
+    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
+                                  "number of input Tensor <= 0"));
+
+    // A negative axis is shifted by (input rank + 1) so it becomes a
+    // non-negative index before being handed to Pack.
+    int axis = ctx.Attr<int>("axis");
+    if (axis < 0) axis += (x[0]->dims().size() + 1);
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
 
     std::vector<paddle::framework::Tensor> x_list;
-    for (int i = 0; i < N; i++) {
+    x_list.reserve(num);
+    for (int i = 0; i < num; i++) {
       x_list.push_back(*x[i]);
     }
+    // Allocate the output buffer on the current place before launching.
+    y->mutable_data<T>(ctx.GetPlace());
 
-    int axis = ctx.Attr<int>("axis");
+    const auto& runner =
+        NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}});
+    runner.Run(stream);
+  }
+};
 
-    if (axis < 0) {
-      axis = axis + x_list[0].dims().size() + 1;
-    }
-    auto* out = ctx.Output<Tensor>("Y");
+// NPU kernel for stack_grad: splits the gradient of "Y" along attribute "axis"
+// into the gradients of the "X" inputs by running the "Unpack" NPU operator.
+template <typename DeviceContext, typename T>
+class StackGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
+    auto dx = ctx.MultiOutput<Tensor>(framework::GradVarName("X"));
+    int axis = ctx.Attr<int>("axis");
+    // A negative axis counts from the end of dy's shape.
+    if (axis < 0) axis += dy->dims().size();
+    // Explicit cast: the extent is forwarded as the int attribute "num".
+    int num = static_cast<int>(dy->dims()[axis]);
 
-    auto place = ctx.GetPlace();
+    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
+                                  "number of input Tensor <= 0"));
+    // dx is indexed with every i in [0, num) below, so its length has to
+    // match the extent read from dy; otherwise the loop reads past its end.
+    PADDLE_ENFORCE_EQ(
+        static_cast<int>(dx.size()), num,
+        platform::errors::InvalidArgument(
+            "The number of X@GRAD outputs (%d) must equal the extent of "
+            "Y@GRAD along the stack axis (%d).",
+            static_cast<int>(dx.size()), num));
 
     auto stream =
         ctx.template device_context<paddle::platform::NPUDeviceContext>()
             .stream();
 
-    out->mutable_data<T>(place);
-
-    if (axis != 0) {
-      auto x_dim = x_list[0].dims();
-      std::vector<int> vec_dim_tmp;
-      vec_dim_tmp.push_back(N);
-      for (auto i = 0; i < x_dim.size(); ++i) {
-        vec_dim_tmp.push_back(x_dim[i]);
-      }
-
-      Tensor tmp_stack(out->type());
-      tmp_stack.Resize(framework::make_ddim(vec_dim_tmp));
-      tmp_stack.mutable_data<T>(ctx.GetPlace());
-
-      const auto& runner =
-          NpuOpRunner("Pack", {x_list}, {tmp_stack}, {{"axis", 0}, {"N", N}});
-      runner.Run(stream);
-
-      std::vector<int64_t> vec_trans;
-      for (auto i = 1; i <= x_dim.size(); ++i) {
-        vec_trans.push_back(i);
-        if (i == axis) {
-          vec_trans.push_back(0);
-        }
-      }
-
-      const auto& runner_trans_final =
-          NpuOpRunner("TransposeD", {tmp_stack}, {*out}, {{"perm", vec_trans}});
-      runner_trans_final.Run(stream);
-
-    } else {
-      const auto& runner =
-          NpuOpRunner("Pack", {x_list}, {*out}, {{"axis", axis}, {"N", N}});
-      runner.Run(stream);
+    // Allocate each output gradient, then collect them for the runner.
+    std::vector<paddle::framework::Tensor> dx_list;
+    dx_list.reserve(num);
+    for (int i = 0; i < num; i++) {
+      dx[i]->mutable_data<T>(ctx.GetPlace());
+      dx_list.push_back(*dx[i]);
     }
+
+    const auto& runner =
+        NpuOpRunner("Unpack", {*dy}, {dx_list}, {{"axis", axis}, {"num", num}});
+    runner.Run(stream);
   }
 };
 
@@ -103,4 +88,8 @@ REGISTER_OP_NPU_KERNEL(
     ops::StackNPUKernel<paddle::platform::NPUDeviceContext,
                         paddle::platform::float16>);
 
-#endif
+REGISTER_OP_NPU_KERNEL(  // registers the NPU kernels of the stack_grad op
+    stack_grad,  // one StackGradNPUKernel instantiation per element type:
+    ops::StackGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
+    ops::StackGradNPUKernel<paddle::platform::NPUDeviceContext,
+                            paddle::platform::float16>);
@@ -0,0 +1,85 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/unstack_op.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+// NPU kernel for unstack: splits input "X" along attribute "axis" into the
+// tensors of output "Y" by running the "Unpack" NPU operator.
+template <typename DeviceContext, typename T>
+class UnStackNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto *x = ctx.Input<Tensor>("X");
+    auto y = ctx.MultiOutput<Tensor>("Y");
+    int axis = ctx.Attr<int>("axis");
+    // A negative axis counts from the end of x's shape.
+    if (axis < 0) axis += x->dims().size();
+    // Explicit cast: the extent is forwarded as the int attribute "num".
+    int num = static_cast<int>(x->dims()[axis]);
+
+    // y is indexed with every i in [0, num) below, so its length has to match
+    // the extent read from x; otherwise the loop reads past its end.
+    PADDLE_ENFORCE_EQ(
+        static_cast<int>(y.size()), num,
+        platform::errors::InvalidArgument(
+            "The number of Y outputs (%d) must equal the extent of X along "
+            "the unstack axis (%d).",
+            static_cast<int>(y.size()), num));
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    // Allocate each output slice, then collect them for the runner.
+    std::vector<paddle::framework::Tensor> y_list;
+    y_list.reserve(y.size());
+    for (int i = 0; i < num; i++) {
+      y[i]->mutable_data<T>(ctx.GetPlace());
+      y_list.push_back(*y[i]);
+    }
+
+    const auto &runner =
+        NpuOpRunner("Unpack", {*x}, {y_list}, {{"axis", axis}, {"num", num}});
+    runner.Run(stream);
+  }
+};
+
+// NPU kernel for unstack_grad: stacks the gradients of the "Y" outputs along
+// attribute "axis" into the gradient of "X" by running the "Pack" NPU operator.
+template <typename DeviceContext, typename T>
+class UnStackGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    auto x = ctx.MultiInput<Tensor>(framework::GradVarName("Y"));
+    auto *y = ctx.Output<Tensor>(framework::GradVarName("X"));
+    int num = static_cast<int>(x.size());
+
+    // Reject an empty gradient list before x[0] is dereferenced below; the
+    // check and its message mirror the one in the stack forward kernel.
+    PADDLE_ENFORCE_GT(num, 0, platform::errors::InvalidArgument(
+                                  "number of input Tensor <= 0"));
+
+    // A negative axis is shifted by (input rank + 1) so it becomes a
+    // non-negative index before being handed to Pack.
+    int axis = ctx.Attr<int>("axis");
+    if (axis < 0) axis += (x[0]->dims().size() + 1);
+
+    auto stream =
+        ctx.template device_context<paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    std::vector<paddle::framework::Tensor> x_list;
+    x_list.reserve(num);
+    for (int i = 0; i < num; i++) {
+      x_list.push_back(*x[i]);
+    }
+    // Allocate the output buffer on the current place before launching.
+    y->mutable_data<T>(ctx.GetPlace());
+
+    const auto &runner =
+        NpuOpRunner("Pack", {x_list}, {*y}, {{"axis", axis}, {"N", num}});
+    runner.Run(stream);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace plat = paddle::platform;
+namespace ops = paddle::operators;
+// Register the NPU kernel of the unstack op for float and float16.
+REGISTER_OP_NPU_KERNEL(
+    unstack, ops::UnStackNPUKernel<plat::NPUDeviceContext, float>,
+    ops::UnStackNPUKernel<plat::NPUDeviceContext, plat::float16>);
+// Register the NPU kernel of the unstack_grad op for float and float16.
+REGISTER_OP_NPU_KERNEL(
+    unstack_grad, ops::UnStackGradNPUKernel<plat::NPUDeviceContext, float>,
+    ops::UnStackGradNPUKernel<plat::NPUDeviceContext, plat::float16>);