Skip to content

Commit 468ac69

Browse files
OleNet and oyjxer authored
[NPU] add npu kernel for mean Op (#31562)
* update mean op * update mean op * give a better test activation Co-authored-by: oyjxer <[email protected]>
1 parent 5118968 commit 468ac69

4 files changed

Lines changed: 419 additions & 0 deletions

File tree

paddle/fluid/operators/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,4 +184,6 @@ endif()
184184

185185
# Unit tests for operator kernels that only build with the Ascend NPU toolchain.
if(WITH_ASCEND_CL)
    cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor)
    cc_test(mean_op_npu_test SRCS mean_op_npu_test.cc DEPS op_registry mean_op scope device_context enforce executor)
endif()
189+
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
Licensed under the Apache License, Version 2.0 (the "License");
3+
you may not use this file except in compliance with the License.
4+
You may obtain a copy of the License at
5+
http://www.apache.org/licenses/LICENSE-2.0
6+
Unless required by applicable law or agreed to in writing, software
7+
distributed under the License is distributed on an "AS IS" BASIS,
8+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
See the License for the specific language governing permissions and
10+
limitations under the License. */
11+
12+
#include "paddle/fluid/operators/mean_op.h"
13+
#include "paddle/fluid/platform/float16.h"
14+
#include "paddle/fluid/operators/npu_op_runner.h"
15+
16+
17+
namespace paddle {
18+
namespace operators {
19+
20+
template <typename DeviceContext, typename T>
21+
class MeanNPUKernel : public framework::OpKernel<T> {
22+
public:
23+
void Compute(const framework::ExecutionContext& ctx) const override {
24+
auto* x = ctx.Input<framework::LoDTensor>("X");
25+
auto* out = ctx.Output<framework::LoDTensor>("Out");
26+
27+
auto reduce_ndim = x->dims().size();
28+
std::vector<int> axes;
29+
for (auto i = 0; i < reduce_ndim; ++i) {
30+
axes.push_back(i);
31+
}
32+
33+
framework::NPUAttributeMap attr_input = {
34+
{"keep_dims", false},
35+
{"axes", axes}};
36+
37+
std::vector<int64_t> out_dims;
38+
out_dims.push_back(1);
39+
out->Resize(framework::make_ddim(out_dims));
40+
out->mutable_data<T>(ctx.GetPlace());
41+
42+
Tensor reduced_out(x->type());
43+
std::vector<int64_t> reduced_dout_dims;
44+
reduced_dout_dims.push_back(1);
45+
reduced_out.Resize(framework::make_ddim(reduced_dout_dims));
46+
reduced_out.mutable_data<T>(ctx.GetPlace());
47+
48+
auto runner = NpuOpRunner("ReduceMeanD",
49+
{*x},
50+
{*out},
51+
attr_input);
52+
53+
auto stream =
54+
ctx.template device_context<
55+
paddle::platform::NPUDeviceContext>()
56+
.stream();
57+
runner.Run(stream);
58+
}
59+
};
60+
61+
62+
template <typename DeviceContext, typename T>
63+
class MeanGradNPUKernel : public framework::OpKernel<T> {
64+
public:
65+
void Compute(const framework::ExecutionContext& context) const override {
66+
auto stream =
67+
context.template device_context<
68+
paddle::platform::NPUDeviceContext>()
69+
.stream();
70+
71+
auto grad = context.Input<Tensor>(framework::GradVarName("Out"));
72+
73+
PADDLE_ENFORCE_EQ(grad->numel(), 1,
74+
platform::errors::InvalidArgument(
75+
"Mean Gradient Input Tensor len should be 1. But "
76+
"received Out@Grad's elements num is %d.",
77+
grad->numel()));
78+
79+
auto IG = context.Output<Tensor>(framework::GradVarName("X"));
80+
IG->mutable_data<T>(context.GetPlace());
81+
82+
// ones
83+
Tensor ones(grad->type());
84+
std::vector<int64_t> dout_dims;
85+
for (auto i = 0; i < IG->dims().size(); ++i) {
86+
dout_dims.push_back(IG->dims()[i]);
87+
}
88+
ones.Resize(framework::make_ddim(dout_dims));
89+
ones.mutable_data<T>(context.GetPlace());
90+
auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
91+
runner_ones.Run(stream);
92+
93+
// means
94+
Tensor mean_tensor(grad->type());
95+
mean_tensor.Resize({1});
96+
mean_tensor.mutable_data<T>(context.GetPlace());
97+
std::vector<float> mean_vec;
98+
mean_vec.push_back(1.0/static_cast<float>(IG->numel()));
99+
framework::TensorFromVector(mean_vec,
100+
context.device_context(),
101+
&mean_tensor);
102+
103+
// means mul ones
104+
Tensor mean_ma(grad->type());
105+
mean_ma.Resize(framework::make_ddim(dout_dims));
106+
mean_ma.mutable_data<T>(context.GetPlace());
107+
auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
108+
runner_mul_1.Run(stream);
109+
110+
// and mul grad
111+
auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
112+
runner_mul_2.Run(stream);
113+
}
114+
};
115+
116+
117+
} // namespace operators
118+
} // namespace paddle
119+
120+
namespace ops = paddle::operators;
namespace plat = paddle::platform;

// Register the forward NPU mean kernel for each supported element type.
REGISTER_OP_NPU_KERNEL(
    mean,
    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, double>,
    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)


// Register the backward (gradient) NPU kernel for the same type list.
REGISTER_OP_NPU_KERNEL(
    mean_grad,
    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, double>,
    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#ifndef _WIN32
16+
#include <unistd.h>
17+
#endif
18+
19+
#include <string>
20+
#include <thread> // NOLINT
21+
#include <vector>
22+
23+
#include "gtest/gtest.h"
24+
#include "paddle/fluid/framework/op_registry.h"
25+
#include "paddle/fluid/framework/operator.h"
26+
#include "paddle/fluid/framework/program_desc.h"
27+
#include "paddle/fluid/operators/dropout_op.h"
28+
#include "paddle/fluid/operators/math/math_function.h"
29+
#include "paddle/fluid/string/printf.h"
30+
31+
namespace f = paddle::framework;
32+
namespace p = paddle::platform;
33+
namespace m = paddle::operators::math;
34+
35+
USE_OP(mean);
36+
USE_OP_DEVICE_KERNEL(mean, NPU);
37+
USE_OP(mean_grad);
38+
USE_OP_DEVICE_KERNEL(mean_grad, NPU);
39+
40+
template <typename T>
41+
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
42+
std::string op_type) {
43+
// init
44+
auto x = scope->Var("X");
45+
auto tensor_x = x->GetMutable<f::LoDTensor>();
46+
47+
std::vector<T> init;
48+
init.push_back(static_cast<T>(1.0));
49+
init.push_back(static_cast<T>(2.0));
50+
init.push_back(static_cast<T>(3.0));
51+
init.push_back(static_cast<T>(4.0));
52+
53+
TensorFromVector(init, ctx, tensor_x);
54+
tensor_x->Resize({4});
55+
56+
ctx.Wait();
57+
58+
auto place = ctx.GetPlace();
59+
auto out = scope->Var("Out");
60+
auto tensor_out = out->GetMutable<f::LoDTensor>();
61+
62+
auto op = f::OpRegistry::CreateOp(op_type,
63+
{{"X", {"X"}}},
64+
{{"Out", {"Out"}}},
65+
{});
66+
67+
op->Run(*scope, place);
68+
69+
std::vector<float> out_vec;
70+
TensorToVector(*tensor_out, ctx, &out_vec);
71+
72+
ctx.Wait();
73+
74+
EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1);
75+
EXPECT_EQ((float)out_vec[0], (float)2.5);
76+
}
77+
78+
template <typename T>
79+
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
80+
std::string op_type) {
81+
// init
82+
auto dout = scope->Var("DOut");
83+
auto tensor_dout = dout->GetMutable<f::LoDTensor>();
84+
float dvalue = 2.0;
85+
tensor_dout->Resize({1});
86+
std::vector<T> init_dout;
87+
init_dout.push_back(static_cast<T>(dvalue));
88+
TensorFromVector(init_dout, ctx, tensor_dout);
89+
ctx.Wait();
90+
91+
auto x = scope->Var("X");
92+
auto tensor_x = x->GetMutable<f::LoDTensor>();
93+
tensor_x->Resize({4});
94+
95+
auto dx = scope->Var("DX");
96+
auto tensor_dx = dx->GetMutable<f::LoDTensor>();
97+
tensor_dx->Resize({4});
98+
99+
ctx.Wait();
100+
101+
auto op = f::OpRegistry::CreateOp(op_type,
102+
{{"Out@GRAD", {"DOut"}},
103+
{"X", {"X"}}},
104+
{{"X@GRAD", {"DX"}}},
105+
{});
106+
107+
auto place = ctx.GetPlace();
108+
op->Run(*scope, place);
109+
110+
std::vector<float> out_vec;
111+
TensorToVector(*tensor_dx, ctx, &out_vec);
112+
113+
ctx.Wait();
114+
115+
EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
116+
EXPECT_EQ((float)out_vec[0], (float)1.0/dvalue);
117+
EXPECT_EQ((float)out_vec[1], (float)1.0/dvalue);
118+
EXPECT_EQ((float)out_vec[2], (float)1.0/dvalue);
119+
EXPECT_EQ((float)out_vec[3], (float)1.0/dvalue);
120+
}
121+
122+
// Forward mean op on NPU device 0 with float32 input.
TEST(mean, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "mean");
}


// Gradient of mean on NPU device 0 with float32 input.
TEST(mean_grad, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "mean_grad");
}

0 commit comments

Comments
 (0)