Skip to content

Commit 468ac69

Browse files
OleNet and oyjxer authored
[NPU] add npu kernel for mean Op (#31562)
* update mean op * update mean op * give a better test activation Co-authored-by: oyjxer <[email protected]>
1 parent 5118968 commit 468ac69

4 files changed

Lines changed: 419 additions & 0 deletions

File tree

paddle/fluid/operators/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -184,4 +184,6 @@ endif()
184184

185185
# Unit tests for operator kernels that only build with the Ascend NPU toolchain.
if(WITH_ASCEND_CL)
    cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor)
    cc_test(mean_op_npu_test SRCS mean_op_npu_test.cc DEPS op_registry mean_op scope device_context enforce executor)
endif()
189+
Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
Licensed under the Apache License, Version 2.0 (the "License");
3+
you may not use this file except in compliance with the License.
4+
You may obtain a copy of the License at
5+
http://www.apache.org/licenses/LICENSE-2.0
6+
Unless required by applicable law or agreed to in writing, software
7+
distributed under the License is distributed on an "AS IS" BASIS,
8+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
See the License for the specific language governing permissions and
10+
limitations under the License. */
11+
12+
#include "paddle/fluid/operators/mean_op.h"
13+
#include "paddle/fluid/platform/float16.h"
14+
#include "paddle/fluid/operators/npu_op_runner.h"
15+
16+
17+
namespace paddle {
18+
namespace operators {
19+
20+
template <typename DeviceContext, typename T>
21+
class MeanNPUKernel : public framework::OpKernel<T> {
22+
public:
23+
void Compute(const framework::ExecutionContext& ctx) const override {
24+
auto* x = ctx.Input<framework::LoDTensor>("X");
25+
auto* out = ctx.Output<framework::LoDTensor>("Out");
26+
27+
auto reduce_ndim = x->dims().size();
28+
std::vector<int> axes;
29+
for (auto i = 0; i < reduce_ndim; ++i) {
30+
axes.push_back(i);
31+
}
32+
33+
framework::NPUAttributeMap attr_input = {
34+
{"keep_dims", false},
35+
{"axes", axes}};
36+
37+
std::vector<int64_t> out_dims;
38+
out_dims.push_back(1);
39+
out->Resize(framework::make_ddim(out_dims));
40+
out->mutable_data<T>(ctx.GetPlace());
41+
42+
Tensor reduced_out(x->type());
43+
std::vector<int64_t> reduced_dout_dims;
44+
reduced_dout_dims.push_back(1);
45+
reduced_out.Resize(framework::make_ddim(reduced_dout_dims));
46+
reduced_out.mutable_data<T>(ctx.GetPlace());
47+
48+
auto runner = NpuOpRunner("ReduceMeanD",
49+
{*x},
50+
{*out},
51+
attr_input);
52+
53+
auto stream =
54+
ctx.template device_context<
55+
paddle::platform::NPUDeviceContext>()
56+
.stream();
57+
runner.Run(stream);
58+
}
59+
};
60+
61+
62+
template <typename DeviceContext, typename T>
63+
class MeanGradNPUKernel : public framework::OpKernel<T> {
64+
public:
65+
void Compute(const framework::ExecutionContext& context) const override {
66+
auto stream =
67+
context.template device_context<
68+
paddle::platform::NPUDeviceContext>()
69+
.stream();
70+
71+
auto grad = context.Input<Tensor>(framework::GradVarName("Out"));
72+
73+
PADDLE_ENFORCE_EQ(grad->numel(), 1,
74+
platform::errors::InvalidArgument(
75+
"Mean Gradient Input Tensor len should be 1. But "
76+
"received Out@Grad's elements num is %d.",
77+
grad->numel()));
78+
79+
auto IG = context.Output<Tensor>(framework::GradVarName("X"));
80+
IG->mutable_data<T>(context.GetPlace());
81+
82+
// ones
83+
Tensor ones(grad->type());
84+
std::vector<int64_t> dout_dims;
85+
for (auto i = 0; i < IG->dims().size(); ++i) {
86+
dout_dims.push_back(IG->dims()[i]);
87+
}
88+
ones.Resize(framework::make_ddim(dout_dims));
89+
ones.mutable_data<T>(context.GetPlace());
90+
auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
91+
runner_ones.Run(stream);
92+
93+
// means
94+
Tensor mean_tensor(grad->type());
95+
mean_tensor.Resize({1});
96+
mean_tensor.mutable_data<T>(context.GetPlace());
97+
std::vector<float> mean_vec;
98+
mean_vec.push_back(1.0/static_cast<float>(IG->numel()));
99+
framework::TensorFromVector(mean_vec,
100+
context.device_context(),
101+
&mean_tensor);
102+
103+
// means mul ones
104+
Tensor mean_ma(grad->type());
105+
mean_ma.Resize(framework::make_ddim(dout_dims));
106+
mean_ma.mutable_data<T>(context.GetPlace());
107+
auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
108+
runner_mul_1.Run(stream);
109+
110+
// and mul grad
111+
auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
112+
runner_mul_2.Run(stream);
113+
}
114+
};
115+
116+
117+
} // namespace operators
118+
} // namespace paddle
119+
120+
namespace ops = paddle::operators;
namespace plat = paddle::platform;

// Register the forward NPU mean kernel for each supported element type.
REGISTER_OP_NPU_KERNEL(
    mean,
    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, double>,
    ops::MeanNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)


// Register the backward (gradient) NPU kernel for the same type list.
REGISTER_OP_NPU_KERNEL(
    mean_grad,
    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, double>,
    ops::MeanGradNPUKernel<paddle::platform::NPUDeviceContext, plat::float16>)
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#ifndef _WIN32
16+
#include <unistd.h>
17+
#endif
18+
19+
#include <string>
20+
#include <thread> // NOLINT
21+
#include <vector>
22+
23+
#include "gtest/gtest.h"
24+
#include "paddle/fluid/framework/op_registry.h"
25+
#include "paddle/fluid/framework/operator.h"
26+
#include "paddle/fluid/framework/program_desc.h"
27+
#include "paddle/fluid/operators/dropout_op.h"
28+
#include "paddle/fluid/operators/math/math_function.h"
29+
#include "paddle/fluid/string/printf.h"
30+
31+
namespace f = paddle::framework;
32+
namespace p = paddle::platform;
33+
namespace m = paddle::operators::math;
34+
35+
USE_OP(mean);
36+
USE_OP_DEVICE_KERNEL(mean, NPU);
37+
USE_OP(mean_grad);
38+
USE_OP_DEVICE_KERNEL(mean_grad, NPU);
39+
40+
template <typename T>
41+
void Compare(f::Scope* scope, const p::DeviceContext& ctx,
42+
std::string op_type) {
43+
// init
44+
auto x = scope->Var("X");
45+
auto tensor_x = x->GetMutable<f::LoDTensor>();
46+
47+
std::vector<T> init;
48+
init.push_back(static_cast<T>(1.0));
49+
init.push_back(static_cast<T>(2.0));
50+
init.push_back(static_cast<T>(3.0));
51+
init.push_back(static_cast<T>(4.0));
52+
53+
TensorFromVector(init, ctx, tensor_x);
54+
tensor_x->Resize({4});
55+
56+
ctx.Wait();
57+
58+
auto place = ctx.GetPlace();
59+
auto out = scope->Var("Out");
60+
auto tensor_out = out->GetMutable<f::LoDTensor>();
61+
62+
auto op = f::OpRegistry::CreateOp(op_type,
63+
{{"X", {"X"}}},
64+
{{"Out", {"Out"}}},
65+
{});
66+
67+
op->Run(*scope, place);
68+
69+
std::vector<float> out_vec;
70+
TensorToVector(*tensor_out, ctx, &out_vec);
71+
72+
ctx.Wait();
73+
74+
EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1);
75+
EXPECT_EQ((float)out_vec[0], (float)2.5);
76+
}
77+
78+
template <typename T>
79+
void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
80+
std::string op_type) {
81+
// init
82+
auto dout = scope->Var("DOut");
83+
auto tensor_dout = dout->GetMutable<f::LoDTensor>();
84+
float dvalue = 2.0;
85+
tensor_dout->Resize({1});
86+
std::vector<T> init_dout;
87+
init_dout.push_back(static_cast<T>(dvalue));
88+
TensorFromVector(init_dout, ctx, tensor_dout);
89+
ctx.Wait();
90+
91+
auto x = scope->Var("X");
92+
auto tensor_x = x->GetMutable<f::LoDTensor>();
93+
tensor_x->Resize({4});
94+
95+
auto dx = scope->Var("DX");
96+
auto tensor_dx = dx->GetMutable<f::LoDTensor>();
97+
tensor_dx->Resize({4});
98+
99+
ctx.Wait();
100+
101+
auto op = f::OpRegistry::CreateOp(op_type,
102+
{{"Out@GRAD", {"DOut"}},
103+
{"X", {"X"}}},
104+
{{"X@GRAD", {"DX"}}},
105+
{});
106+
107+
auto place = ctx.GetPlace();
108+
op->Run(*scope, place);
109+
110+
std::vector<float> out_vec;
111+
TensorToVector(*tensor_dx, ctx, &out_vec);
112+
113+
ctx.Wait();
114+
115+
EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
116+
EXPECT_EQ((float)out_vec[0], (float)1.0/dvalue);
117+
EXPECT_EQ((float)out_vec[1], (float)1.0/dvalue);
118+
EXPECT_EQ((float)out_vec[2], (float)1.0/dvalue);
119+
EXPECT_EQ((float)out_vec[3], (float)1.0/dvalue);
120+
}
121+
122+
// Forward mean op on NPU device 0 with float32 input.
TEST(mean, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx, "mean");
}


// Gradient of mean on NPU device 0 with float32 input.
TEST(mean_grad, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  CompareGrad<float>(&scope, ctx, "mean_grad");
}

0 commit comments

Comments
 (0)