Skip to content

Commit 45af4f2

Browse files
[NPU] add elementwise_min_grad_op_npu,test=develop (#34731)
1 parent addd5fc commit 45af4f2

File tree

2 files changed

+265
-43
lines changed

2 files changed

+265
-43
lines changed

paddle/fluid/operators/elementwise/elementwise_min_op_npu.cc

Lines changed: 173 additions & 3 deletions
Original file line numberDiff line numberDiff line change
limitations under the License. */

#include <memory>
#include <string>

#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/elementwise/elementwise_min_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_npu.h"
#include "paddle/fluid/operators/npu_op_runner.h"
#include "paddle/fluid/platform/enforce.h"

2123
namespace paddle {
@@ -27,31 +29,199 @@ template <typename DeviceContext, typename T>
2729
class ElementwiseMinNPUKernel : public framework::OpKernel<T> {
2830
public:
2931
void Compute(const framework::ExecutionContext& ctx) const override {
32+
auto& dev_ctx =
33+
ctx.template device_context<paddle::platform::NPUDeviceContext>();
3034
auto* x = ctx.Input<Tensor>("X");
3135
auto* y = ctx.Input<Tensor>("Y");
3236

3337
auto* out = ctx.Output<Tensor>("Out");
34-
3538
auto place = ctx.GetPlace();
3639

3740
out->mutable_data<T>(place);
3841

42+
int axis = ctx.Attr<int>("axis");
43+
bool direct_compute = false;
44+
auto x_dims = x->dims();
45+
auto y_dims = y->dims();
46+
axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis);
47+
if (x_dims.size() >= y_dims.size()) {
48+
direct_compute =
49+
y_dims == framework::slice_ddim(x_dims, axis, x_dims.size());
50+
} else {
51+
direct_compute =
52+
x_dims == framework::slice_ddim(y_dims, axis, y_dims.size());
53+
}
54+
Tensor transformed_x, transformed_y;
55+
if (direct_compute) {
56+
transformed_x.ShareDataWith(*x);
57+
transformed_y.ShareDataWith(*y);
58+
} else {
59+
NpuElementWiseOpBroadcast<T>(dev_ctx, x, y, axis, &transformed_x,
60+
&transformed_y);
61+
}
62+
const auto& runner =
63+
NpuOpRunner("Minimum", {transformed_x, transformed_y}, {*out}, {});
3964
auto stream =
4065
ctx.template device_context<paddle::platform::NPUDeviceContext>()
4166
.stream();
42-
43-
const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*out}, {});
4467
runner.Run(stream);
4568
}
4669
};
4770

71+
template <typename DeviceContext, typename T>
72+
class ElementwiseMinGradNPUKernel : public framework::OpKernel<T> {
73+
public:
74+
void Compute(const framework::ExecutionContext& ctx) const override {
75+
auto& dev_ctx =
76+
ctx.template device_context<paddle::platform::NPUDeviceContext>();
77+
auto* x = ctx.Input<Tensor>("X");
78+
auto* y = ctx.Input<Tensor>("Y");
79+
auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
80+
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
81+
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
82+
int axis = ctx.Attr<int>("axis");
83+
axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis);
84+
auto stream = dev_ctx.stream();
85+
if (dx && dy) {
86+
// dx
87+
dx->mutable_data<T>(ctx.GetPlace());
88+
Tensor tmp_x;
89+
tmp_x.ShareDataWith(*dx);
90+
if (dx->dims() != dout->dims()) {
91+
std::vector<int> dst_dims_vec_x;
92+
std::vector<int> reduce_axes_x;
93+
auto src_dims_x = dx->dims();
94+
auto dout_dims = dout->dims();
95+
96+
int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0);
97+
for (int ax = 0; ax < dout_dims.size(); ++ax) {
98+
if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) ||
99+
(dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) {
100+
reduce_axes_x.push_back(ax);
101+
} else {
102+
dst_dims_vec_x.push_back(dout_dims[ax]);
103+
}
104+
}
105+
if (!reduce_axes_x.empty()) {
106+
tmp_x.Resize(framework::make_ddim(dst_dims_vec_x));
107+
}
108+
}
109+
// dy
110+
dy->mutable_data<T>(ctx.GetPlace());
111+
Tensor tmp_y;
112+
tmp_y.ShareDataWith(*dy);
113+
if (dy->dims() != dout->dims()) {
114+
std::vector<int> dst_dims_vec_y;
115+
std::vector<int> reduce_axes_y;
116+
auto src_dims_y = dy->dims();
117+
auto dout_dims = dout->dims();
118+
119+
int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0);
120+
for (int ax = 0; ax < dout_dims.size(); ++ax) {
121+
if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) ||
122+
(dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) {
123+
reduce_axes_y.push_back(ax);
124+
} else {
125+
dst_dims_vec_y.push_back(dout_dims[ax]);
126+
}
127+
}
128+
if (!reduce_axes_y.empty()) {
129+
tmp_y.Resize(framework::make_ddim(dst_dims_vec_y));
130+
}
131+
}
132+
133+
const auto& runner =
134+
NpuOpRunner("MinimumGrad", {*dout, *x, *y}, {tmp_x, tmp_y},
135+
{{"grad_x", true}, {"grad_y", true}});
136+
runner.Run(stream);
137+
138+
} else if (dx) {
139+
Tensor zero_tensor(dout->type());
140+
zero_tensor.mutable_data<T>(y->dims(), ctx.GetPlace());
141+
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
142+
// dx
143+
dx->mutable_data<T>(ctx.GetPlace());
144+
Tensor tmp_x;
145+
tmp_x.ShareDataWith(*dx);
146+
if (dx->dims() != dout->dims()) {
147+
std::vector<int> dst_dims_vec_x;
148+
std::vector<int> reduce_axes_x;
149+
auto src_dims_x = dx->dims();
150+
auto dout_dims = dout->dims();
151+
152+
int src_axis_x = (src_dims_x.size() < dout_dims.size() ? axis : 0);
153+
for (int ax = 0; ax < dout_dims.size(); ++ax) {
154+
if ((ax < src_axis_x || ax >= src_axis_x + src_dims_x.size()) ||
155+
(dout_dims[ax] > 1 && src_dims_x[ax - src_axis_x] == 1)) {
156+
reduce_axes_x.push_back(ax);
157+
} else {
158+
dst_dims_vec_x.push_back(dout_dims[ax]);
159+
}
160+
}
161+
if (!reduce_axes_x.empty()) {
162+
tmp_x.Resize(framework::make_ddim(dst_dims_vec_x));
163+
}
164+
}
165+
166+
const auto& runner =
167+
NpuOpRunner("MinimumGrad", {*dout, *x, *y}, {tmp_x, zero_tensor},
168+
{{"grad_x", true}, {"grad_y", true}});
169+
runner.Run(stream);
170+
171+
} else if (dy) {
172+
Tensor zero_tensor(dout->type());
173+
zero_tensor.mutable_data<T>(x->dims(), ctx.GetPlace());
174+
FillNpuTensorWithConstant<T>(&zero_tensor, static_cast<T>(0));
175+
176+
// dy
177+
dy->mutable_data<T>(ctx.GetPlace());
178+
Tensor tmp_y;
179+
tmp_y.ShareDataWith(*dy);
180+
if (dy->dims() != dout->dims()) {
181+
std::vector<int> dst_dims_vec_y;
182+
std::vector<int> reduce_axes_y;
183+
auto src_dims_y = dy->dims();
184+
auto dout_dims = dout->dims();
185+
186+
int src_axis_y = (src_dims_y.size() < dout_dims.size() ? axis : 0);
187+
for (int ax = 0; ax < dout_dims.size(); ++ax) {
188+
if ((ax < src_axis_y || ax >= src_axis_y + src_dims_y.size()) ||
189+
(dout_dims[ax] > 1 && src_dims_y[ax - src_axis_y] == 1)) {
190+
reduce_axes_y.push_back(ax);
191+
} else {
192+
dst_dims_vec_y.push_back(dout_dims[ax]);
193+
}
194+
}
195+
if (!reduce_axes_y.empty()) {
196+
tmp_y.Resize(framework::make_ddim(dst_dims_vec_y));
197+
}
198+
}
199+
200+
const auto& runner =
201+
NpuOpRunner("MinimumGrad", {*dout, *x, *y}, {zero_tensor, tmp_y},
202+
{{"grad_x", true}, {"grad_y", true}});
203+
runner.Run(stream);
204+
205+
} else {
206+
std::cout << "error" << std::endl;
207+
}
208+
}
209+
};
210+
48211
} // namespace operators
49212
} // namespace paddle
50213

51214
namespace ops = paddle::operators;
215+
namespace plat = paddle::platform;
52216

53217
REGISTER_OP_NPU_KERNEL(
54218
elementwise_min,
55219
ops::ElementwiseMinNPUKernel<paddle::platform::NPUDeviceContext, float>,
56220
ops::ElementwiseMinNPUKernel<paddle::platform::NPUDeviceContext,
57221
paddle::platform::float16>);
222+
223+
REGISTER_OP_NPU_KERNEL(
224+
elementwise_min_grad,
225+
ops::ElementwiseMinGradNPUKernel<paddle::platform::NPUDeviceContext, float>,
226+
ops::ElementwiseMinGradNPUKernel<paddle::platform::NPUDeviceContext,
227+
paddle::platform::float16>);

python/paddle/fluid/tests/unittests/npu/test_elementwise_min_op_npu.py

Lines changed: 92 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -18,81 +18,133 @@
1818
import unittest
1919
import sys
2020
sys.path.append("..")
21-
from op_test import OpTest
21+
from op_test import OpTest, skip_check_grad_ci
2222
import paddle
2323
import paddle.fluid as fluid
24+
from paddle.fluid import Program, program_guard
25+
import paddle.fluid.core as core
2426

2527
paddle.enable_static()
2628
SEED = 2021
2729

2830

29-
class TestElementwiseMin(OpTest):
31+
class TestElementwiseMinOp(OpTest):
    """Forward and backward checks for the NPU elementwise_min kernel."""

    def setUp(self):
        self.set_npu()
        self.op_type = "elementwise_min"
        self.place = paddle.NPUPlace(0)
        self.init_dtype()
        # Seed the RNG so the randomly generated inputs (and therefore the
        # numeric gradient checks) are reproducible across runs.
        np.random.seed(SEED)
        self.init_input_output()
        self.inputs = {
            'X': OpTest.np_dtype_to_fluid_dtype(self.x),
            'Y': OpTest.np_dtype_to_fluid_dtype(self.y)
        }
        self.outputs = {'Out': self.out}
        self.attrs = {'axis': self.axis}

    def set_npu(self):
        self.__class__.use_npu = True

    def init_input_output(self):
        # If x and y have the same value, min() is not differentiable.
        # So generate y at least 0.1 away from x (in either direction)
        # to keep the elements from being too close to each other.
        self.x = np.random.uniform(0.1, 1, [13, 17]).astype(self.dtype)
        self.sgn = np.random.choice([-1, 1], [13, 17]).astype(self.dtype)
        self.y = self.x + self.sgn * np.random.uniform(
            0.1, 1, [13, 17]).astype(self.dtype)
        self.out = np.minimum(self.x, self.y)
        self.axis = -1

    def init_dtype(self):
        self.dtype = np.float32

    def test_check_output(self):
        self.check_output_with_place(self.place)

    def test_check_grad_normal(self):
        # float16 is too imprecise for the numeric gradient check.
        if self.dtype == np.float16:
            return

        self.check_grad_with_place(
            self.place,
            ['X', 'Y'],
            'Out', )

    def test_check_grad_ingore_x(self):
        if self.dtype == np.float16:
            return

        self.check_grad_with_place(
            self.place,
            ['Y'],
            'Out',
            no_grad_set=set("X"), )

    def test_check_grad_ingore_y(self):
        if self.dtype == np.float16:
            return

        self.check_grad_with_place(
            self.place,
            ['X'],
            'Out',
            no_grad_set=set("Y"), )
8794

95+
class TestElementwiseMinOpFp16(TestElementwiseMinOp):
    """Base case with float16 inputs; grad checks skip themselves for fp16."""

    def init_dtype(self):
        self.dtype = np.float16
99+
100+
class TestElementwiseMinOp_Vector(TestElementwiseMinOp):
    """Same checks as the base case, but on 1-D (length-100) inputs."""

    def init_input_output(self):
        shape = (100, )
        self.x = np.random.uniform(1, 2, shape).astype(self.dtype)
        self.sgn = np.random.choice([-1, 1], shape).astype(self.dtype)
        # Keep y away from x so min() stays differentiable element-wise.
        offset = np.random.uniform(0.1, 1, shape).astype(self.dtype)
        self.y = self.x + self.sgn * offset
        self.out = np.minimum(self.x, self.y)
        self.axis = -1
108+
109+
110+
class TestElementwiseMinOpFp16_Vector(TestElementwiseMinOp_Vector):
    """float16 variant of the 1-D vector case."""

    def init_dtype(self):
        self.dtype = np.float16
113+
114+
115+
@skip_check_grad_ci(
    reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseMinOp_scalar(TestElementwiseMinOp):
    """Broadcast a single-element y against a 3-D x."""

    def init_input_output(self):
        # np.random.random_integers is deprecated (removed in NumPy >= 1.25).
        # randint's upper bound is exclusive, so use 6 to keep the original
        # inclusive range [-5, 5].
        self.x = np.random.randint(-5, 6, [10, 3, 4]).astype(self.dtype)
        self.y = np.array([0.5]).astype(self.dtype)
        self.out = np.minimum(self.x, self.y)
        self.axis = -1
123+
124+
125+
@skip_check_grad_ci(
    reason="[skip shape check] Use y_shape(1) to test broadcast.")
class TestElementwiseMinOpFp16_scalar(TestElementwiseMinOp_scalar):
    """float16 variant of the scalar-broadcast case."""

    def init_dtype(self):
        self.dtype = np.float16
130+
131+
132+
class TestElementwiseMinOp_broadcast(TestElementwiseMinOp):
    """Broadcast case: y of shape (100,) against x of shape (2, 3, 100)."""

    def init_input_output(self):
        self.x = np.random.uniform(0.5, 1, (2, 3, 100)).astype(self.dtype)
        self.sgn = np.random.choice([-1, 1], (100, )).astype(self.dtype)
        # Derive y from one row of x, pushed at least 1.0 away so the min
        # is differentiable everywhere.
        offset = np.random.uniform(1, 2, (100, )).astype(self.dtype)
        self.y = self.x[0, 0, :] + self.sgn * offset
        self.out = np.minimum(self.x, self.y.reshape(1, 1, 100))
        self.axis = -1
140+
141+
142+
class TestElementwiseMinOpFp16_broadcast(TestElementwiseMinOp_broadcast):
    """float16 variant of the broadcast case."""

    def init_dtype(self):
        self.dtype = np.float16
93145

94146

95-
class TestElementwiseMinNet(unittest.TestCase):
147+
class TestElementwiseMinOpNet(unittest.TestCase):
96148
def _test(self, run_npu=True):
97149
main_prog = paddle.static.Program()
98150
startup_prog = paddle.static.Program()

0 commit comments

Comments
 (0)