
Commit e536ad9

Author: shenyijun01 (committed)

[XPU] Support several ops on precision of fp16.

1 parent 40e0dc3 · commit e536ad9

14 files changed: 460 additions & 26 deletions

lite/kernels/x86/calib_compute.cc

Lines changed: 75 additions & 0 deletions
@@ -34,6 +34,26 @@ void CalibComputeFp32ToInt8<Ptype, DLType>::Run() {
       din, dout, scale.data(), 1, 1, param.input->numel());
 }
 
+template <PrecisionType Ptype, DataLayoutType DLType>
+void CalibComputeFp32ToFp16<Ptype, DLType>::Run() {
+  auto& param = this->template Param<operators::CalibParam>();
+  const auto* din = param.input->template data<float>();
+  auto* dout = param.output->template mutable_data<float16>();
+  for (auto i = 0; i < param.input->numel(); ++i) {
+    dout[i] = static_cast<float16>(din[i]);
+  }
+}
+
+template <PrecisionType Ptype, DataLayoutType DLType>
+void CalibComputeFp16ToFp32<Ptype, DLType>::Run() {
+  auto& param = this->template Param<operators::CalibParam>();
+  const auto* din = param.input->template data<float16>();
+  auto* dout = param.output->template mutable_data<float>();
+  for (auto i = 0; i < param.input->numel(); ++i) {
+    dout[i] = static_cast<float>(din[i]);
+  }
+}
+
 template <PrecisionType Ptype, DataLayoutType DLType>
 void CalibComputeInt64ToInt32<Ptype, DLType>::Run() {
   auto& param = this->template Param<operators::CalibParam>();
@@ -84,6 +104,26 @@ void CalibComputeFp32ToInt32<Ptype, DLType>::Run() {
   }
 }
 
+template <PrecisionType Ptype, DataLayoutType DLType>
+void CalibComputeInt32ToFp16<Ptype, DLType>::Run() {
+  auto& param = this->template Param<operators::CalibParam>();
+  const auto* din = param.input->template data<int32_t>();
+  auto* dout = param.output->template mutable_data<float16>();
+  for (auto i = 0; i < param.input->numel(); ++i) {
+    dout[i] = static_cast<float16>(din[i]);
+  }
+}
+
+template <PrecisionType Ptype, DataLayoutType DLType>
+void CalibComputeFp16ToInt32<Ptype, DLType>::Run() {
+  auto& param = this->template Param<operators::CalibParam>();
+  const auto* din = param.input->template data<float16>();
+  auto* dout = param.output->template mutable_data<int32_t>();
+  for (auto i = 0; i < param.input->numel(); ++i) {
+    dout[i] = static_cast<int32_t>(din[i]);
+  }
+}
+
 template <PrecisionType Ptype, DataLayoutType DLType>
 void CalibComputeInt64ToFp32<Ptype, DLType>::Run() {
   auto& param = this->template Param<operators::CalibParam>();
@@ -171,6 +211,23 @@ REGISTER_LITE_KERNEL(
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
     .Finalize();
 
+typedef paddle::lite::kernels::x86::CalibComputeFp32ToFp16<PRECISION(kFloat),
+                                                           DATALAYOUT(kNCHW)>
+    fp_fp32_to_fp16;
+REGISTER_LITE_KERNEL(calib, kX86, kFloat, kNCHW, fp_fp32_to_fp16, fp32_to_fp16)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
+    .Finalize();
+
+typedef paddle::lite::kernels::x86::CalibComputeFp16ToFp32<PRECISION(kFP16),
+                                                           DATALAYOUT(kNCHW)>
+    fp16_fp16_to_fp32;
+REGISTER_LITE_KERNEL(calib, kX86, kFP16, kNCHW, fp16_fp16_to_fp32, fp16_to_fp32)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(
     calib_once, kX86, kInt8, kNCHW, i8_fp32_to_int8, fp32_to_int8)
     .BindInput("Input",
@@ -223,3 +280,21 @@ REGISTER_LITE_KERNEL(
                {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt64))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFloat))})
     .Finalize();
+
+typedef paddle::lite::kernels::x86::CalibComputeInt32ToFp16<PRECISION(kFloat),
+                                                            DATALAYOUT(kNCHW)>
+    fp_int32_to_fp16;
+REGISTER_LITE_KERNEL(
+    calib, kX86, kFloat, kNCHW, fp_int32_to_fp16, int32_to_fp16)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
+    .Finalize();
+typedef paddle::lite::kernels::x86::CalibComputeFp16ToInt32<PRECISION(kFloat),
+                                                            DATALAYOUT(kNCHW)>
+    fp_fp16_to_int32;
+REGISTER_LITE_KERNEL(
+    calib, kX86, kFloat, kNCHW, fp_fp16_to_int32, fp16_to_int32)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kX86), PRECISION(kInt32))})
+    .Finalize();

lite/kernels/x86/calib_compute.h

Lines changed: 48 additions & 0 deletions
@@ -21,6 +21,7 @@ namespace lite {
 namespace kernels {
 namespace x86 {
 
+typedef uint16_t float16;
 template <PrecisionType Ptype, DataLayoutType DLType>
 class CalibComputeFp32ToInt8 : public KernelLite<TARGET(kX86), Ptype, DLType> {
  public:
@@ -33,6 +34,30 @@ class CalibComputeFp32ToInt8 : public KernelLite<TARGET(kX86), Ptype, DLType> {
  private:
 };
 
+template <PrecisionType Ptype, DataLayoutType DLType>
+class CalibComputeFp32ToFp16 : public KernelLite<TARGET(kX86), Ptype, DLType> {
+ public:
+  using param_t = operators::CalibParam;
+
+  void Run() override;
+
+  ~CalibComputeFp32ToFp16() override{};
+
+ private:
+};
+
+template <PrecisionType Ptype, DataLayoutType DLType>
+class CalibComputeFp16ToFp32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
+ public:
+  using param_t = operators::CalibParam;
+
+  void Run() override;
+
+  ~CalibComputeFp16ToFp32() override{};
+
+ private:
+};
+
 template <PrecisionType Ptype, DataLayoutType DLType>
 class CalibComputeInt64ToInt32
     : public KernelLite<TARGET(kX86), Ptype, DLType> {
@@ -107,6 +132,29 @@ class CalibComputeInt64ToFp32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
  private:
 };
 
+template <PrecisionType Ptype, DataLayoutType DLType>
+class CalibComputeInt32ToFp16 : public KernelLite<TARGET(kX86), Ptype, DLType> {
+ public:
+  using param_t = operators::CalibParam;
+
+  void Run() override;
+
+  ~CalibComputeInt32ToFp16() override{};
+
+ private:
+};
+template <PrecisionType Ptype, DataLayoutType DLType>
+class CalibComputeFp16ToInt32 : public KernelLite<TARGET(kX86), Ptype, DLType> {
+ public:
+  using param_t = operators::CalibParam;
+
+  void Run() override;
+
+  ~CalibComputeFp16ToInt32() override{};
+
+ private:
+};
+
 }  // namespace x86
 }  // namespace kernels
 }  // namespace lite
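
A note on the x86 kernels above: calib_compute.h defines float16 as a plain typedef for uint16_t, so static_cast<float16>(din[i]) in calib_compute.cc converts the float's numeric value to an integer rather than re-encoding its bits as an IEEE-754 half-precision pattern. For contrast, a bit-level fp32→fp16 conversion looks roughly like the sketch below. This is illustrative only: fp16_from_fp32 is a hypothetical helper, not part of this commit, and it truncates the mantissa and flushes subnormals instead of rounding.

#include <cstdint>
#include <cstring>

// Hypothetical helper (not in this commit): re-encode an IEEE-754 float
// as a half-precision bit pattern stored in a uint16_t.
static uint16_t fp16_from_fp32(float f) {
  uint32_t bits;
  std::memcpy(&bits, &f, sizeof(bits));  // read the float's bit pattern
  uint16_t sign = static_cast<uint16_t>((bits >> 16) & 0x8000u);
  // Rebias the 8-bit exponent (bias 127) to the 5-bit fp16 bias (15).
  int32_t exp = static_cast<int32_t>((bits >> 23) & 0xFFu) - 127 + 15;
  uint32_t mantissa = bits & 0x7FFFFFu;
  if (exp <= 0) return sign;             // too small: flush to signed zero
  if (exp >= 31) return sign | 0x7C00u;  // too large (or inf/NaN): infinity
  return sign | static_cast<uint16_t>(exp << 10) |
         static_cast<uint16_t>(mantissa >> 13);  // drop low mantissa bits
}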

lite/kernels/xpu/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -110,6 +110,8 @@ add_kernel(lod_reset_compute_xpu XPU extra SRCS lod_reset_compute.cc)
 add_kernel(select_input_compute_xpu XPU extra SRCS select_input_compute.cc)
 add_kernel(group_norm_compute_xpu XPU extra SRCS group_norm_compute.cc)
 add_kernel(deformable_conv_compute_xpu XPU extra SRCS deformable_conv_compute.cc)
+add_kernel(sin_compute_xpu XPU extra SRCS sin_compute.cc)
+add_kernel(cos_compute_xpu XPU extra SRCS cos_compute.cc)
 
 # extra(fused kernel)
 add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc)
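
(sin_compute.cc and its header are among the 14 changed files but are not shown in this excerpt; presumably they mirror the new cos_compute.cc below with the corresponding xdnn sine call.)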

lite/kernels/xpu/activation_compute.cc

Lines changed: 8 additions & 9 deletions
@@ -389,15 +389,6 @@ using siluFP32 =
     paddle::lite::kernels::xpu::SiluCompute<float, PRECISION(kFloat)>;
 using siluFP16 =
     paddle::lite::kernels::xpu::SiluCompute<float16, PRECISION(kFP16)>;
-REGISTER_LITE_KERNEL(silu, kXPU, kFloat, kNCHW, siluFP32, def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
-    .Finalize();
-REGISTER_LITE_KERNEL(silu, kXPU, kFP16, kNCHW, siluFP16, def)
-    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
-    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
-    .Finalize();
-
 using eluFP32 =
     paddle::lite::kernels::xpu::EluCompute<float, PRECISION(kFloat)>;
 using eluFP16 =
@@ -410,6 +401,14 @@ REGISTER_LITE_KERNEL(elu, kXPU, kFP16, kNCHW, eluFP16, DISABLE_XPU1_eluFP16)
     .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
     .Finalize();
+REGISTER_LITE_KERNEL(silu, kXPU, kFloat, kNCHW, siluFP32, silu_fp32)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+REGISTER_LITE_KERNEL(silu, kXPU, kFP16, kNCHW, siluFP16, silu_fp16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
 
 REGISTER_LITE_KERNEL(
     abs, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::AbsCompute, def)
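
In effect, the two silu registrations move below elu and gain distinct alias names (silu_fp32, silu_fp16), and the kFP16 variant now binds "X" and "Out" to kFP16 tensor types explicitly; the removed registration left both bindings at GetTensorTy's default precision (kFloat in Paddle Lite), so the declared kFP16 kernel disagreed with its own tensor bindings.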

lite/kernels/xpu/calib_compute.cc

Lines changed: 83 additions & 0 deletions
@@ -113,6 +113,29 @@ using xpu_calib_fp32_to_fp16 =
 using xpu_calib_fp16_to_fp32 =
     paddle::lite::kernels::xpu::CalibCompute<float16, float, PRECISION(kFloat)>;
 
+using xpu_calib_fp32_to_fp16_kfp16 =
+    paddle::lite::kernels::xpu::CalibCompute<float, float16, PRECISION(kFP16)>;
+using xpu_calib_fp16_to_fp32_kfp16 =
+    paddle::lite::kernels::xpu::CalibCompute<float16, float, PRECISION(kFP16)>;
+
+using xpu_calib_int64_to_fp16 =
+    paddle::lite::kernels::xpu::CalibCompute<int64_t,
+                                             float16,
+                                             PRECISION(kFP16)>;
+using xpu_calib_fp16_to_int64 =
+    paddle::lite::kernels::xpu::CalibCompute<float16,
+                                             int64_t,
+                                             PRECISION(kFP16)>;
+
+using xpu_calib_int32_to_fp16 =
+    paddle::lite::kernels::xpu::CalibCompute<int32_t,
+                                             float16,
+                                             PRECISION(kFP16)>;
+using xpu_calib_fp16_to_int32 =
+    paddle::lite::kernels::xpu::CalibCompute<float16,
+                                             int32_t,
+                                             PRECISION(kFP16)>;
+
 REGISTER_LITE_KERNEL(
     calib, kXPU, kFloat, kNCHW, xpu_calib_int64_to_int32, calib_int64_to_int32)
     .BindInput("Input",
@@ -140,6 +163,45 @@ REGISTER_LITE_KERNEL(
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
     .Finalize();
 
+REGISTER_LITE_KERNEL(
+    calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_fp32_kfp16, calib_fp16_to_fp32)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    calib, kXPU, kFP16, kNCHW, xpu_calib_fp32_to_fp16_kfp16, calib_fp32_to_fp16)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    calib, kXPU, kFP16, kNCHW, xpu_calib_int64_to_fp16, calib_int64_to_fp16)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_int64, calib_fp16_to_int64)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    calib, kXPU, kFP16, kNCHW, xpu_calib_int32_to_fp16, calib_int32_to_fp16)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(
+    calib, kXPU, kFP16, kNCHW, xpu_calib_fp16_to_int32, calib_fp16_to_int32)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
+    .Finalize();
+
 REGISTER_LITE_KERNEL(calib_once,
                      kXPU,
                      kFloat,
@@ -175,6 +237,27 @@ REGISTER_LITE_KERNEL(
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFloat))})
     .Finalize();
 
+REGISTER_LITE_KERNEL(calib_once,
+                     kXPU,
+                     kFP16,
+                     kNCHW,
+                     xpu_calib_int64_to_fp16,
+                     calib_int64_to_fp16)
+    .BindInput("Input",
+               {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(calib_once,
+                     kXPU,
+                     kFP16,
+                     kNCHW,
+                     xpu_calib_fp16_to_int64,
+                     calib_fp16_to_int64)
+    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
+    .Finalize();
+
 using xpu_calib_fp32_to_int8 =
     paddle::lite::kernels::xpu::CalibCompute<float, int8_t, PRECISION(kInt8)>;
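
Note that fp32↔fp16 calib instantiations already existed under PRECISION(kFloat) (the context lines at the top of this file's first hunk); the new *_kfp16 aliases register the same CalibCompute template under the kFP16 kernel precision, presumably so that precision-cast passes running an fp16 graph can pick a calib kernel whose declared precision matches the surrounding fp16 ops.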

lite/kernels/xpu/cos_compute.cc

Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "lite/kernels/xpu/cos_compute.h"
+#include "lite/backends/xpu/xpu_header_sitter.h"
+#include "lite/core/op_registry.h"
+
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace xpu {
+
+template <typename T, PrecisionType PType>
+void CosCompute<T, PType>::Run() {
+  auto& param = this->template Param<param_t>();
+  auto& ctx = this->ctx_->template As<XPUContext>();
+
+  int r = xdnn::cos(ctx.GetRawContext(),
+                    param.X->template data<T>(),
+                    param.Out->template mutable_data<T>(TARGET(kXPU)),
+                    param.X->numel());
+  CHECK_EQ(r, 0);
+}
+
+}  // namespace xpu
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+
+namespace xpu = paddle::lite::kernels::xpu;
+
+using cosFP32 =
+    paddle::lite::kernels::xpu::CosCompute<float, PRECISION(kFloat)>;
+using cosFP16 =
+    paddle::lite::kernels::xpu::CosCompute<float16, PRECISION(kFP16)>;
+REGISTER_LITE_KERNEL(cos, kXPU, kFloat, kNCHW, cosFP32, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
+    .Finalize();
+REGISTER_LITE_KERNEL(cos, kXPU, kFP16, kNCHW, cosFP16, cosFP16)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
+    .Finalize();
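
The companion header lite/kernels/xpu/cos_compute.h is among the 14 changed files but is not shown in this excerpt. Based on the definitions above, it would declare the kernel roughly as follows — a sketch only, with param_t assumed to be operators::ActivationParam as for the other XPU activation kernels:

#pragma once

#include "lite/core/kernel.h"
#include "lite/operators/op_params.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

// Templated on element type (float / float16) and registered precision.
template <typename T, PrecisionType PType>
class CosCompute : public KernelLite<TARGET(kXPU), PType> {
 public:
  using param_t = operators::ActivationParam;  // assumed, as for silu/elu

  void Run() override;

  virtual ~CosCompute() = default;
};

}  // namespace xpu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle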
