Skip to content

Commit db368d5

Browse files
Add INT8 support for fused_multi_transformer_op (#45284) (#46169)
Co-authored-by: RichardWooSJTU <37864677+RichardWooSJTU@users.noreply.github.com>
1 parent e5dc9d6 commit db368d5

22 files changed

Lines changed: 4168 additions & 1428 deletions

paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,8 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) {
165165
auto var_data_type = var_node->Var()->GetDataType();
166166
VLOG(5) << "var_name is " << var_name << ", data type is "
167167
<< var_data_type;
168-
if (var_data_type == paddle::framework::proto::VarType::FP16) {
168+
if (var_data_type == paddle::framework::proto::VarType::FP16 &&
169+
t->dtype() != paddle::experimental::DataType::FLOAT16) {
169170
framework::Tensor half_tensor;
170171
half_tensor.set_type(paddle::experimental::DataType::FLOAT16);
171172
half_tensor.Resize(t->dims());

paddle/fluid/operators/fused/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ register_operators(
2323
fused_transformer_op
2424
fused_feedforward_op
2525
fused_multi_transformer_op
26+
fused_multi_transformer_int8_op
2627
fused_bias_dropout_residual_layer_norm_op
2728
resnet_unit_op
2829
fused_gemm_epilogue_op
@@ -119,6 +120,7 @@ if(WITH_GPU OR WITH_ROCM)
119120
# fused_attention_op
120121
op_library(fused_attention_op)
121122
op_library(fused_multi_transformer_op)
123+
op_library(fused_multi_transformer_int8_op)
122124
op_library(fused_bias_dropout_residual_layer_norm_op)
123125
endif()
124126
# resnet_unit needs cudnn 8.0 above

paddle/fluid/operators/fused/attention_layer_norm.h

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,8 @@ limitations under the License. */
1919
namespace paddle {
2020
namespace operators {
2121

22-
template <typename T>
22+
// NOTE: T must be the same as OutType in ComputeBackward
23+
template <typename T, typename InType = T, typename OutType = T>
2324
class AttnLayerNorm {
2425
public:
2526
AttnLayerNorm(const phi::GPUContext& dev_ctx,
@@ -33,25 +34,42 @@ class AttnLayerNorm {
3334

3435
~AttnLayerNorm() {}
3536

36-
void ComputeForward(const T* x_data,
37+
void ComputeForward(const InType* x_data,
3738
const LayerNormParamType<T>* scale_data,
3839
const LayerNormParamType<T>* bias_data,
39-
T* y_data,
40+
OutType* y_data,
4041
LayerNormParamType<T>* mean_data,
41-
LayerNormParamType<T>* var_data) {
42+
LayerNormParamType<T>* var_data,
43+
const float* dequant_out_scale_data = nullptr,
44+
const int quant_out_scale_offset = 0,
45+
const float quant_in_scale = 1.0,
46+
const int quant_round_type = 1,
47+
const float quant_max_bound = 127.0,
48+
const float quant_min_bound = -127.0) {
4249
auto stream = dev_ctx_.stream();
4350

4451
switch (GetDesiredBlockDim(feature_size_)) {
4552
FIXED_BLOCK_DIM_CASE(
46-
LayerNormForward<T, LayerNormParamType<T>, kBlockDim>
53+
LayerNormForward<T,
54+
LayerNormParamType<T>,
55+
kBlockDim,
56+
false,
57+
InType,
58+
OutType>
4759
<<<batch_size_, kBlockDim, 0, stream>>>(x_data,
4860
scale_data,
4961
bias_data,
5062
y_data,
5163
mean_data,
5264
var_data,
5365
epsilon_,
54-
feature_size_));
66+
feature_size_,
67+
dequant_out_scale_data,
68+
quant_out_scale_offset,
69+
quant_in_scale,
70+
quant_round_type,
71+
quant_max_bound,
72+
quant_min_bound));
5573
default:
5674
PADDLE_THROW(platform::errors::InvalidArgument(
5775
"Feature_size must be larger than 1"));
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
2+
3+
Licensed under the Apache License, Version 2.0 (the "License");
4+
you may not use this file except in compliance with the License.
5+
You may obtain a copy of the License at
6+
7+
http://www.apache.org/licenses/LICENSE-2.0
8+
9+
Unless required by applicable law or agreed to in writing, software
10+
distributed under the License is distributed on an "AS IS" BASIS,
11+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
See the License for the specific language governing permissions and
13+
limitations under the License. */
14+
15+
#pragma once
16+
17+
#include <iostream>
18+
#include <vector>
19+
#include "paddle/fluid/operators/fused/cublaslt.h"
20+
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
21+
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
22+
#include "paddle/fluid/platform/float16.h"
23+
#include "paddle/phi/kernels/funcs/broadcast_function.h"
24+
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
25+
26+
namespace paddle {
27+
namespace operators {
28+
29+
using Tensor = framework::Tensor;
30+
31+
template <typename T>
32+
class AttnMatmulINT8 {
33+
public:
34+
AttnMatmulINT8(
35+
const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias)
36+
: dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) {
37+
auto helper = std::make_shared<CublasLtHelper>(m, k, n);
38+
helpers_.emplace_back(helper);
39+
}
40+
~AttnMatmulINT8() {}
41+
42+
// This function is used to execute GEMM, with input and output's types are
43+
// both T.
44+
void ComputeForward(const framework::Tensor* weight,
45+
const framework::Tensor* input,
46+
framework::Tensor* input_tmp,
47+
const framework::Tensor* bias,
48+
framework::Tensor* output,
49+
framework::Tensor* output_tmp,
50+
framework::Tensor* bias_out,
51+
const float quant_in_scale,
52+
const framework::Tensor* dequant_out_scale,
53+
const int quant_out_scale_offset,
54+
const int quant_round_type = 1,
55+
const float quant_max_bound = 127.0,
56+
const float quant_min_bound = -127.0) {
57+
quantize_kernel_launcher<T>(input->data<T>(),
58+
input_tmp->data<int8_t>(),
59+
quant_in_scale,
60+
m_,
61+
k_,
62+
quant_round_type,
63+
quant_max_bound,
64+
quant_min_bound,
65+
dev_ctx_.stream());
66+
67+
helpers_[0]->GEMM(input_tmp->data<int8_t>(),
68+
weight->data<int8_t>(),
69+
output_tmp->data<int32_t>(),
70+
dev_ctx_.stream());
71+
72+
dequantize_kernel_launcher<T>(output_tmp->data<int32_t>(),
73+
output->data<T>(),
74+
m_,
75+
n_,
76+
dev_ctx_.stream(),
77+
quant_in_scale,
78+
dequant_out_scale->data<float>(),
79+
quant_out_scale_offset);
80+
81+
if (compute_bias_) {
82+
// bias_out = output + bias
83+
std::vector<const framework::Tensor*> ins = {output, bias};
84+
std::vector<framework::Tensor*> outs = {bias_out};
85+
phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
86+
dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor<T>());
87+
PADDLE_ENFORCE_EQ(cudaGetLastError(),
88+
cudaSuccess,
89+
platform::errors::Fatal(
90+
"cuda error occured after computing bias. "
91+
"But it does not mean this error is caused by "
92+
"bias computing"));
93+
}
94+
}
95+
96+
// This function is used to execute GEMM, with input and output's types are
97+
// both INT8.
98+
void ComputeForwardINT8ToINT8(const framework::Tensor* weight,
99+
framework::Tensor* input,
100+
const framework::Tensor* bias,
101+
framework::Tensor* output,
102+
framework::Tensor* bias_out) {
103+
helpers_[0]->GEMM(input->data<int8_t>(),
104+
weight->data<int8_t>(),
105+
output->data<int32_t>(),
106+
dev_ctx_.stream());
107+
}
108+
109+
// This function is used to execute GEMM, with input and output's types are
110+
// INT8 and T.
111+
void ComputeForwardINT8ToT(const framework::Tensor* weight,
112+
const float quant_in_scale,
113+
framework::Tensor* input,
114+
const framework::Tensor* bias,
115+
framework::Tensor* output,
116+
framework::Tensor* output_tmp,
117+
framework::Tensor* bias_out,
118+
const framework::Tensor* dequant_out_scale,
119+
const int quant_out_scale_offset) {
120+
helpers_[0]->GEMM(input->data<int8_t>(),
121+
weight->data<int8_t>(),
122+
output_tmp->data<int32_t>(),
123+
dev_ctx_.stream());
124+
125+
dequantize_kernel_launcher<T>(output_tmp->data<int32_t>(),
126+
output->data<T>(),
127+
m_,
128+
n_,
129+
dev_ctx_.stream(),
130+
quant_in_scale,
131+
dequant_out_scale->data<float>(),
132+
quant_out_scale_offset);
133+
134+
if (compute_bias_) {
135+
// bias_out = output + bias
136+
std::vector<const framework::Tensor*> ins = {output, bias};
137+
std::vector<framework::Tensor*> outs = {bias_out};
138+
phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
139+
dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor<T>());
140+
PADDLE_ENFORCE_EQ(cudaGetLastError(),
141+
cudaSuccess,
142+
platform::errors::Fatal(
143+
"cuda error occured after computing bias. "
144+
"But it does not mean this error is caused by "
145+
"bias computing"));
146+
}
147+
}
148+
149+
// This function is used to execute GEMM, with input and output's types are T
150+
// and INT8.
151+
void ComputeForwardTToINT8(const framework::Tensor* weight,
152+
const float quant_in_scale,
153+
const framework::Tensor* input,
154+
framework::Tensor* input_tmp,
155+
const framework::Tensor* bias,
156+
framework::Tensor* output,
157+
framework::Tensor* bias_out,
158+
const int quant_round_type = 1,
159+
const float quant_max_bound = 127.0,
160+
const float quant_min_bound = -127.0) {
161+
quantize_kernel_launcher<T>(input->data<T>(),
162+
input_tmp->data<int8_t>(),
163+
quant_in_scale,
164+
m_,
165+
k_,
166+
quant_round_type,
167+
quant_max_bound,
168+
quant_min_bound,
169+
dev_ctx_.stream());
170+
171+
helpers_[0]->GEMM(input_tmp->data<int8_t>(),
172+
weight->data<int8_t>(),
173+
output->data<int32_t>(),
174+
dev_ctx_.stream());
175+
}
176+
177+
private:
178+
const phi::GPUContext& dev_ctx_;
179+
180+
int m_; // m
181+
int n_; // n
182+
int k_; // k
183+
184+
int compute_bias_;
185+
std::vector<std::shared_ptr<CublasLtHelper>> helpers_;
186+
};
187+
188+
} // namespace operators
189+
} // namespace paddle

0 commit comments

Comments
 (0)