|
| 1 | +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +// |
| 3 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +// you may not use this file except in compliance with the License. |
| 5 | +// You may obtain a copy of the License at |
| 6 | +// |
| 7 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +// |
| 9 | +// Unless required by applicable law or agreed to in writing, software |
| 10 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +// See the License for the specific language governing permissions and |
| 13 | +// limitations under the License. |
| 14 | + |
| 15 | +#include "lite/kernels/xpu/__xpu__multihead_cross_attn_compute.h" |
| 16 | +#include <vector> |
| 17 | +#include "lite/backends/xpu/xpu_header_sitter.h" |
| 18 | +#include "lite/core/op_registry.h" |
| 19 | + |
| 20 | +namespace paddle { |
| 21 | +namespace lite { |
| 22 | +namespace kernels { |
| 23 | +namespace xpu { |
| 24 | + |
| 25 | +template <typename T> |
| 26 | +static std::vector<const T*> prepare_weight( |
| 27 | + const std::vector<lite::Tensor*>& fc_weight) { |
| 28 | + std::vector<const T*> result; |
| 29 | + for (auto* weight : fc_weight) { |
| 30 | + result.push_back(reinterpret_cast<const T*>(weight->data<float>())); |
| 31 | + } |
| 32 | + return result; |
| 33 | +} |
| 34 | + |
| 35 | +template <typename InType, PrecisionType PType> |
| 36 | +void XPUMhcaCompute<InType, PType>::PrepareWeightMax( |
| 37 | + const std::vector<lite::Tensor*>& weight_max, |
| 38 | + int max_ptr_len, |
| 39 | + std::vector<const float*>* max_xpu_ptrs) { |
| 40 | + int max_value_num = 0; |
| 41 | + for (auto max_tensor : weight_max) { |
| 42 | + max_value_num += max_tensor->numel(); |
| 43 | + } |
| 44 | + VLOG(3) << "Total weight max value number: " << max_value_num; |
| 45 | + weight_max_guard_ = |
| 46 | + TargetWrapperXPU::MallocScratchPad(max_value_num * sizeof(float)); |
| 47 | + float* weight_max_ptr = reinterpret_cast<float*>(weight_max_guard_->addr_); |
| 48 | + |
| 49 | + int offset = 0; |
| 50 | + for (auto max_tensor : weight_max) { |
| 51 | + float* cur_weight_max_ptr = weight_max_ptr + offset; |
| 52 | + auto len = max_tensor->numel(); |
| 53 | + VLOG(6) << "weight max value: " << max_tensor->data<float>()[0] << " " |
| 54 | + << max_tensor->data<float>()[len - 1]; |
| 55 | + std::vector<float> cpu_max(max_ptr_len, max_tensor->data<float>()[0]); |
| 56 | + lite::TargetWrapperXPU::MemcpySync(cur_weight_max_ptr, |
| 57 | + cpu_max.data(), |
| 58 | + sizeof(float) * max_ptr_len, |
| 59 | + IoDirection::HtoD); |
| 60 | + max_xpu_ptrs->push_back(cur_weight_max_ptr); |
| 61 | + offset += max_ptr_len; |
| 62 | + } |
| 63 | +} |
| 64 | + |
| 65 | +template <typename InType, PrecisionType PType> |
| 66 | +void XPUMhcaCompute<InType, PType>::PrepareForRun() { |
| 67 | + auto& ctx = this->ctx_->template As<XPUContext>(); |
| 68 | + auto& param = this->template Param<param_t>(); |
| 69 | + // prepare bias |
| 70 | + for (auto* fc_bias : param.fc_bias) { |
| 71 | + arg_fc_bias_.push_back(fc_bias->template data<float>()); |
| 72 | + } |
| 73 | + // prepare scale |
| 74 | + for (auto* ln_scale : param.ln_scale) { |
| 75 | + arg_ln_scale_.push_back(ln_scale->template data<float>()); |
| 76 | + } |
| 77 | + // prepare ln_bias |
| 78 | + for (auto* ln_bias : param.ln_bias) { |
| 79 | + arg_ln_bias_.push_back(ln_bias->template data<float>()); |
| 80 | + } |
| 81 | + arg_fc_weight_int16_ = prepare_weight<int16_t>(param.fc_weight); |
| 82 | + const int XPU_QUANT_SCALE_NUM = ctx.GetRawContext()->max_ptr_size(); |
| 83 | + PrepareWeightMax(param.weight_max, XPU_QUANT_SCALE_NUM, &fc_weight_max_); |
| 84 | +} |
| 85 | + |
// Intentionally a no-op for now: this kernel is registered but its compute is
// pending migration to the XFT interface. The commented-out sketch below
// records the intended xdnn::unet_mhca_fusion call for when it is enabled.
template <typename InType, PrecisionType PType>
void XPUMhcaCompute<InType, PType>::Run() {
  // TODO(shenyijun): The compute of this op will be adapted to XFT interface
  // later on.
  //
  // auto& param = this->template Param<param_t>();
  // auto& ctx = this->ctx_->template As<XPUContext>();
  // const InType* in = param.input->template data<InType>();
  // const InType* embedding = param.embedding->template data<InType>();
  // InType* out = param.output->template mutable_data<InType>(TARGET(kXPU));
  // int batch = static_cast<int>(param.input->dims()[0]);
  // int seqlen = static_cast<int>(param.input->dims()[1]);
  // int embedding_seq = static_cast<int>(param.embedding->dims()[1]);
  // int r = xdnn::unet_mhca_fusion<InType, int16_t, InType, int16_t>(
  //     ctx.GetRawContext(),
  //     in,
  //     embedding,
  //     *(XPUMhcaCompute::GetWeight<int16_t>()),
  //     out,
  //     arg_fc_bias_,
  //     arg_ln_scale_,
  //     arg_ln_bias_,
  //     fc_weight_max_,
  //     batch,
  //     param.head_num,
  //     param.size_per_head,
  //     seqlen,
  //     param.hidden_dim,
  //     embedding_seq,
  //     param.embedding_dim);
  // CHECK_EQ(r, 0);
}
| 118 | + |
| 119 | +} // namespace xpu |
| 120 | +} // namespace kernels |
| 121 | +} // namespace lite |
| 122 | +} // namespace paddle |
| 123 | + |
namespace xpu = paddle::lite::kernels::xpu;

// Two kernel variants of the same op: an FP32 build and an FP16 build.
// NOTE(review): the FP16 alias instantiates with `float16` as InType while the
// FP32 alias uses `float`; both currently share the empty Run() above.
using XPUMhca_FP32 = xpu::XPUMhcaCompute<float, PRECISION(kFloat)>;
using XPUMhca_FP16 = xpu::XPUMhcaCompute<float16, PRECISION(kFP16)>;

// FP32 registration: all tensors default to kFloat precision on XPU.
REGISTER_LITE_KERNEL(
    __xpu__multihead_cross_attn, kXPU, kFloat, kNCHW, XPUMhca_FP32, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("Embedding", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("FCWeight", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Output", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();
// FP16 registration: activations (Input/Embedding/Output) are kFP16, while
// weights, biases and LayerNorm params stay at the default (float) precision.
REGISTER_LITE_KERNEL(
    __xpu__multihead_cross_attn, kXPU, kFP16, kNCHW, XPUMhca_FP16, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .BindInput("Embedding",
               {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .BindInput("FCWeight", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("FCBias", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("LNScale", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("LNBias", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindOutput("Output",
                {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .Finalize();