Skip to content

Commit 91b0fd3

Browse files
authored
[XPU] Add select_input_compute and fix bug in box_coder. (#9711)
1 parent eff8dc1 commit 91b0fd3

7 files changed

Lines changed: 119 additions & 4 deletions

File tree

lite/kernels/xpu/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ add_kernel(shape_compute_xpu XPU extra SRCS shape_compute.cc)
104104
add_kernel(lod_array_length_compute_xpu XPU extra SRCS lod_array_length_compute.cc)
105105
add_kernel(multiclass_nms_compute_xpu XPU extra SRCS multiclass_nms_compute.cc)
106106
add_kernel(lod_reset_compute_xpu XPU extra SRCS lod_reset_compute.cc)
107+
add_kernel(select_input_compute_xpu XPU extra SRCS select_input_compute.cc)
107108

108109
# extra(fused kernel)
109110
add_kernel(__xpu__resnet50_compute_xpu XPU extra SRCS __xpu__resnet50_compute.cc)

lite/kernels/xpu/__xpu__multi_encoder_compute.cc

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -306,8 +306,11 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) {
306306
std::vector<int64_t> mask_shape = param.mask->dims().Vectorize();
307307
std::vector<int> encoder_mask_shape =
308308
std::vector<int>(mask_shape.begin(), mask_shape.end());
309-
CHECK_EQ(param.ffn_hidden_dim_scale, 4)
310-
<< "xpu don't support ffn_hidden_dim_scale!=4 when no vsl";
309+
// xpu1 don't support ffn_hidden_dim_scale!=4 when no vsl
310+
if (ctx.GetRawContext()->dev().type() == xdnn::kXPU1) {
311+
CHECK_EQ(param.ffn_hidden_dim_scale, 4)
312+
<< "xpu don't support ffn_hidden_dim_scale!=4 when no vsl";
313+
}
311314
xdnn::QKVAttnParam qkv_attn_param(batch,
312315
max_seqlen,
313316
param.head_num,
@@ -326,6 +329,7 @@ void XPUMultiEncoderCompute::run_encoder(const T* in, T* out) {
326329
qkv_attn_param.relative_pos.assign(roformer_embedding_.begin(),
327330
roformer_embedding_.end());
328331
}
332+
qkv_attn_param.scale_of_hidden_units = param.ffn_hidden_dim_scale;
329333
int r = xdnn::transformer_encoder<T, TW, TGEMM>(
330334
ctx.GetRawContext(),
331335
in,

lite/kernels/xpu/box_coder_compute.cc

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,8 @@ void BoxCoderCompute::Run() {
6666
output_box->Resize({row, col, len});
6767
auto* output = output_box->mutable_data<float>(TARGET(kXPU));
6868
float* variance_xpu_ptr =
69-
reinterpret_cast<float*>(variance_xpu_guard_->addr_);
69+
variance_xpu_guard_ ? reinterpret_cast<float*>(variance_xpu_guard_->addr_)
70+
: nullptr;
7071

7172
if (code_type == "encode_center_size") {
7273
int r = xdnn::box_coder_encoder<float>(ctx.GetRawContext(),

lite/kernels/xpu/elementwise_compute.cc

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,11 @@ using SubFloat32 =
183183
using SubFloat16 = xpu::ElementwiseCompute<float16,
184184
xpu::SubFunctor<float16>,
185185
PRECISION(kFP16)>;
186-
186+
using SubInt32 =
187+
xpu::ElementwiseCompute<int, xpu::SubFunctor<int>, PRECISION(kFloat)>;
188+
using SubInt64 = xpu::ElementwiseCompute<int64_t,
189+
xpu::SubFunctor<int64_t>,
190+
PRECISION(kFloat)>;
187191
using MulFloat32 =
188192
xpu::ElementwiseCompute<float, xpu::MulFunctor<float>, PRECISION(kFloat)>;
189193
using MulFloat16 = xpu::ElementwiseCompute<float16,
@@ -273,6 +277,18 @@ REGISTER_LITE_KERNEL(
273277
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
274278
.Finalize();
275279

280+
REGISTER_LITE_KERNEL(elementwise_sub, kXPU, kFloat, kNCHW, SubInt32, int32)
281+
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
282+
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
283+
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt32))})
284+
.Finalize();
285+
286+
REGISTER_LITE_KERNEL(elementwise_sub, kXPU, kFloat, kNCHW, SubInt64, int64)
287+
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
288+
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
289+
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kInt64))})
290+
.Finalize();
291+
276292
REGISTER_LITE_KERNEL(elementwise_mul, kXPU, kFloat, kNCHW, MulFloat32, def)
277293
.BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
278294
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kXPU))})
lite/kernels/xpu/select_input_compute.cc

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/select_input_compute.h"
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

// Copies the tensor X[*Mask] into Out as a raw byte blob on the XPU device.
// Mask lives on the host (see the kernel registration below), so its value
// can be dereferenced directly; X and Out live on the XPU.
void SelectInputCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& ctx = this->ctx_->template As<XPUContext>();
  // Borrow the input tensor list instead of copying it.
  const auto& x = param.X;

  auto output = param.Out;
  // Validate the selector before indexing: an out-of-range or negative
  // index would be undefined behavior on the vector access.
  const int mask_idx = *param.Mask->data<int>();
  CHECK_GE(mask_idx, 0) << "select_input Mask must be non-negative, got "
                        << mask_idx;
  CHECK_LT(mask_idx, static_cast<int>(x.size()))
      << "select_input Mask " << mask_idx << " out of range, input size "
      << x.size();
  auto x_i = x[mask_idx];
  // Size the output buffer to match the selected input, then copy bytes
  // device-to-device; the element type is irrelevant, hence int8_t.
  output->mutable_data(TARGET(kXPU), x_i->memory_size());
  int r = xdnn::copy<int8_t>(ctx.GetRawContext(),
                             x_i->data<int8_t>(),
                             reinterpret_cast<int8_t*>(output->raw_data()),
                             x_i->memory_size());
  CHECK_EQ(r, 0);
}

}  // namespace xpu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

// Precision is kAny: the kernel is a byte copy and works for any dtype.
// Mask is bound to the host target so *Mask->data<int>() is a plain load.
REGISTER_LITE_KERNEL(select_input,
                     kXPU,
                     kAny,
                     kNCHW,
                     paddle::lite::kernels::xpu::SelectInputCompute,
                     def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))})
    .BindInput("Mask",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kAny))})
    .Finalize();
lite/kernels/xpu/select_input_compute.h

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <algorithm>
#include <string>
#include "lite/backends/xpu/target_wrapper.h"
#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

// XPU kernel for the select_input op: forwards the input tensor chosen by
// the Mask index into the output. Registered with PRECISION(kAny) because
// the implementation is dtype-agnostic (raw byte copy in Run()).
class SelectInputCompute : public KernelLite<TARGET(kXPU), PRECISION(kAny)> {
 public:
  // Parameter struct shared with the select_input operator definition.
  using param_t = operators::SelectInputParam;

  // Performs the selection + device copy; defined in select_input_compute.cc.
  void Run() override;

  virtual ~SelectInputCompute() = default;
};

}  // namespace xpu
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

lite/operators/select_input_op.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ bool SelectInputOpLite::InferShapeImpl() const {
3737
const auto &output_dims = inputs[Mask]->dims();
3838
// Set output dims
3939
param_.Out->Resize(output_dims);
40+
param_.Out->set_lod(inputs[Mask]->lod());
4041
return true;
4142
}
4243

0 commit comments

Comments
 (0)