Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 10 additions & 5 deletions lite/core/optimizer/mir/__xpu__static_kernel_pick_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,12 @@ void XPUStaticKernelPickPass::InplaceOpScore(lite::mir::Node* node,
CHECK(instruct.op_info()->GetInputArgname(var_name, &tmp));
VLOG(6) << "current kernel input data variable name:" << var_name
<< "Parameter name:" << tmp;
if (in_node->inlinks.empty()) {
if (in_node->inlinks.empty() && xpu_output_type_.count(var_name) == 0) {
continue;
}

// only to match input X
if (tmp != "X") {
continue;
}

Expand Down Expand Up @@ -549,7 +554,7 @@ void XPUStaticKernelPickPass::InplaceOpScore(lite::mir::Node* node,
const auto& var_name = var.name;
std::string tmp;
CHECK(instruct.op_info()->GetOutputArgname(var_name, &tmp));
if (out_node->outlinks.empty()) {
if (out_node->outlinks.empty() && xpu_input_type_.count(var_name) == 0) {
continue;
}

Expand Down Expand Up @@ -584,7 +589,7 @@ void XPUStaticKernelPickPass::SpecialOpScore(lite::mir::Node* node,
const auto& var_name = var.name;
std::string tmp;
CHECK(instruct.op_info()->GetInputArgname(var_name, &tmp));
if (in_node->inlinks.empty()) {
if (in_node->inlinks.empty() && xpu_output_type_.count(var_name) == 0) {
if (kernel.GetInputDeclType(tmp)->precision() == PrecisionType::kFP16) {
*score = 0;
VLOG(6) << "not pick fp16 kernel ,because input weight "
Expand All @@ -601,7 +606,7 @@ void XPUStaticKernelPickPass::SpecialOpScore(lite::mir::Node* node,
const auto& var_name = var.name;
std::string tmp;
CHECK(instruct.op_info()->GetInputArgname(var_name, &tmp));
if (in_node->inlinks.empty()) {
if (in_node->inlinks.empty() && xpu_output_type_.count(var_name) == 0) {
continue;
}

Expand Down Expand Up @@ -644,7 +649,7 @@ void XPUStaticKernelPickPass::SpecialOpScore(lite::mir::Node* node,
std::string tmp;
CHECK(instruct.op_info()->GetOutputArgname(var_name, &tmp));
int output_match_num = xpu_input_type_.count(var_name);
if (out_node->outlinks.empty()) {
if (out_node->outlinks.empty() && output_match_num == 0) {
continue;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -958,9 +958,6 @@ class XPUMultiEncoderFuser {

// q/k/v fusion
bool enable_qkv_fusion = true;
if (norm_before_0 && !adaptive_seqlen_) {
enable_qkv_fusion = false;
}
op_desc.SetAttr<bool>("enable_qkv_fusion", enable_qkv_fusion);

auto* scope = multi_encoder_stmt->op()->scope();
Expand Down
5 changes: 5 additions & 0 deletions lite/core/optimizer/mir/variable_place_inference_pass.h
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ class VariablePlaceInferencePass : public DebugPass {
type.precision() == PRECISION(kUnk)) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else if (with_targets.at("kXPU")) {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW));
} else {
weight_node->AsArg().type = LiteType::GetTensorTy(
TARGET(kHost), type.precision(), DATALAYOUT(kNCHW));
Expand Down Expand Up @@ -129,9 +132,11 @@ class VariablePlaceInferencePass : public DebugPass {
{"kCUDA", valid_places_has_target(TARGET(kCUDA))},
{"kFPGA", valid_places_has_target(TARGET(kFPGA))},
{"kMetal", valid_places_has_target(TARGET(kMetal))},
{"kXPU", valid_places_has_target(TARGET(kXPU))},
};
VLOG(4) << "with_targets['kOpenCL']:" << with_targets["kOpenCL"];
VLOG(4) << "with_targets['kFPGA']:" << with_targets["kFPGA"];
VLOG(4) << "with_targets['kXPU']:" << with_targets["kXPU"];

VLOG(3) << "param-type-registry:\n" << ParamTypeRegistry::Global();
for (auto& node : graph->StmtTopologicalOrder()) {
Expand Down
1 change: 1 addition & 0 deletions lite/kernels/xpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ add_kernel(grid_sampler_compute_xpu XPU basic SRCS grid_sampler_compute.cc)
add_kernel(fill_zeros_like_compute_xpu XPU basic SRCS fill_zeros_like_compute.cc)
add_kernel(reduce_compute_xpu XPU basic SRCS reduce_compute.cc)
add_kernel(expand_v2_compute_xpu XPU basic SRCS expand_v2_compute.cc)
add_kernel(expand_compute_xpu XPU basic SRCS expand_compute.cc)
add_kernel(range_compute_xpu XPU extra SRCS range_compute.cc)
add_kernel(where_compute_xpu XPU extra SRCS where_compute.cc)
add_kernel(gather_nd_compute_xpu XPU extra SRCS gather_nd_compute.cc)
Expand Down
55 changes: 28 additions & 27 deletions lite/kernels/xpu/assign_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@ namespace lite {
namespace kernels {
namespace xpu {

template <class T>
void AssignCompute<T>::Run() {
auto& param = Param<param_t>();
template <class T, PrecisionType PType>
void AssignCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
CHECK(param.X) << "only support input is tensor";
if (param.X == param.Out || param.X->numel() == 0) {
param.Out->set_target(TARGET(kXPU));
Expand All @@ -42,12 +42,9 @@ void AssignCompute<T>::Run() {
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(assign,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::AssignCompute<float>,
def)
using assign_float =
paddle::lite::kernels::xpu::AssignCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(assign, kXPU, kFloat, kNCHW, assign_float, def)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kFloat),
Expand All @@ -58,12 +55,22 @@ REGISTER_LITE_KERNEL(assign,
DATALAYOUT(kAny))})
.Finalize();

REGISTER_LITE_KERNEL(assign,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::AssignCompute<int>,
int32)
// FP16 variant of the XPU assign kernel, registered under the kFP16
// precision slot: X and Out are bound as float16 XPU tensors with any
// layout.
using assign_fp16 =
    paddle::lite::kernels::xpu::AssignCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(assign, kXPU, kFP16, kNCHW, assign_fp16, fp16)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kXPU),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kAny))})
    .BindOutput("Out",
               {LiteType::GetTensorTy(TARGET(kXPU),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kAny))})
    .Finalize();

using assign_int =
paddle::lite::kernels::xpu::AssignCompute<int, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(assign, kXPU, kFloat, kNCHW, assign_int, int32)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kInt32),
Expand All @@ -74,12 +81,9 @@ REGISTER_LITE_KERNEL(assign,
DATALAYOUT(kAny))})
.Finalize();

REGISTER_LITE_KERNEL(assign,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::AssignCompute<int64_t>,
int64)
using assign_int64 =
paddle::lite::kernels::xpu::AssignCompute<int64_t, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(assign, kXPU, kFloat, kNCHW, assign_int64, int64)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kInt64),
Expand All @@ -90,12 +94,9 @@ REGISTER_LITE_KERNEL(assign,
DATALAYOUT(kAny))})
.Finalize();

REGISTER_LITE_KERNEL(assign,
kXPU,
kFloat,
kNCHW,
paddle::lite::kernels::xpu::AssignCompute<int8_t>,
bool)
using assign_int8 =
paddle::lite::kernels::xpu::AssignCompute<int8_t, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(assign, kXPU, kFloat, kNCHW, assign_int8, bool)
.BindInput("X",
{LiteType::GetTensorTy(TARGET(kXPU),
PRECISION(kBool),
Expand Down
4 changes: 2 additions & 2 deletions lite/kernels/xpu/assign_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ namespace lite {
namespace kernels {
namespace xpu {

template <class T>
class AssignCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <class T, PrecisionType PType>
class AssignCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::AssignParam;

Expand Down
90 changes: 90 additions & 0 deletions lite/kernels/xpu/expand_compute.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "lite/kernels/xpu/expand_compute.h"
#include <vector>
#include "lite/backends/xpu/xpu_header_sitter.h"
#include "lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

template <typename T, PrecisionType PType>
void ExpandCompute<T, PType>::Run() {
  // Broadcasts input X into the output tensor, whose dims were already
  // set by the op's InferShape (x_dims scaled by expand_times).
  auto& param = this->template Param<operators::ExpandParam>();
  auto& ctx = this->ctx_->template As<XPUContext>();
  const auto* x = param.X;
  auto* out = param.Out;
  std::vector<int64_t> x_shape = x->dims().Vectorize();
  std::vector<int64_t> out_shape = out->dims().Vectorize();
  // xdnn takes int shapes; tensor extents fit comfortably in int here.
  std::vector<int> x_dims(x_shape.begin(), x_shape.end());
  std::vector<int> out_dims(out_shape.begin(), out_shape.end());
  // Guard the unsigned subtraction below: if the input rank ever exceeded
  // the output rank, size_t underflow would turn the insert count into a
  // huge number. Fail loudly instead.
  CHECK_LE(x_dims.size(), out_dims.size())
      << "expand: input rank must not exceed output rank";
  // Left-pad the input shape with 1s so both shapes have the same rank,
  // as xdnn::broadcast requires.
  x_dims.insert(x_dims.begin(), out_dims.size() - x_dims.size(), 1);

  int r = xdnn::broadcast<T>(ctx.GetRawContext(),
                             x->template data<T>(),
                             out->template mutable_data<T>(TARGET(kXPU)),
                             x_dims,
                             out_dims);
  CHECK_EQ(r, 0) << "xdnn::broadcast failed";
}

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle

// Float32 expand kernel: X and Out are XPU tensors; the optional
// "ExpandTimes"/"expand_times_tensor" inputs are bound as host-side
// int32 tensors (shape metadata). Layout is kAny since expand does not
// depend on data layout.
using expand_xpu_float =
    paddle::lite::kernels::xpu::ExpandCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(expand, kXPU, kFloat, kAny, expand_xpu_float, def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kXPU),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kAny))})
    .BindInput("ExpandTimes",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kAny))})
    .BindInput("expand_times_tensor",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kAny))})
    .BindOutput("Out",
               {LiteType::GetTensorTy(TARGET(kXPU),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kAny))})
    .Finalize();

// FP16 variant of the expand kernel, registered under the kFP16
// precision slot. Identical binding scheme to the float32 variant
// except X/Out are declared as float16 XPU tensors.
using expand_xpu_fp16 =
    paddle::lite::kernels::xpu::ExpandCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(expand, kXPU, kFP16, kAny, expand_xpu_fp16, fp16)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kXPU),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kAny))})
    .BindInput("ExpandTimes",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kAny))})
    .BindInput("expand_times_tensor",
               {LiteType::GetTensorTy(TARGET(kHost),
                                      PRECISION(kInt32),
                                      DATALAYOUT(kAny))})
    .BindOutput("Out",
               {LiteType::GetTensorTy(TARGET(kXPU),
                                      PRECISION(kFP16),
                                      DATALAYOUT(kAny))})
    .Finalize();
35 changes: 35 additions & 0 deletions lite/kernels/xpu/expand_compute.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "lite/core/kernel.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace xpu {

// XPU kernel for the `expand` op: broadcasts the input tensor up to the
// output shape computed by the op's InferShape (see expand_compute.cc).
//  - T:     element type of the tensors (e.g. float, float16).
//  - PType: precision tag this instantiation is registered under.
// Declared with DATALAYOUT(kAny) because the computation is
// layout-agnostic.
template <typename T, PrecisionType PType>
class ExpandCompute : public KernelLite<TARGET(kXPU), PType, DATALAYOUT(kAny)> {
 public:
  // Runs the broadcast on the XPU device.
  virtual void Run();

  virtual ~ExpandCompute() = default;
};

} // namespace xpu
} // namespace kernels
} // namespace lite
} // namespace paddle
35 changes: 24 additions & 11 deletions lite/kernels/xpu/split_compute.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,8 @@ namespace lite {
namespace kernels {
namespace xpu {

void SplitCompute::Run() {
template <typename T, PrecisionType PType>
void SplitCompute<T, PType>::Run() {
auto& param = this->template Param<param_t>();
auto& ctx = this->ctx_->template As<XPUContext>();
auto& dout = param.output;
Expand All @@ -33,19 +34,19 @@ void SplitCompute::Run() {
height = height * in_dim[i];
}
int width = param.x->numel() / height;
std::vector<float*> out_ptrs;
std::vector<T*> out_ptrs;
std::vector<int> width_out;
for (auto out : dout) {
out->set_lod(param.x->lod());
out_ptrs.push_back(out->mutable_data<float>(TARGET(kXPU)));
out_ptrs.push_back(out->template mutable_data<T>(TARGET(kXPU)));
width_out.push_back(out->numel() / height);
}
int r = xdnn::split<float>(ctx.GetRawContext(),
param.x->data<float>(),
out_ptrs,
{height, width},
width_out,
1);
int r = xdnn::split<T>(ctx.GetRawContext(),
param.x->template data<T>(),
out_ptrs,
{height, width},
width_out,
1);

CHECK_EQ(r, 0);
}
Expand All @@ -55,12 +56,24 @@ void SplitCompute::Run() {
} // namespace lite
} // namespace paddle

REGISTER_LITE_KERNEL(
split, kXPU, kFloat, kNCHW, paddle::lite::kernels::xpu::SplitCompute, def)
// Float32 split kernel: X/Out are XPU tensors; "AxisTensor" and
// "SectionsTensorList" carry host-side int32 shape metadata.
using split_float =
    paddle::lite::kernels::xpu::SplitCompute<float, PRECISION(kFloat)>;
REGISTER_LITE_KERNEL(split, kXPU, kFloat, kNCHW, split_float, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    // Fix: SectionsTensorList is an *input* of the split op (it supplies
    // the per-section sizes), so it must be bound with BindInput, not
    // BindOutput.
    .BindInput("SectionsTensorList",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU))})
    .Finalize();

// FP16 split kernel, registered under the kFP16 precision slot: X/Out
// are float16 XPU tensors; shape metadata stays host-side int32.
using split_fp16 =
    paddle::lite::kernels::xpu::SplitCompute<float16, PRECISION(kFP16)>;
REGISTER_LITE_KERNEL(split, kXPU, kFP16, kNCHW, split_fp16, fp16)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .BindInput("AxisTensor",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    // Fix: SectionsTensorList is an *input* of the split op (it supplies
    // the per-section sizes), so it must be bound with BindInput, not
    // BindOutput.
    .BindInput("SectionsTensorList",
               {LiteType::GetTensorTy(TARGET(kHost), PRECISION(kInt32))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kXPU), PRECISION(kFP16))})
    .Finalize();
3 changes: 2 additions & 1 deletion lite/kernels/xpu/split_compute.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ namespace lite {
namespace kernels {
namespace xpu {

class SplitCompute : public KernelLite<TARGET(kXPU), PRECISION(kFloat)> {
template <typename T, PrecisionType PType>
class SplitCompute : public KernelLite<TARGET(kXPU), PType> {
public:
using param_t = operators::SplitParam;

Expand Down