Merged
Changes from all commits (27 commits)
dc5642c  Merge fused_mt branch (penPenf28, May 7, 2024)
10e2022  Adjusted fuse_mt_int8 (penPenf28, May 8, 2024)
6bbda60  Revert attention_layer_norm.h (penPenf28, May 8, 2024)
f4b8f6d  Revert paddle/phi/kernels/fusion/gpu/fmha_ref.h (penPenf28, May 8, 2024)
1d5eed5  Add win support and refine format. (penPenf28, May 9, 2024)
7548bfc  Reformat for win. (penPenf28, May 9, 2024)
f280bc3  Removed redundant files, now only supports flash_attn_v2 and variable… (penPenf28, May 11, 2024)
0174777  Refine static_fused_ft test (penPenf28, May 13, 2024)
c610938  Refine fused_mt related testcase (penPenf28, May 14, 2024)
c49c8eb  Remove custom_adll_reduce (penPenf28, May 14, 2024)
c01e19d  Remove operator cublaslt and revert parallel test (penPenf28, May 14, 2024)
b325bb6  Refine empty seq_len (penPenf28, May 14, 2024)
fb533c1  Refine ft (penPenf28, May 14, 2024)
2733371  Refine ft_static test (penPenf28, May 15, 2024)
25057b9  Remove float32 support and static parallel ft test (penPenf28, May 15, 2024)
6f0130f  Refine type static error. (penPenf28, May 16, 2024)
5f07582  Fix doc type error (penPenf28, May 16, 2024)
bc36efe  Fuse_mt code format (penPenf28, May 16, 2024)
235c856  Remove some redundant code (penPenf28, May 17, 2024)
e4cb5c3  Remove redundant attention_layer_norm.h (penPenf28, May 17, 2024)
b4cb3aa  Remove redundant code in ft_op (penPenf28, May 17, 2024)
0480c13  Remove Redundant code and skip fuse_mt doctest (penPenf28, May 18, 2024)
9c64e8e  Remove redundant fmha_ref mmha_util and other code (penPenf28, May 19, 2024)
0445842  Remove redundant kernel (penPenf28, May 20, 2024)
bedda56  Remove redundant file (penPenf28, May 20, 2024)
137ffbc  Refine fuse_mt code (penPenf28, May 20, 2024)
6e41213  Refine cublaslt comment (penPenf28, May 21, 2024)
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.h
@@ -29,6 +29,8 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/api/resource_manager.h"
 #include "paddle/fluid/platform/device/gpu/gpu_types.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/phi/common/bfloat16.h"
 #include "paddle/utils/string/printf.h"

 #if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE)
@@ -45,6 +47,8 @@
 #include "paddle/pir/include/core/program.h"

 namespace paddle_infer {
+using float16 = paddle::platform::float16;
+using bfloat16 = phi::dtype::bfloat16;
 namespace experimental {
 class InternalUtils;
 };
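The two using declarations above make the half-precision element types nameable as paddle_infer::float16 and paddle_infer::bfloat16, so client code does not need to include phi or platform headers itself. Below is a minimal caller-side sketch of what this enables, assuming a predictor built with this change; the output name "logits" is a placeholder and real code should query GetOutputNames().

#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

void FetchBf16Output(paddle_infer::Predictor* predictor) {
  // "logits" is a hypothetical output name; look it up via GetOutputNames().
  auto out = predictor->GetOutputHandle("logits");
  int numel = 1;
  for (int d : out->shape()) numel *= d;

  // The alias lets callers spell the element type without extra includes.
  std::vector<paddle_infer::bfloat16> host(numel);
  out->CopyToCpu(host.data());  // resolves to the bfloat16 instantiation
}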
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api.cc
@@ -28,6 +28,8 @@ int PaddleDtypeSize(PaddleDType dtype) {
   switch (dtype) {
     case PaddleDType::FLOAT32:
       return sizeof(float);
+    case PaddleDType::BFLOAT16:
+      return sizeof(uint16_t);
     case PaddleDType::INT64:
       return sizeof(int64_t);
     case PaddleDType::INT32:
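PaddleDtypeSize is the element-size helper used when sizing feed and fetch buffers, and bfloat16 is a 16-bit format, hence sizeof(uint16_t). A hedged sketch of the kind of call site it serves, assuming PaddleDtypeSize is exported through paddle_api.h as it is used here; the helper NumBytes below is illustrative, not part of the API.

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

// Illustrative only: total byte size of a tensor with the given shape and dtype.
size_t NumBytes(const std::vector<int>& shape, paddle::PaddleDType dtype) {
  size_t numel = std::accumulate(shape.begin(), shape.end(),
                                 static_cast<size_t>(1), std::multiplies<size_t>());
  return numel * paddle::PaddleDtypeSize(dtype);  // 2 bytes per element for BFLOAT16
}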
2 changes: 2 additions & 0 deletions paddle/fluid/inference/api/api_impl.cc
@@ -221,6 +221,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
      input_ptr = input.mutable_data<float>(ddim, place_);
    } else if (inputs[i].dtype == PaddleDType::INT32) {
      input_ptr = input.mutable_data<int32_t>(ddim, place_);
+    } else if (inputs[i].dtype == PaddleDType::BFLOAT16) {
+      input_ptr = input.mutable_data<bfloat16>(ddim, place_);
    } else {
      LOG(ERROR) << "unsupported feed type " << inputs[i].dtype;
      return false;
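With this extra branch, NativePaddlePredictor::SetFeed accepts BFLOAT16 inputs. A rough sketch of building such a feed tensor on the caller side; the input name, shape, and the assumption that bf16 values arrive as raw 16-bit words are all illustrative.

#include <cstdint>
#include <cstring>
#include <vector>
#include "paddle/fluid/inference/api/paddle_inference_api.h"

paddle::PaddleTensor MakeBf16Feed(const std::vector<uint16_t>& bf16_bits) {
  paddle::PaddleTensor t;
  t.name = "x";  // hypothetical input name
  t.shape = {1, static_cast<int>(bf16_bits.size())};
  t.dtype = paddle::PaddleDType::BFLOAT16;
  t.data.Resize(bf16_bits.size() * sizeof(uint16_t));
  std::memcpy(t.data.data(), bf16_bits.data(), t.data.length());
  return t;  // then: predictor->Run({t}, &outputs);
}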
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -821,6 +821,7 @@ template void Tensor::ORTCopyToCpu<int32_t>(int32_t *data) const;
 template void Tensor::ORTCopyToCpu<uint8_t>(uint8_t *data) const;
 template void Tensor::ORTCopyToCpu<int8_t>(int8_t *data) const;
 template void Tensor::ORTCopyToCpu<float16>(float16 *data) const;
+template void Tensor::ORTCopyToCpu<bfloat16>(bfloat16 *data) const;
 #endif

 namespace experimental {
7 changes: 4 additions & 3 deletions paddle/fluid/operators/fused/attn_gemm_int8.h
@@ -16,12 +16,12 @@ limitations under the License. */

 #include <iostream>
 #include <vector>
-#include "paddle/fluid/operators/fused/cublaslt.h"
 #include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
 #include "paddle/phi/backends/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/kernels/funcs/broadcast_function.h"
+#include "paddle/phi/kernels/funcs/cublaslt.h"
 #include "paddle/phi/kernels/funcs/elementwise_functor.h"

 namespace paddle {
@@ -35,7 +35,8 @@ class AttnMatmulINT8 {
   AttnMatmulINT8(
       const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias)
       : dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) {
-    auto helper = std::make_shared<CublasLtHelper>(m, k, n);
+    auto helper = std::make_shared<phi::CublasLtHelper>(
+        m, k, n, dev_ctx.cublaslt_handle());
     helpers_.emplace_back(helper);
     gpu_config_ = std::make_unique<GpuLaunchConfig>(
         phi::backends::gpu::GetGpuLaunchConfig1D(
@@ -186,7 +187,7 @@
   int k_;  // k

   int compute_bias_;
-  std::vector<std::shared_ptr<CublasLtHelper>> helpers_;
+  std::vector<std::shared_ptr<phi::CublasLtHelper>> helpers_;
   std::unique_ptr<GpuLaunchConfig> gpu_config_;
 };
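The constructor now obtains its GEMM helper from phi::CublasLtHelper and passes in the cublasLt handle owned by the device context, rather than having the removed operator-side CublasLtHelper create its own handle. A rough sketch of how a fused kernel might instantiate the wrapper after this change; the function, dimensions, and dtype below are illustrative only and not taken from this PR.

#include "paddle/fluid/operators/fused/attn_gemm_int8.h"

// Hypothetical call site for illustration.
void BuildQkvGemm(const phi::GPUContext& dev_ctx,
                  int token_num, int hidden, int out_dim) {
  // m = rows of the activation, n = output width, k = reduction dimension.
  paddle::operators::AttnMatmulINT8<phi::dtype::float16> qkv_gemm(
      dev_ctx, /*m=*/token_num, /*n=*/out_dim, /*k=*/hidden, /*compute_bias=*/true);
  // Internally this now calls
  //   std::make_shared<phi::CublasLtHelper>(m, k, n, dev_ctx.cublaslt_handle());
  // so the int8 GEMM shares the handle managed by the GPU device context.
  (void)qkv_gemm;
}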