Skip to content

Commit b103868

Browse files
bukejiyu and Mangodadada
authored and committed
update cpu inference (PaddlePaddle#8984)
1 parent ac7c17f commit b103868

17 files changed

Lines changed: 2282 additions & 918 deletions

csrc/cpu/0001-fp16_bf16.patch

Lines changed: 619 additions & 0 deletions
Large diffs are not rendered by default.

csrc/cpu/0001-fp32.patch

Lines changed: 647 additions & 0 deletions
Large diffs are not rendered by default.

csrc/cpu/0001-patch-fp16-and-bf16.patch

Lines changed: 0 additions & 280 deletions
This file was deleted.

csrc/cpu/0001-patch-fp32.patch

Lines changed: 0 additions & 302 deletions
This file was deleted.

csrc/cpu/README.md

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,14 @@
11
# cpu-custom-ops
22

33
## 快速开始
4-
# 构建 cpu 自定义算子库
5-
```
6-
$ 前提条件:机器支持avx指令
7-
$ bash setup.sh
4+
5+
### 1.环境准备
6+
```shell
7+
# 查询机器是否支持 avx512指令
8+
lscpu | grep avx512*
89
```
10+
11+
### 2.安装 cpu 自定义算子和第三方库
12+
```shell
13+
#建议在 gcc 9.4.0 下安装第三方库
14+
bash setup.sh

csrc/cpu/setup.sh

Lines changed: 13 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,31 +12,35 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
#1. download XFT
15+
#0.环境准备:安装numactl
16+
# apt-get update
17+
# apt-get install numactl
18+
19+
# 1. download XFT
1620
if [ ! -d xFasterTransformer ]; then
17-
git clone --branch v1.7.2 https://github.com/intel/xFasterTransformer.git
21+
git clone https://github.com/intel/xFasterTransformer.git
1822
fi
1923

2024
#2.cp patch
2125
cd xFasterTransformer
22-
git checkout .
26+
git reset --hard 420a493f5c3c74f5fdd786f5399aacd04e021df7
2327
cd ..
2428

2529
if lscpu | grep -q "avx512_bf16"; then
2630
echo "apply bf16 and fp16."
27-
if [ ! -f 0001-patch-fp16-and-bf16.patch ]; then
28-
echo "Error: 0001-patch-fp16-and-bf16.patch not exist."
31+
if [ ! -f 0001-fp16_bf16.patch ]; then
32+
echo "Error: 0001-fp16_bf16.patch not exist."
2933
exit 1
3034
fi
3135
# apply patch
32-
cp ./0001-patch-fp16-and-bf16.patch ./xFasterTransformer/paddle.patch
36+
cp ./0001-fp16_bf16.patch ./xFasterTransformer/paddle.patch
3337
else
3438
echo "apply fp32 "
35-
if [ ! -f 0001-patch-fp32.patch ]; then
36-
echo "Error: does 0001-patch-fp32.patch not exist."
39+
if [ ! -f 0001-fp32.patch ]; then
40+
echo "Error: 0001-fp32.patch does not exist."
3741
exit 1
3842
fi
39-
cp ./0001-patch-fp32.patch ./xFasterTransformer/paddle.patch
43+
cp ./0001-fp32.patch ./xFasterTransformer/paddle.patch
4044
fi
4145

4246
#3. apply patch

csrc/cpu/src/avx_weight_only.cc

Lines changed: 160 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,160 @@
1+
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
#include "dtype.h"
15+
#include "matmul_helper.h"
16+
#include "my_types.h"
17+
#include "paddle/extension.h"
18+
git adtemplate <typename T>
19+
void AvxCompute(const paddle::Tensor &x,
20+
const paddle::Tensor &weight,
21+
bool trans,
22+
const std::string alog,
23+
paddle::Tensor &out,
24+
xft::Matrix<T> &quantizedWeight,
25+
xft::Vector<float> &WeightScale,
26+
xft::Vector<float> &WeightZero,
27+
xft::Vector<float> &WeightSum,
28+
MMHelper *mmHelper) {
29+
auto out_data = out.data<float>();
30+
const float *x_data = reinterpret_cast<const float *>(x.data<float>());
31+
const float *bias_data = nullptr;
32+
int m = 1;
33+
for (int i = 0; i < x.shape().size() - 1; i++) {
34+
m = m * x.shape()[i];
35+
}
36+
int k = x.shape()[x.shape().size() - 1];
37+
int l = weight.shape()[1];
38+
int n = weight.shape()[1];
39+
40+
mmHelper->compute(false,
41+
m,
42+
n,
43+
k,
44+
1.0f,
45+
x_data,
46+
k,
47+
quantizedWeight.Data(),
48+
WeightScale.Data(),
49+
WeightZero.Data(),
50+
WeightSum.Data(),
51+
0.0,
52+
out_data,
53+
l);
54+
};
55+
template <typename T>
56+
void AvxWeightOnly(const paddle::Tensor &x,
57+
const paddle::Tensor &weight,
58+
bool trans,
59+
const std::string alog,
60+
paddle::Tensor &out) {
61+
static std::unordered_map<std::string,
62+
std::tuple<xft::Matrix<T> *,
63+
xft::Vector<float> *,
64+
xft::Vector<float> *,
65+
xft::Vector<float> *>>
66+
weight_only_hub;
67+
std::stringstream weights_addr;
68+
weights_addr << weight.data<float>() << alog;
69+
std::string weight_only_key = weights_addr.str();
70+
auto it_created = weight_only_hub.find(weight_only_key);
71+
static MMHelper *mmHelper;
72+
int rows = weight.shape()[0], cols = weight.shape()[1];
73+
xft::Vector<float> *WeightScale =
74+
new xft::Vector<float>(); // if weight is int8
75+
xft::Vector<float> *WeightZero =
76+
new xft::Vector<float>(); // if weight is int8
77+
xft::Vector<float> *WeightSum =
78+
new xft::Vector<float>(); // if weight is int8
79+
xft::Matrix<T> *quantizedWeight = new xft::Matrix<T>();
80+
if (it_created == weight_only_hub.end()) {
81+
auto weight_ptr = reinterpret_cast<const float *>(weight.data<float>());
82+
xft::Matrix<T> convertedWeight;
83+
mmHelper = new MMHelper(xft::DeviceKind::iCPU, 0);
84+
mmHelper->convertWeight(trans,
85+
rows,
86+
cols,
87+
weight_ptr,
88+
nullptr,
89+
nullptr,
90+
convertedWeight,
91+
*WeightScale,
92+
*WeightZero,
93+
*WeightSum);
94+
quantizedWeight->Resize(rows, cols);
95+
mmHelper->packWeight(trans, convertedWeight, *quantizedWeight);
96+
weight_only_hub[weight_only_key] =
97+
std::make_tuple(quantizedWeight, WeightScale, WeightZero, WeightSum);
98+
AvxCompute<T>(x,
99+
weight,
100+
trans,
101+
alog,
102+
out,
103+
*quantizedWeight,
104+
*WeightScale,
105+
*WeightZero,
106+
*WeightSum,
107+
mmHelper);
108+
} else {
109+
AvxCompute<T>(x,
110+
weight,
111+
trans,
112+
alog,
113+
out,
114+
*(std::get<0>(it_created->second)),
115+
*(std::get<1>(it_created->second)),
116+
*(std::get<2>(it_created->second)),
117+
*(std::get<3>(it_created->second)),
118+
mmHelper);
119+
}
120+
}
121+
// Kernel entry for the avx_weight_only op: allocates the output tensor and
// dispatches on the requested algorithm.
//
// alog  : "int8" selects int8 weight quantization; anything else (including
//         "fp16") uses fp16 weights.
// trans : whether the weight is transposed when packed.
// Returns a single output tensor shaped like x with its last dim replaced by
// weight.shape()[1].
std::vector<paddle::Tensor> InvokeAvxWeightOnly(const paddle::Tensor &x,
                                                const paddle::Tensor &weight,
                                                const std::string &alog,
                                                bool trans) {
  auto out_shape = x.shape();
  out_shape[out_shape.size() - 1] = weight.shape()[1];
  auto out = paddle::empty(out_shape, x.dtype(), paddle::CPUPlace());
  if (alog == "int8") {
    AvxWeightOnly<int8_t>(x, weight, trans, alog, out);
  } else {
    // "fp16" and any unrecognized tag both fall back to fp16 weights — the
    // original's "fp16" branch and else branch were identical duplicates.
    AvxWeightOnly<float16_t>(x, weight, trans, alog, out);
  }
  return {out};
}
// Infer the output shape of avx_weight_only: collapse all leading dims of
// x_shape into one row count M, and take the weight's second dim as N.
// Returns {{M, weigh_shape[1]}}.
std::vector<std::vector<int64_t>> AvxWeightOnlyInferShape(
    std::vector<int64_t> x_shape,
    std::vector<int64_t> weigh_shape) {
  int64_t m = 1;
  // `i + 1 < size()` avoids the original `int i < size() - 1`, which mixed
  // signed/unsigned and had size()-1 wrap to SIZE_MAX on an empty shape.
  for (size_t i = 0; i + 1 < x_shape.size(); i++) {
    m = m * x_shape[i];
  }
  return {std::vector<int64_t>{m, weigh_shape[1]}};
}
std::vector<paddle::DataType> AvxWeightOnlyInferDtype(
149+
paddle::DataType x_dtype,
150+
paddle::DataType weight_dtype) {
151+
return {x_dtype};
152+
}
153+
154+
// Register the avx_weight_only custom op with PaddlePaddle.
// Attrs: "alog" selects the weight algorithm tag (dispatched on "int8" in the
// kernel; other values use fp16 weights); "trans" marks whether the weight is
// transposed when packed.
PD_BUILD_OP(avx_weight_only)
    .Inputs({"x", "weight"})
    .Outputs({"out"})
    .Attrs({"alog: std::string", "trans:bool"})
    .SetKernelFn(PD_KERNEL(InvokeAvxWeightOnly))
    .SetInferShapeFn(PD_INFER_SHAPE(AvxWeightOnlyInferShape))
    .SetInferDtypeFn(PD_INFER_DTYPE(AvxWeightOnlyInferDtype));

csrc/cpu/src/setup_cpu.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -53,19 +53,28 @@ def check_avx512_bf16__support():
5353
return False
5454

5555

56-
# cc flags
5756
paddle_extra_compile_args = [
5857
"-std=c++17",
5958
"-shared",
6059
"-fPIC",
6160
"-Wno-parentheses",
6261
"-DPADDLE_WITH_CUSTOM_KERNEL",
62+
"-mavx512f",
63+
"-mavx512vl",
64+
"-fopenmp",
65+
"-mavx512bw",
66+
"-mno-mmx",
67+
"-Wall",
68+
"-march=skylake-avx512",
69+
"-O3",
70+
"-g",
6371
]
6472

6573
if check_avx512_bf16__support():
6674
paddle_extra_compile_args += [
6775
"-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
68-
"-DAVX512_BF16_WEIGHT_ONLY_BF16=true",
76+
"-DAVX512_FP16_WEIGHT_ONLY_INT8=true",
77+
"-DAVX512_FP16_WEIGHT_ONLY_FP16=true",
6978
]
7079
else:
7180
paddle_extra_compile_args += [
@@ -81,15 +90,17 @@ def check_avx512_bf16__support():
8190

8291
# include path third_party
8392
paddle_custom_kernel_include += [
84-
os.path.join(XFT_INCLUDE_DIR, "include"), # glog
85-
os.path.join(XFT_INCLUDE_DIR, "src/common"), # src
86-
os.path.join(XFT_INCLUDE_DIR, "src/kernel"), # src
87-
os.path.join(XFT_INCLUDE_DIR, "src/layers"), # src
88-
os.path.join(XFT_INCLUDE_DIR, "src/models"), # src
89-
os.path.join(XFT_INCLUDE_DIR, "src/utils"), # src
90-
os.path.join(XFT_INCLUDE_DIR, "3rdparty/onednn/include"), # src
91-
os.path.join(XFT_INCLUDE_DIR, "3rdparty/onednn/build/include"), # src
92-
os.path.join(XFT_INCLUDE_DIR, "3rdparty/xdnn"), # src
93+
os.path.join(XFT_INCLUDE_DIR, "include"),
94+
os.path.join(XFT_INCLUDE_DIR, "src/common"),
95+
os.path.join(XFT_INCLUDE_DIR, "src/kernel"),
96+
os.path.join(XFT_INCLUDE_DIR, "src/layers"),
97+
os.path.join(XFT_INCLUDE_DIR, "src/models"),
98+
os.path.join(XFT_INCLUDE_DIR, "src/utils"),
99+
os.path.join(XFT_INCLUDE_DIR, "3rdparty/onednn/include"),
100+
os.path.join(XFT_INCLUDE_DIR, "3rdparty/onednn/build/include"),
101+
os.path.join(XFT_INCLUDE_DIR, "3rdparty/xdnn"),
102+
os.path.join(XFT_INCLUDE_DIR, "3rdparty"),
103+
os.path.join(XFT_INCLUDE_DIR, "3rdparty/mkl/include"),
93104
]
94105

95106
# libs path
@@ -101,11 +112,13 @@ def check_avx512_bf16__support():
101112

102113
custom_kernel_dot_module = CppExtension(
103114
sources=[
104-
"./src/xft_llama_layer.cc",
105115
"../generation/save_with_output.cc",
106116
"./src/token_penalty_multi_scores.cc",
107117
"./src/stop_generation_multi_ends.cc",
108118
"./src/set_value_by_flags.cc",
119+
"./src/xft_transformer.cc",
120+
"./src/avx_weight_only.cc",
121+
"./src/xft_greedy_search.cc",
109122
],
110123
include_dirs=paddle_custom_kernel_include,
111124
library_dirs=paddle_custom_kernel_library_dir,

csrc/cpu/src/token_penalty_multi_scores.cc

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,20 @@
11
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2-
//
2+
//
33
// Licensed under the Apache License, Version 2.0 (the "License");
44
// you may not use this file except in compliance with the License.
55
// You may obtain a copy of the License at
6-
//
6+
//
77
// http://www.apache.org/licenses/LICENSE-2.0
8-
//
8+
//
99
// Unless required by applicable law or agreed to in writing, software
1010
// distributed under the License is distributed on an "AS IS" BASIS,
1111
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

1515
#include <paddle/extension.h>
16-
1716
#include <vector>
1817

19-
#include "paddle/phi/core/kernel_registry.h"
20-
2118
template <typename T>
2219
void min_length_logits_process(T* logits,
2320
const int64_t* cur_len,

0 commit comments

Comments
 (0)