@@ -31,7 +31,7 @@ int getSMVersion() {
   sm_version = paddle::platform::GetGPUComputeCapability(
       paddle::platform::GetCurrentDeviceId());
 #else
-  PADDLE_THROW(paddle::platform::errors::Unavailable(
+  PADDLE_THROW(common::errors::Unavailable(
       "fused_weight_only_linear_pass needs paddle compiled with CUDA."));
 #endif
   return sm_version;
@@ -41,10 +41,14 @@ class FusedWeightOnlyLinearWithBiasPattern
     : public paddle::drr::DrrPatternBase {
  private:
   bool reverse_add_;
+  std::string algo_;
+  int sm_version_;
 
  public:
-  explicit FusedWeightOnlyLinearWithBiasPattern(bool reverse_add)
-      : reverse_add_(reverse_add) {}
+  FusedWeightOnlyLinearWithBiasPattern(bool reverse_add,
+                                       const std::string &algo,
+                                       int sm_version)
+      : reverse_add_(reverse_add), algo_(algo), sm_version_(sm_version) {}
 
   std::string name() const override {
     return "FusedWeightOnlyLinearWithBiasPattern";
@@ -104,19 +108,49 @@ class FusedWeightOnlyLinearWithBiasPattern
     //
     paddle::drr::ResultPattern res = src.ResultPattern();
 
-    const auto &weight_quantize =
-        res.Op(paddle::dialect::WeightQuantizeOp::name(),
-               {{"algo", res.StrAttr("weight_only_int8")},
-                {"arch", res.Int32Attr(getSMVersion())},
-                {"group_size", res.Int32Attr(-1)}});
-    weight_quantize({&res.Tensor("w")},
-                    {&res.Tensor("quanted_weight_tensor"),
-                     &res.Tensor("weight_scale_tensor")});
+    if (algo_ == "weight_only_int4") {
+      // TODO(liuyuanle): When the weight_quantize operator supports
+      // weight_only_int4 on GPU, delete the memory copy.
+      const auto &memcpy_d2h =
+          res.Op(paddle::dialect::MemcpyD2hOp::name(),
+                 {{"dst_place_type", res.Int32Attr(0 /*cpu*/)}});
+      res.Tensor("w_cpu") = memcpy_d2h(res.Tensor("w"));
+      const auto &weight_quantize =
+          res.Op(paddle::dialect::WeightQuantizeOp::name(),
+                 {{"algo", res.StrAttr(algo_)},
+                  {"arch", res.Int32Attr(sm_version_)},
+                  {"group_size", res.Int32Attr(-1)}});
+      weight_quantize({&res.Tensor("w_cpu")},
+                      {&res.Tensor("quanted_weight_tensor_cpu"),
+                       &res.Tensor("weight_scale_tensor_cpu")});
+
+      const auto &memcpy_h2d_1 =
+          res.Op(paddle::dialect::MemcpyH2dOp::name(),
+                 {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+      res.Tensor("quanted_weight_tensor") =
+          memcpy_h2d_1(res.Tensor("quanted_weight_tensor_cpu"));
+      const auto &memcpy_h2d_2 =
+          res.Op(paddle::dialect::MemcpyH2dOp::name(),
+                 {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+      res.Tensor("weight_scale_tensor") =
+          memcpy_h2d_2(res.Tensor("weight_scale_tensor_cpu"));
+    } else {
+      const auto &weight_quantize =
+          res.Op(paddle::dialect::WeightQuantizeOp::name(),
+                 {{"algo", res.StrAttr(algo_)},
+                  {"arch", res.Int32Attr(sm_version_)},
+                  {"group_size", res.Int32Attr(-1)}});
+
+      weight_quantize({&res.Tensor("w")},
+                      {&res.Tensor("quanted_weight_tensor"),
+                       &res.Tensor("weight_scale_tensor")});
+    }
 
     const auto &weight_only_linear =
         res.Op(paddle::dialect::WeightOnlyLinearOp::name(),
-               {{"weight_dtype", res.StrAttr("int8")},
-                {"arch", res.Int32Attr(getSMVersion())},
+               {{"weight_dtype",
+                 res.StrAttr(algo_ == "weight_only_int8" ? "int8" : "int4")},
+                {"arch", res.Int32Attr(sm_version_)},
                 {"group_size", res.Int32Attr(-1)}});
     weight_only_linear({&res.Tensor("x"),
                         &res.Tensor("quanted_weight_tensor"),
@@ -127,6 +161,14 @@ class FusedWeightOnlyLinearWithBiasPattern
 };
 
 class FusedWeightOnlyLinearNoBiasPattern : public paddle::drr::DrrPatternBase {
+ private:
+  std::string algo_;
+  int sm_version_;
+
+ public:
+  FusedWeightOnlyLinearNoBiasPattern(const std::string &algo, int sm_version)
+      : algo_(algo), sm_version_(sm_version) {}
+
  public:
   std::string name() const override {
     return "FusedWeightOnlyLinearNoBiasPattern";
@@ -179,19 +221,48 @@ class FusedWeightOnlyLinearNoBiasPattern : public paddle::drr::DrrPatternBase {
     //
     paddle::drr::ResultPattern res = src.ResultPattern();
 
-    const auto &weight_quantize =
-        res.Op(paddle::dialect::WeightQuantizeOp::name(),
-               {{"algo", res.StrAttr("weight_only_int8")},
-                {"arch", res.Int32Attr(getSMVersion())},
-                {"group_size", res.Int32Attr(-1)}});
-    weight_quantize({&res.Tensor("w")},
-                    {&res.Tensor("quanted_weight_tensor"),
-                     &res.Tensor("weight_scale_tensor")});
-
+    if (algo_ == "weight_only_int4") {
+      // TODO(liuyuanle): When the weight_quantize operator supports
+      // weight_only_int4 on GPU, delete the memory copy.
+      const auto &memcpy_d2h =
+          res.Op(paddle::dialect::MemcpyD2hOp::name(),
+                 {{"dst_place_type", res.Int32Attr(0 /*cpu*/)}});
+      res.Tensor("w_cpu") = memcpy_d2h(res.Tensor("w"));
+      const auto &weight_quantize =
+          res.Op(paddle::dialect::WeightQuantizeOp::name(),
+                 {{"algo", res.StrAttr(algo_)},
+                  {"arch", res.Int32Attr(sm_version_)},
+                  {"group_size", res.Int32Attr(-1)}});
+      weight_quantize({&res.Tensor("w_cpu")},
+                      {&res.Tensor("quanted_weight_tensor_cpu"),
+                       &res.Tensor("weight_scale_tensor_cpu")});
+
+      const auto &memcpy_h2d_1 =
+          res.Op(paddle::dialect::MemcpyH2dOp::name(),
+                 {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+      res.Tensor("quanted_weight_tensor") =
+          memcpy_h2d_1(res.Tensor("quanted_weight_tensor_cpu"));
+      const auto &memcpy_h2d_2 =
+          res.Op(paddle::dialect::MemcpyH2dOp::name(),
+                 {{"dst_place_type", res.Int32Attr(1 /*gpu*/)}});
+      res.Tensor("weight_scale_tensor") =
+          memcpy_h2d_2(res.Tensor("weight_scale_tensor_cpu"));
+    } else {
+      const auto &weight_quantize =
+          res.Op(paddle::dialect::WeightQuantizeOp::name(),
+                 {{"algo", res.StrAttr(algo_)},
+                  {"arch", res.Int32Attr(sm_version_)},
+                  {"group_size", res.Int32Attr(-1)}});
+
+      weight_quantize({&res.Tensor("w")},
+                      {&res.Tensor("quanted_weight_tensor"),
+                       &res.Tensor("weight_scale_tensor")});
+    }
     const auto &weight_only_linear =
         res.Op(paddle::dialect::WeightOnlyLinearOp::name(),
-               {{"weight_dtype", res.StrAttr("int8")},
-                {"arch", res.Int32Attr(getSMVersion())},
+               {{"weight_dtype",
+                 res.StrAttr(algo_ == "weight_only_int8" ? "int8" : "int4")},
+                {"arch", res.Int32Attr(sm_version_)},
                 {"group_size", res.Int32Attr(-1)}});
     weight_only_linear({&res.Tensor("x"),
                         &res.Tensor("quanted_weight_tensor"),
@@ -204,15 +275,28 @@ class FusedWeightOnlyLinearNoBiasPattern : public paddle::drr::DrrPatternBase {
 class FusedWeightOnlyLinearPass : public pir::PatternRewritePass {
  public:
   FusedWeightOnlyLinearPass()
-      : pir::PatternRewritePass("fused_weight_only_linear_pass", 4) {}
+      : pir::PatternRewritePass("fused_weight_only_linear_pass", 4),
+        sm_version_(getSMVersion()) {}
 
   pir::RewritePatternSet InitializePatterns(pir::IrContext *context) override {
+    std::string algo = "weight_only_int4";
+    if (Has("weight_only_algo")) {
+      algo = Get<std::string>("weight_only_algo");
+    }
+    PADDLE_ENFORCE_EQ(algo == "weight_only_int8" || algo == "weight_only_int4",
+                      true,
+                      common::errors::InvalidArgument(
+                          "fused_weight_only_linear_pass only supports "
+                          "weight_only_int8 or weight_only_int4, but got %s.",
+                          algo));
+
     pir::RewritePatternSet ps(context);
-    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearWithBiasPattern>(context,
-                                                                     true));
-    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearWithBiasPattern>(context,
-                                                                     false));
-    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearNoBiasPattern>(context));
+    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearWithBiasPattern>(
+        context, true, algo, sm_version_));
+    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearWithBiasPattern>(
+        context, false, algo, sm_version_));
+    ps.Add(paddle::drr::Create<FusedWeightOnlyLinearNoBiasPattern>(
+        context, algo, sm_version_));
     return ps;
   }
@@ -228,15 +312,15 @@ class FusedWeightOnlyLinearPass : public pir::PatternRewritePass {
   }
 
   bool CanApplyOn(pir::Operation *op) const override {
-    int sm_version = getSMVersion();
-    if (sm_version != 70 && sm_version != 75 && sm_version != 80 &&
-        sm_version != 86) {
+    if (sm_version_ != 70 && sm_version_ != 75 && sm_version_ != 80 &&
+        sm_version_ != 86) {
       return false;
     }
     return op->num_regions() > 0;
   }
 
  private:
+  int sm_version_;
   pir::FrozenRewritePatternSet patterns_;
 };
 
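For context, here is a minimal caller-side sketch of how the new "weight_only_algo" attribute could be supplied when scheduling this pass. It is not part of the diff: the CreateFusedWeightOnlyLinearPass() factory, the include paths, and the pointer-owning pir::Pass::Set() call are assumptions inferred from the Has()/Get<std::string>() usage above and Paddle's usual PIR pass conventions.

#include <memory>
#include <string>
#include <utility>

#include "paddle/fluid/pir/transforms/fusion/fused_weight_only_linear_pass.h"  // assumed path
#include "paddle/pir/pass/pass_manager.h"                                       // assumed path

void RunFusedWeightOnlyLinear(pir::Program *program) {
  pir::IrContext *ctx = pir::IrContext::Instance();
  pir::PassManager pm(ctx);

  // Hypothetical factory for the pass registered as
  // "fused_weight_only_linear_pass".
  std::unique_ptr<pir::Pass> pass = pir::CreateFusedWeightOnlyLinearPass();

  // Without this attribute the pass defaults to "weight_only_int4"; any
  // value other than weight_only_int8/weight_only_int4 trips the
  // PADDLE_ENFORCE_EQ check in InitializePatterns.
  pass->Set("weight_only_algo", new std::string("weight_only_int8"));

  pm.AddPass(std::move(pass));
  pm.Run(program);  // No-op unless sm_version_ is 70/75/80/86 (CanApplyOn).
}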