Skip to content

Commit 2098ee0

Browse files
authored
[xpu] Fc int31 (#7514)
* [xpu] fix continuous encoder fuse and fc max size
* [xpu] refactor fc int31 for KL2
1 parent b80b8b2 commit 2098ee0

File tree

3 files changed

+34
-31
lines changed

3 files changed

+34
-31
lines changed

lite/core/optimizer/mir/fusion/__xpu__multi_encoder_fuse_pass.cc

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -665,21 +665,24 @@ class XPUMultiEncoderFuser {
665665

666666
void operator()(SSAGraph* graph) {
667667
std::vector<Node*> all_encoders;
668-
for (auto* node : graph->StmtTopologicalOrder()) {
669-
CHECK(node->IsStmt());
670-
if (node->stmt()->op_info()->Type() == "single_encoder") {
671-
if (all_encoders.empty() ||
672-
IsDirectPredecessorOf(all_encoders.back(), node)) {
673-
all_encoders.push_back(node);
674-
} else {
675-
break;
668+
// if no node linked from all_encoders.back(), search is over
669+
int encoder_num = 0;
670+
do {
671+
encoder_num = all_encoders.size();
672+
for (auto* node : graph->StmtTopologicalOrder()) {
673+
CHECK(node->IsStmt());
674+
if (node->stmt()->op_info()->Type() == "single_encoder") {
675+
if (all_encoders.empty() ||
676+
IsDirectPredecessorOf(all_encoders.back(), node)) {
677+
all_encoders.push_back(node);
678+
}
676679
}
677680
}
678-
}
679-
VLOG(3) << "Found continuous " << all_encoders.size() << " single_encoder";
681+
} while (encoder_num != all_encoders.size());
680682
if (all_encoders.size() == 0) {
681683
return;
682684
}
685+
VLOG(3) << "Found continuous " << all_encoders.size() << " single_encoder";
683686

684687
const bool enable_int8 =
685688
all_encoders[0]->stmt()->op_info()->HasAttr("enable_int8") &&

lite/kernels/xpu/__xpu__fc_compute.cc

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,7 @@ void XPUFcCompute::Run() {
109109
int n = param.w->dims()[1];
110110
bool quant_int8 = param.quant_w_max > 0.f;
111111

112+
param.output_max->Resize({lite::XPU_QUANT_SCALE_NUM});
112113
float* output_max = quant_int8
113114
? nullptr
114115
: param.output_max->mutable_data<float>(TARGET(kXPU));
@@ -125,26 +126,26 @@ void XPUFcCompute::Run() {
125126
}
126127
// TODO(weihaoji): remove fc_int31 and fc_int16 after xpu fc wrapper refactor
127128
if (param.precision == "int31") {
128-
int r = xdnn::fc_int31(
129-
ctx.GetRawContext(), /* context */
130-
false, /* TransA */
131-
true, /* TransB */
132-
m, /* m */
133-
n, /* n */
134-
k, /* k */
135-
1.0f, /* alpha */
136-
param.input->data<float>(), /* A */
137-
nullptr, /* max_a ptr */
138-
reinterpret_cast<const float*>(quant_weight_guard_->addr_), /* B */
139-
w_max, /* max_b */
140-
0.0f, /* beta */
141-
param.output->mutable_data<float>(TARGET(kXPU)), /* C */
142-
nullptr, /* max_c ptr */
143-
bias, /* bias */
144-
act /* act_type */);
145-
CHECK_EQ(r, 0);
146-
r = xdnn::findmax<float>(
147-
ctx.GetRawContext(), param.output->data<float>(), m * n, output_max);
129+
int r = xdnn::fc_fusion<float, float, float, int>(
130+
ctx.GetRawContext(), // ctx
131+
param.input->data<float>(), // x
132+
reinterpret_cast<const float*>(quant_weight_guard_->addr_), // w
133+
param.output->mutable_data<float>(TARGET(kXPU)), // y
134+
m, // m
135+
n, // n
136+
k, // k
137+
false, // x_trans
138+
true, // w_trans
139+
input_max, // x_maxptr
140+
reinterpret_cast<const float*>(weight_max_guard_->addr_), // w_maxptr
141+
output_max, // y_maxptr
142+
k, // ldx
143+
k, // ldw
144+
n, // ldy
145+
1.0f, // alpha
146+
0.0f, // beta
147+
bias, // bias
148+
act);
148149
CHECK_EQ(r, 0);
149150
} else if (param.precision == "int16") {
150151
int r = 0;

lite/operators/__xpu__fc_op.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,6 @@ bool XPUFcOp::InferShapeImpl() const {
6262
}
6363
output_dims[in_num_col_dims] = w_dims_1;
6464
param_.output->Resize(output_dims);
65-
param_.output_max->Resize({4});
6665

6766
// share LoD
6867
param_.output->set_lod(param_.input->lod());

0 commit comments

Comments (0)