From 84eb23f3cbca3e88fb1bfdc5ca6e020acfcb8298 Mon Sep 17 00:00:00 2001
From: nhzlx <nhzlx.dragon@gmail.com>
Date: Thu, 7 Mar 2019 05:43:58 +0000
Subject: [PATCH 1/5] add simplify detection patten pass for anakin add density
 prior box op converter add detection out op converter

---
 paddle/fluid/framework/ir/CMakeLists.txt      |   5 +
 .../framework/ir/graph_pattern_detector.cc    | 130 ++++++++++
 .../framework/ir/graph_pattern_detector.h     |  18 ++
 .../simplify_anakin_detection_pattern_pass.cc | 233 ++++++++++++++++++
 .../simplify_anakin_detection_pattern_pass.h  |  41 +++
 .../inference/anakin/convert/CMakeLists.txt   |   2 +-
 .../anakin/convert/density_prior_box.cc       |  75 ++++++
 .../anakin/convert/density_prior_box.h        |  37 +++
 .../inference/anakin/convert/detection_out.cc |  65 +++++
 .../inference/anakin/convert/detection_out.h  |  37 +++
 paddle/fluid/inference/anakin/op_teller.cc    |   6 +-
 .../fluid/inference/api/analysis_predictor.cc |   5 +-
 .../inference/api/paddle_pass_builder.cc      |   3 +-
 13 files changed, 652 insertions(+), 5 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
 create mode 100644 paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
 create mode 100644 paddle/fluid/inference/anakin/convert/density_prior_box.cc
 create mode 100644 paddle/fluid/inference/anakin/convert/density_prior_box.h
 create mode 100644 paddle/fluid/inference/anakin/convert/detection_out.cc
 create mode 100644 paddle/fluid/inference/anakin/convert/detection_out.h
diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index ca6b0229e906c0..9085deebca54d8 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -66,6 +66,7 @@ pass_library(conv_elementwise_add_fuse_pass inference)
 pass_library(conv_affine_channel_fuse_pass inference)
 pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
+pass_library(simplify_anakin_detection_pattern_pass inference)
 
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -76,6 +77,10 @@ foreach (index RANGE 3 6)
    file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
 endforeach()
 
+foreach (index RANGE 3 6)
+   file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
+endforeach()
+
 if(WITH_MKLDNN)
     pass_library(mkldnn_placement_pass base mkldnn)
     pass_library(depthwise_conv_mkldnn_pass base mkldnn)
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index c0c34d186b0081..9baf777ff59aea 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1364,6 +1364,136 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
   return concat_out;
 }
 
+PDNode *patterns::AnakinDetectionPattern::operator()(
+    std::vector<PDNode *> conv_in, int times) {
+  // The times represents the repeat times of the
+  // {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape}
+  const int kNumFields = 7;
+  const int kPriorBoxLocOffset = 1;
+  const int kReshape1Offset = 2;
+  const int kReshape1OutOffset = 3;
+  const int kPriorBoxVarOffset = 4;
+  const int kReshape2Offset = 5;
+  const int kReshape2OutOffset = 6;
+
+  const int kBoxCoderThirdInputOffset = times;
+  const int kMultiClassSecondInputNmsOffset = times + 1;
+
+  std::vector<PDNode *> nodes;
+
+  for (int i = 0; i < times; i++) {
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
+            ->assert_is_op("density_prior_box"));
+    nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
+                        ->assert_is_op_output("density_prior_box", "Boxes")
+                        ->assert_is_op_input("reshape2", "X")
+                        ->AsIntermediate());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
+            ->assert_is_op("reshape2"));
+
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
+            ->assert_is_op_output("reshape2")
+            ->assert_is_op_nth_input("concat", "X", i)
+            ->AsIntermediate());
+
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
+            ->assert_is_op_output("density_prior_box", "Variances")
+            ->assert_is_op_input("reshape2", "X")
+            ->AsIntermediate());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
+            ->assert_is_op("reshape2"));
+
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
+            ->assert_is_op_output("reshape2")
+            ->assert_is_op_nth_input("concat", "X", i)
+            ->AsIntermediate());
+  }
+
+  auto concat_op1 = pattern->NewNode(GetNodeName("concat1"))
+                        ->assert_is_op("concat")
+                        ->assert_op_has_n_inputs("concat", times);
+  auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out"))
+                         ->assert_is_op_output("concat")
+                         ->AsIntermediate();
+
+  auto concat_op2 = pattern->NewNode(GetNodeName("concat2"))
+                        ->assert_is_op("concat")
+                        ->assert_op_has_n_inputs("concat", times);
+  auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out"))
+                         ->assert_is_op_output("concat")
+                         ->AsIntermediate();
+
+  auto box_coder_op = pattern->NewNode(GetNodeName("box_coder"))
+                          ->assert_is_op("box_coder")
+                          ->assert_op_has_n_inputs("box_coder", 3);
+
+  auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out"))
+                           ->assert_is_op_output("box_coder")
+                           ->AsIntermediate();
+
+  auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
+                               ->assert_is_op("multiclass_nms")
+                               ->assert_op_has_n_inputs("multiclass_nms", 2);
+
+  auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out"))
+                                ->assert_is_op_output("multiclass_nms")
+                                ->AsOutput();
+
+  std::vector<PDNode *> reshape1_outs;
+  std::vector<PDNode *> reshape2_outs;
+
+  for (int i = 0; i < times; i++) {
+    conv_in[i]->AsInput();
+    // prior_box
+    nodes[i * kNumFields]->LinksFrom({conv_in[i]});
+    // prior_box box out
+    nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom(
+        {nodes[i * kNumFields]});
+    // reshape
+    nodes[i * kNumFields + kReshape1Offset]->LinksFrom(
+        {nodes[i * kNumFields + kPriorBoxLocOffset]});
+    // reshape_out
+    nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kReshape1Offset]});
+
+    nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom(
+        {nodes[i * kNumFields]});
+    // reshape
+    nodes[i * kNumFields + kReshape2Offset]->LinksFrom(
+        {nodes[i * kNumFields + kPriorBoxVarOffset]});
+    // reshape_out
+    nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kReshape2Offset]});
+
+    reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]);
+    reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]);
+  }
+
+  concat_op1->LinksFrom(reshape1_outs);
+  concat_op2->LinksFrom(reshape2_outs);
+  concat_out1->LinksFrom({concat_op1});
+  concat_out2->LinksFrom({concat_op2});
+
+  conv_in[kBoxCoderThirdInputOffset]->AsInput();
+  conv_in[kMultiClassSecondInputNmsOffset]->AsInput();
+
+  box_coder_op->LinksFrom(
+      {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
+  box_coder_out->LinksFrom({box_coder_op});
+
+  multiclass_nms_op
+      ->LinksFrom({box_coder_out, conv_in[kMultiClassSecondInputNmsOffset]})
+      .LinksTo({multiclass_nms_out});
+
+  return multiclass_nms_out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index c8be586f546dc6..06df547d4e6ac7 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -18,8 +18,11 @@
 #include <gtest/gtest_prod.h>
 #endif
 
+#include <memory>
 #include <numeric>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
@@ -781,6 +784,21 @@ struct TransposeFlattenConcat : public PatternBase {
   }
 };
 
+struct AnakinDetectionPattern : public PatternBase {
+  AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
+
+  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
+
+  std::string GetNodeName(const std::string& op_type) {
+    return PDNodeName(name_scope_, repr_, id_, op_type);
+  }
+
+  PDNode* GetPDNode(const std::string& op_type) {
+    return pattern->RetrieveNode(GetNodeName(op_type));
+  }
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
new file mode 100644
index 00000000000000..462f077eb7a829
--- /dev/null
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
@@ -0,0 +1,233 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+template <int times>
+std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name =
+      "simplify_anakin_detection_pattern_pass" + std::to_string(times);
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  std::vector<PDNode *> input_nodes;
+  for (int i = 0; i < times; i++) {
+    input_nodes.push_back(gpd.mutable_pattern()
+                              ->NewNode("x" + std::to_string(i))
+                              ->assert_is_op_input("density_prior_box", "Input")
+                              ->AsInput());
+  }
+  input_nodes.push_back(gpd.mutable_pattern()
+                            ->NewNode("x" + std::to_string(times))
+                            ->assert_is_op_input("box_coder", "TargetBox")
+                            ->AsInput());
+
+  input_nodes.push_back(gpd.mutable_pattern()
+                            ->NewNode("x" + std::to_string(times + 1))
+                            ->assert_is_op_input("multiclass_nms", "Scores")
+                            ->AsInput());
+
+  patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(input_nodes, times);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    const int kNumFields = 7;
+    const int kPriorBoxLocOffset = 1;
+    const int kReshape1Offset = 2;
+    const int kReshape1OutOffset = 3;
+    const int kPriorBoxVarOffset = 4;
+    const int kReshape2Offset = 5;
+    const int kReshape2OutOffset = 6;
+    std::vector<Node *> nodes;
+
+    for (int i = 0; i < times; i++) {
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
+
+      PADDLE_ENFORCE(
+          subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
+
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i))));
+      nodes.push_back(
+          subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i))));
+    }
+
+    Node *concat_op1 = subgraph.at(pattern.GetPDNode("concat1"));
+    Node *concat_out1 = subgraph.at(pattern.GetPDNode("concat1_out"));
+
+    Node *concat_op2 = subgraph.at(pattern.GetPDNode("concat2"));
+    Node *concat_out2 = subgraph.at(pattern.GetPDNode("concat2_out"));
+
+    Node *box_coder_third_input = subgraph.at(input_nodes[times]);
+    Node *box_coder_op = subgraph.at(pattern.GetPDNode("box_coder"));
+    Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out"));
+
+    Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
+    Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms"));
+    Node *multiclass_nms_out =
+        subgraph.at(pattern.GetPDNode("multiclass_nms_out"));
+
+    std::string code_type =
+        boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
+    bool box_normalized =
+        boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
+    // auto variance =
+    // boost::get<std::vector<float>>(box_coder_op->Op()->GetAttr("variance"));
+    int background_label =
+        boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
+    float score_threshold =
+        boost::get<float>(multiclass_nms->Op()->GetAttr("score_threshold"));
+    int nms_top_k = boost::get<int>(multiclass_nms->Op()->GetAttr("nms_top_k"));
+    float nms_threshold =
+        boost::get<float>(multiclass_nms->Op()->GetAttr("nms_threshold"));
+    float nms_eta = boost::get<float>(multiclass_nms->Op()->GetAttr("nms_eta"));
+    int keep_top_k =
+        boost::get<int>(multiclass_nms->Op()->GetAttr("keep_top_k"));
+
+    std::vector<std::string> concat1_input_names;
+    for (int i = 0; i < times; i++) {
+      concat1_input_names.push_back(
+          nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
+    }
+
+    int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
+    framework::OpDesc concat1_desc;
+    concat1_desc.SetType("concat");
+    concat1_desc.SetInput("X", concat1_input_names);
+    concat1_desc.SetAttr("axis", axis);
+    concat1_desc.SetOutput("Out", {concat_out1->Name()});
+
+    auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc);
+
+    for (int i = 0; i < times; i++) {
+      nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(
+          new_add_concat_op);
+      new_add_concat_op->inputs.push_back(
+          nodes[i * kNumFields + kPriorBoxLocOffset]);
+    }
+
+    framework::OpDesc new_op_desc;
+    new_op_desc.SetType("detection_out");
+    new_op_desc.SetInput("PriorBox", {concat_out1->Name()});
+    new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()});
+    new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()});
+    new_op_desc.SetAttr("code_type", code_type);
+    new_op_desc.SetAttr("box_normalized", box_normalized);
+    new_op_desc.SetAttr("background_label", background_label);
+    new_op_desc.SetAttr("score_threshold", score_threshold);
+    new_op_desc.SetAttr("nms_top_k", nms_top_k);
+    new_op_desc.SetAttr("nms_threshold", nms_threshold);
+    new_op_desc.SetAttr("nms_eta", nms_eta);
+    new_op_desc.SetAttr("keep_top_k", keep_top_k);
+    new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()});
+    new_op_desc.Flush();
+
+    // Create a new node for the fused op.
+    auto *detection_out_op = graph->CreateOpNode(&new_op_desc);
+
+    std::unordered_set<const Node *> delete_nodes;
+
+    for (int i = 0; i < times; i++) {
+      nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(concat_op1);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]);
+    }
+
+    delete_nodes.insert(concat_op1);
+    delete_nodes.insert(concat_op2);
+    delete_nodes.insert(concat_out2);
+    delete_nodes.insert(box_coder_op);
+    delete_nodes.insert(box_coder_out);
+    delete_nodes.insert(multiclass_nms);
+
+    new_add_concat_op->outputs.push_back(concat_out1);
+    concat_out1->inputs.push_back(new_add_concat_op);
+
+    detection_out_op->inputs.push_back(concat_out1);
+    detection_out_op->inputs.push_back(box_coder_third_input);
+    detection_out_op->inputs.push_back(multiclass_nms_second_input);
+    detection_out_op->outputs.push_back(multiclass_nms_out);
+
+    concat_out1->outputs.push_back(detection_out_op);
+    box_coder_third_input->outputs.push_back(detection_out_op);
+    multiclass_nms_second_input->outputs.push_back(detection_out_op);
+    multiclass_nms_out->inputs.push_back(detection_out_op);
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(), delete_nodes);
+  };
+
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+template class SimplifyAnakinDetectionPatternPass<1>;
+template class SimplifyAnakinDetectionPatternPass<3>;
+template class SimplifyAnakinDetectionPatternPass<4>;
+template class SimplifyAnakinDetectionPatternPass<5>;
+template class SimplifyAnakinDetectionPatternPass<6>;
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
new file mode 100644
index 00000000000000..2338e4c38b253e
--- /dev/null
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
@@ -0,0 +1,41 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <unordered_set>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// There may be many transpose-flatten structures in a model, and the output of
+// these structures will be used as inputs to the concat Op. This pattern will
+// be detected by our pass. The times here represents the repeat times of this
+// structure.
+template <int times>
+class SimplifyAnakinDetectionPatternPass : public FusePassBase {
+ public:
+  virtual ~SimplifyAnakinDetectionPatternPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
index dc7fb7f79c0437..4f32d22cfa7996 100644
--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
@@ -1,5 +1,5 @@
 cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
-elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc DEPS anakin_engine framework_proto scope op_registry)
+ elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc density_prior_box.cc detection_out.cc DEPS anakin_engine framework_proto scope op_registry)
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL)
 cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL)
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
new file mode 100644
index 00000000000000..9fdea12dcf8edc
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
@@ -0,0 +1,75 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/density_prior_box.h"
+#include <algorithm>
+#include <map>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
+                                            const framework::Scope &scope,
+                                            bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  auto input_name = op_desc.Input("Input").front();
+  auto image_name = op_desc.Input("Image").front();
+  auto output_name = op_desc.Output("Boxes").front();
+
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  auto fixed_sizes =
+      boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
+  auto fixed_ratios =
+      boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
+  auto densities = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+
+  // lack flip
+  auto clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  auto variances = boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
+
+  // lack img_h, img_w
+  auto step_h = boost::get<float>(op_desc.GetAttr("step_h"));
+  auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
+  auto offset = boost::get<float>(op_desc.GetAttr("offset"));
+  std::vector<std::string> order = {"MIN", "COM", "MAX"};
+
+  engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_sizes", fixed_sizes);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratios", fixed_ratios);
+  engine_->AddOpAttr<PTuple<int>>(op_name, "density", densities);
+  engine_->AddOpAttr(op_name, "flip", false);
+  engine_->AddOpAttr(op_name, "clip", clip);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "variances", variances);
+  engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
+  engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
+  engine_->AddOpAttr(op_name, "step_h", step_h);
+  engine_->AddOpAttr(op_name, "step_w", step_w);
+  engine_->AddOpAttr(op_name, "offset", offset);
+  engine_->AddOpAttr<PTuple<std::string>>(op_name, "order", order);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h
new file mode 100644
index 00000000000000..44265cbf2e968e
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class DensityPriorBoxOpConverter : public AnakinOpConverter {
+ public:
+  DensityPriorBoxOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~DensityPriorBoxOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc
new file mode 100644
index 00000000000000..1dae346a02443b
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/detection_out.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/detection_out.h"
+#include <algorithm>
+#include <map>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::Scope &scope,
+                                         bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  auto target_name = op_desc.Input("TargetBox").front();
+  auto prior_box_name = op_desc.Input("PriorBox").front();
+  auto scores_name = op_desc.Input("Scores").front();
+  auto output_name = op_desc.Output("Out").front();
+
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  auto code_type = boost::get<std::string>(op_desc.GetAttr("code_type"));
+  auto background_label = boost::get<int>(op_desc.GetAttr("background_label"));
+  auto score_threshold = boost::get<float>(op_desc.GetAttr("score_threshold"));
+  auto nms_top_k = boost::get<float>(op_desc.GetAttr("nms_top_k"));
+  auto nms_threshold = boost::get<float>(op_desc.GetAttr("nms_threshold"));
+  auto nms_eta = boost::get<float>(op_desc.GetAttr("nms_eta"));
+  auto keep_top_k = boost::get<int>(op_desc.GetAttr("keep_top_k"));
+
+  engine_->AddOp(op_name, "DetectionOutput",
+                 {target_name, scores_name, prior_box_name}, {output_name});
+  engine_->AddOpAttr(op_name, "share_location", true);
+  engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
+  engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
+  engine_->AddOpAttr(op_name, "background_id", background_label);
+  engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
+  engine_->AddOpAttr(op_name, "code_type", code_type);
+  engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
+  engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
+  engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
+  engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h
new file mode 100644
index 00000000000000..5bf1c3ecbc8979
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/detection_out.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class DetectionOutOpConverter : public AnakinOpConverter {
+ public:
+  DetectionOutOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~DetectionOutOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc
index 015bcc066e1621..01ceef696ddc9b 100644
--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ b/paddle/fluid/inference/anakin/op_teller.cc
@@ -21,7 +21,7 @@ namespace anakin {
 // Just tell by the op_types.
 struct SimpleOpTypeSetTeller : public Teller {
   SimpleOpTypeSetTeller() {
-    // teller_set.insert("mul");
+    teller_set.insert("mul");
     teller_set.insert("fc");
     teller_set.insert("conv2d_fusion");
     teller_set.insert("split");
@@ -30,7 +30,9 @@ struct SimpleOpTypeSetTeller : public Teller {
     teller_set.insert("elementwise_add");
     teller_set.insert("concat");
     teller_set.insert("tanh");
-    // teller_set.insert("conv2d");
+    teller_set.insert("conv2d");
+    teller_set.insert("detection_out");
+    teller_set.insert("density_prior_box");
   }
 
   bool operator()(const std::string& op_type,
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 87964d6b52728b..fa64786d706075 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -768,13 +768,16 @@ USE_TRT_CONVERTER(conv2d_transpose);
 USE_TRT_CONVERTER(leaky_relu);
 #endif
 
+USE_ANAKIN_CONVERTER(mul);
 USE_ANAKIN_CONVERTER(fc);
 USE_ANAKIN_CONVERTER(conv2d);
+USE_ANAKIN_CONVERTER(conv2d_fusion);
 USE_ANAKIN_CONVERTER(concat);
 USE_ANAKIN_CONVERTER(split);
 USE_ANAKIN_CONVERTER(relu);
 USE_ANAKIN_CONVERTER(sigmoid);
 USE_ANAKIN_CONVERTER(tanh);
 USE_ANAKIN_CONVERTER(pool2d);
-USE_ANAKIN_CONVERTER(conv2d_fusion);
 USE_ANAKIN_CONVERTER(elementwise_add);
+USE_ANAKIN_CONVERTER(detection_out);
+USE_ANAKIN_CONVERTER(density_prior_box);
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index b9615392c5b819..f723f0ef0605b0 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -71,7 +71,8 @@ void GpuPassStrategy::EnableMKLDNN() {
 GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   passes_.assign({
       "infer_clean_graph_pass",  //
-      "fc_fuse_pass", "conv_elementwise_add_fuse_pass", "anakin_subgraph_pass",
+      "simplify_anakin_detection_pattern_pass3", "fc_fuse_pass",
+      "conv_elementwise_add_fuse_pass", "anakin_subgraph_pass",
       //     "identity_scale_op_clean_pass",              //
       /*
              "conv_affine_channel_fuse_pass",             //

From f8dcbd568e473646cc98a2f9aa8571f69017141d Mon Sep 17 00:00:00 2001
From: flame <fuchang1991@gmail.com>
Date: Thu, 7 Mar 2019 20:39:06 +0800
Subject: [PATCH 2/5] batch norm (#16110)

* use anakin batch norm and scale implement fluid batch norm
---
 .../inference/anakin/convert/batch_norm.cc    | 97 ++++++++++---------
 .../anakin/convert/test_batch_norm_op.cc      |  1 -
 2 files changed, 50 insertions(+), 48 deletions(-)

diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc
index 5ebcefb8a2dc5b..bb5d406f981d40 100644
--- a/paddle/fluid/inference/anakin/convert/batch_norm.cc
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc
@@ -41,16 +41,15 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
 
   auto output = op_desc.Output("Y").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
-  engine_->AddOp(op_name, "Scale", {inputs["X"]}, {output});
-  engine_->AddOpAttr(op_name, "bias_term", true);
-  engine_->AddOpAttr(op_name, "axis", 1);
-  engine_->AddOpAttr(op_name, "num_axes", 1);
-
   bool is_test = boost::get<bool>(op_desc.GetAttr("is_test"));
-  PADDLE_ENFORCE(is_test);
-  float epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
-  engine_->AddOpAttr(op_name, "epsilon", epsilon);
+  auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
+
+  auto bn_op_name = op_name + ":bn";
+  auto bn_output = bn_op_name + "_output";
+  engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output});
+  engine_->AddOpAttr(bn_op_name, "epsilon", epsilon);
 
+  auto scale_op_name = op_name + ":scale";
   auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name,
                                                  framework::LoDTensor *tensor) {
     auto *v = scope.FindVar(var_name);
@@ -69,50 +68,54 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
   get_lod_tensor(inputs["Scale"], &scale_t);
   get_lod_tensor(inputs["Variance"], &variance_t);
 
-  auto *bias = bias_t.mutable_data<float>(platform::CPUPlace());
-  auto *mean = mean_t.mutable_data<float>(platform::CPUPlace());
-  auto *scale = scale_t.mutable_data<float>(platform::CPUPlace());
-  auto *variance = variance_t.mutable_data<float>(platform::CPUPlace());
-
-  framework::LoDTensor combile_scale_t;
-  framework::LoDTensor combile_bias_t;
-  combile_scale_t.Resize(scale_t.dims());
-  combile_bias_t.Resize(bias_t.dims());
-
-  auto *combile_scale =
-      combile_scale_t.mutable_data<float>(platform::CPUPlace());
-  auto *combile_bias = combile_bias_t.mutable_data<float>(platform::CPUPlace());
-
-  size_t elem_num = combile_scale_t.memory_size() / sizeof(float);
-  for (size_t i = 0; i < elem_num; i++) {
-    combile_scale[i] = scale[i] / sqrtf(variance[i] + epsilon);
-    combile_bias[i] = bias[i] - mean[i] * combile_scale[i];
-  }
-
-  auto fill_shape = [](size_t n, std::vector<int> *shape) {
-    shape->insert(shape->begin(), 1);
-    if (shape->size() < n) {
-      shape->insert(shape->end(), n - shape->size(), 1);
+  auto fill_shape = [](size_t n, std::vector<int> shape) {
+    shape.insert(shape.begin(), 1);
+    if (shape.size() < n) {
+      shape.insert(shape.end(), n - shape.size(), 1);
     }
+    return shape;
   };
-  auto scale_shape = framework::vectorize2int(combile_scale_t.dims());
-  auto bias_shape = framework::vectorize2int(combile_bias_t.dims());
-  fill_shape(4, &scale_shape);
-  fill_shape(4, &bias_shape);
-  Shape weight1_shape(scale_shape);
-  Shape weight2_shape(bias_shape);
+  Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims())));
+  Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims())));
   auto *weight1 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(weight1_shape);
-  auto *scale_data = static_cast<float *>(weight1->h_tensor().mutable_data());
-  std::copy_n(combile_scale_t.data<float>(), combile_scale_t.numel(),
-              scale_data);
-  engine_->AddOpAttr(op_name, "weight_1", *weight1);
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape1);
+  auto *mean_data = static_cast<float *>(weight1->h_tensor().mutable_data());
+  std::copy_n(mean_t.data<float>(), mean_t.numel(), mean_data);
+  engine_->AddOpAttr(bn_op_name, "weight_1", *weight1);
 
   auto *weight2 =
-      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(weight2_shape);
-  auto *bias_data = static_cast<float *>(weight2->h_tensor().mutable_data());
-  std::copy_n(combile_bias_t.data<float>(), combile_bias_t.numel(), bias_data);
-  engine_->AddOpAttr(op_name, "weight_2", *weight2);
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape2);
+  auto *variance_data =
+      static_cast<float *>(weight2->h_tensor().mutable_data());
+  std::copy_n(variance_t.data<float>(), variance_t.numel(), variance_data);
+  engine_->AddOpAttr(bn_op_name, "weight_2", *weight2);
+
+  Shape shape3(std::vector<int>({1, 1, 1, 1}));
+  auto *weight3 =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(shape3);
+  auto *alpha_data = static_cast<float *>(weight3->h_tensor().mutable_data());
+  float weight3_data[] = {1};
+  std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data);
+  engine_->AddOpAttr(bn_op_name, "weight_3", *weight3);
+
+  Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims())));
+  auto *scale =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(scale_shape);
+  auto *scale_data = static_cast<float *>(scale->h_tensor().mutable_data());
+  std::copy_n(scale_t.data<float>(), scale_t.numel(), scale_data);
+
+  Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims())));
+  auto *bias =
+      GraphGlobalMem<NV>::Global().template new_block<AK_FLOAT>(bias_shape);
+  auto *bias_data = static_cast<float *>(bias->h_tensor().mutable_data());
+  std::copy_n(bias_t.data<float>(), bias_t.numel(), bias_data);
+
+  engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output});
+  engine_->AddOpAttr(scale_op_name, "axis", 1);
+  engine_->AddOpAttr(scale_op_name, "num_axes", 1);
+  engine_->AddOpAttr(scale_op_name, "bias_term", true);
+  engine_->AddOpAttr(scale_op_name, "weight_1", *scale);
+  engine_->AddOpAttr(scale_op_name, "weight_2", *bias);
 }
 
 }  // namespace anakin
diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
index 9504b2bf894c19..c6eebf5d0c425a 100644
--- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc
@@ -54,7 +54,6 @@ TEST(batch_norm_op, test) {
   float eps = 1e-5f;
   desc.SetAttr("epsilon", eps);
   desc.SetAttr("is_test", true);
-  // desc.SetAttr("momentum", 0.8f);
 
   validator.SetOp(*desc.Proto());
 

From 09b71f9ddd5e36a35cef92f59425429bff7c44d8 Mon Sep 17 00:00:00 2001
From: nhzlx <nhzlx.dragon@gmail.com>
Date: Thu, 7 Mar 2019 12:48:22 +0000
Subject: [PATCH 3/5] refine anakin op converter add density_prior_box op
 converter add detection output op converter

---
 .../fluid/inference/anakin/convert/concat.cc  |  4 ++--
 .../anakin/convert/density_prior_box.cc       | 12 ++++++----
 .../inference/anakin/convert/detection_out.cc | 11 +++++++--
 .../fluid/inference/anakin/convert/flatten.cc | 17 ++++----------
 .../inference/anakin/convert/op_converter.h   |  1 +
 .../fluid/inference/anakin/convert/reshape.cc |  2 +-
 .../anakin/convert/test_concat_op.cc          | 23 +++++++++++++++++++
 .../anakin/convert/test_flatten_op.cc         |  6 ++---
 .../anakin/convert/test_reshape_op.cc         | 21 +++++++++++++++++
 .../anakin/convert/test_softmax_op.cc         |  5 ++--
 .../anakin/convert/test_transpose_op.cc       | 22 ++++++++++++++++++
 .../inference/anakin/convert/transpose.cc     |  7 +++++-
 .../inference/anakin/convert/ut_helper.h      |  3 +++
 paddle/fluid/inference/anakin/op_teller.cc    |  7 +++++-
 .../ir_passes/anakin_subgraph_pass.cc         |  2 +-
 .../fluid/inference/api/analysis_predictor.cc |  6 +++++
 .../inference/api/paddle_pass_builder.cc      |  1 +
 17 files changed, 119 insertions(+), 31 deletions(-)

diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc
index 4225307137302d..e2d1111acbb606 100644
--- a/paddle/fluid/inference/anakin/convert/concat.cc
+++ b/paddle/fluid/inference/anakin/convert/concat.cc
@@ -34,8 +34,8 @@ void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
   framework::OpDesc op_desc(op, nullptr);
   int axis = boost::get<int>(op_desc.GetAttr("axis"));
   auto input_names = op_desc.Input("X");
-  PADDLE_ENFORCE(axis > 0,
-                 "The axis attr of Concat op should be large than 0 for trt");
+  // PADDLE_ENFORCE(axis > 0,
+  //               "The axis attr of Concat op should be large than 0 for trt");
 
   auto y_name = op_desc.Output("Out").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
index 9fdea12dcf8edc..565a95f17b12fc 100644
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
@@ -35,7 +35,7 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto image_name = op_desc.Input("Image").front();
   auto output_name = op_desc.Output("Boxes").front();
 
-  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front();
 
   auto fixed_sizes =
       boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
@@ -52,14 +52,18 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
   auto step_w = boost::get<float>(op_desc.GetAttr("step_w"));
   auto offset = boost::get<float>(op_desc.GetAttr("offset"));
   std::vector<std::string> order = {"MIN", "COM", "MAX"};
+  std::vector<float> temp_v = {};
 
   engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name});
+  engine_->AddOpAttr<PTuple<float>>(op_name, "min_size", temp_v);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "max_size", temp_v);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "aspect_ratio", temp_v);
   engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_sizes", fixed_sizes);
   engine_->AddOpAttr<PTuple<float>>(op_name, "fixed_ratios", fixed_ratios);
   engine_->AddOpAttr<PTuple<int>>(op_name, "density", densities);
-  engine_->AddOpAttr(op_name, "flip", false);
-  engine_->AddOpAttr(op_name, "clip", clip);
-  engine_->AddOpAttr<PTuple<float>>(op_name, "variances", variances);
+  engine_->AddOpAttr(op_name, "is_flip", false);
+  engine_->AddOpAttr(op_name, "is_clip", clip);
+  engine_->AddOpAttr<PTuple<float>>(op_name, "variance", variances);
   engine_->AddOpAttr(op_name, "img_h", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "img_w", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "step_h", step_h);
diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc
index 1dae346a02443b..67636651017cfb 100644
--- a/paddle/fluid/inference/anakin/convert/detection_out.cc
+++ b/paddle/fluid/inference/anakin/convert/detection_out.cc
@@ -39,10 +39,17 @@ void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
   auto code_type = boost::get<std::string>(op_desc.GetAttr("code_type"));
   auto background_label = boost::get<int>(op_desc.GetAttr("background_label"));
   auto score_threshold = boost::get<float>(op_desc.GetAttr("score_threshold"));
-  auto nms_top_k = boost::get<float>(op_desc.GetAttr("nms_top_k"));
+  auto nms_top_k = boost::get<int>(op_desc.GetAttr("nms_top_k"));
   auto nms_threshold = boost::get<float>(op_desc.GetAttr("nms_threshold"));
   auto nms_eta = boost::get<float>(op_desc.GetAttr("nms_eta"));
   auto keep_top_k = boost::get<int>(op_desc.GetAttr("keep_top_k"));
+  std::string anakin_code_type;
+  if (code_type == "decode_center_size") {
+    anakin_code_type = "CENTER_SIZE";
+  } else if (code_type == "encode_center_size") {
+    PADDLE_THROW(
+        "Not support encode_center_size code_type in DetectionOut of anakin");
+  }
 
   engine_->AddOp(op_name, "DetectionOutput",
                  {target_name, scores_name, prior_box_name}, {output_name});
@@ -51,7 +58,7 @@ void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
   engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
   engine_->AddOpAttr(op_name, "background_id", background_label);
   engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
-  engine_->AddOpAttr(op_name, "code_type", code_type);
+  engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
   engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
   engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
   engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc
index 8314a72129cd48..5a1b81bf5841cd 100644
--- a/paddle/fluid/inference/anakin/convert/flatten.cc
+++ b/paddle/fluid/inference/anakin/convert/flatten.cc
@@ -34,20 +34,11 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
 
   auto input = op_desc.Input("X").front();
   auto output = op_desc.Output("Out").front();
-  auto in_dims = scope.FindVar(input)->Get<framework::LoDTensor>().dims();
   int axis = boost::get<int>(op_desc.GetAttr("axis"));
+  PADDLE_ENFORCE(axis == 1,
+                 "the anakin flatten op converter now only support aixs == 1.");
 
-  int inner = 1;
-  int outer = 1;
-  for (int i = 0; i < in_dims.size(); i++) {
-    if (i < axis) {
-      outer *= in_dims[i];
-    } else {
-      inner *= in_dims[i];
-    }
-  }
-
-  std::vector<int> out_dims = {1, outer, inner, 1};
+  std::vector<int> out_dims = {0, -1, 1, 1};
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
   engine_->AddOp(op_name, "Reshape", {input}, {output});
   engine_->AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);
@@ -57,4 +48,4 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(flatten2, FlattenOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index 0d214d82eb88b0..c29be735223a4a 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -47,6 +47,7 @@ class AnakinOpConverter {
     std::string op_type = op_desc.Type();
     AnakinOpConverter *it = nullptr;
 
+    std::cout << op_type << std::endl;
     if (!it) {
       it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
     }
diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc
index eee36d2f37ea79..faea5261f3ffa9 100644
--- a/paddle/fluid/inference/anakin/convert/reshape.cc
+++ b/paddle/fluid/inference/anakin/convert/reshape.cc
@@ -49,4 +49,4 @@ void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(reshape2, ReshapeOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
index 0944e8dbdc90c3..8c6949cb2914bd 100644
--- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
@@ -44,6 +44,29 @@ TEST(concat_op, test) {
   validator.Execute(1);
 }
 
+TEST(concat_op, test2) {
+  std::unordered_set<std::string> parameters({""});
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("concat_x1", {1, 4});
+  validator.DeclInputVar("concat_x2", {3, 4});
+  validator.DeclInputVar("concat_x3", {2, 4});
+  validator.DeclOutputVar("concat_out", {6, 4});
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("concat");
+  desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
+  desc.SetOutput("Out", {"concat_out"});
+
+  int axis = 0;
+  desc.SetAttr("axis", axis);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
index 61f03819407fb1..381fab786509a2 100644
--- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
@@ -27,13 +27,13 @@ TEST(flatten_op, test) {
   std::unordered_set<std::string> parameters;
   framework::Scope scope;
   AnakinConvertValidation validator(parameters, scope);
-  validator.DeclInputVar("flatten-X", {3, 100, 100, 4});
-  validator.DeclOutputVar("flatten-Out", {1, 300, 400, 1});
+  validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
+  validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
   framework::OpDesc desc;
   desc.SetType("flatten");
   desc.SetInput("X", {"flatten-X"});
   desc.SetOutput("Out", {"flatten-Out"});
-  desc.SetAttr("axis", 2);
+  desc.SetAttr("axis", 1);
 
   LOG(INFO) << "set OP";
   validator.SetOp(*desc.Proto());
diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
index 2c26cdc525f2fe..a60544e6507e67 100644
--- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
@@ -45,6 +45,27 @@ TEST(reshape, test) {
   validator.Execute(1);
 }
 
+TEST(reshape, test2) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  AnakinConvertValidation validator(parameters, scope);
+
+  validator.DeclInputVar("reshape-X", {1, 2, 4});
+  validator.DeclOutputVar("reshape-Out", {1, 4, 2});
+
+  framework::OpDesc desc;
+  desc.SetType("reshape");
+  desc.SetInput("X", {"reshape-X"});
+  desc.SetOutput("Out", {"reshape-Out"});
+  // desc.SetAttr("shape", std::vector<int>({3, 2, 1, 3}));
+  desc.SetAttr("shape", std::vector<int>({0, -1, 2}));
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+  validator.Execute(1);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
index 40186915886bec..933e3eb2b24682 100644
--- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
@@ -27,9 +27,8 @@ TEST(softmax, test) {
   std::unordered_set<std::string> parameters;
   AnakinConvertValidation validator(parameters, scope);
 
-  std::vector<int> tensor_shape{8, 10};
-  validator.DeclInputVar("softmax-X", {1, 10, 1, 1});
-  validator.DeclOutputVar("softmax-Out", {1, 10, 1, 1});
+  validator.DeclInputVar("softmax-X", {1, 10});
+  validator.DeclOutputVar("softmax-Out", {1, 10});
 
   framework::OpDesc desc;
   desc.SetType("softmax");
diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
index 20ef00b5aa2eca..67c9c23f1fdc94 100644
--- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
@@ -43,6 +43,28 @@ TEST(transpose_op, test) {
   validator.Execute(3);
 }
 
+// test input shape's dims < 4
+TEST(transpose_op, test2) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("transpose-X", {3, 4, 5});
+  validator.DeclOutputVar("transpose-Out", {3, 5, 4});
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("transpose");
+  desc.SetInput("X", {"transpose-X"});
+  desc.SetOutput("Out", {"transpose-Out"});
+  desc.SetAttr("axis", std::vector<int>({0, 2, 1}));
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(1);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc
index 6333f5ef06a26b..74016cba7430e0 100644
--- a/paddle/fluid/inference/anakin/convert/transpose.cc
+++ b/paddle/fluid/inference/anakin/convert/transpose.cc
@@ -40,6 +40,11 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
   engine_->AddOp(op_name, "Permute", {input}, {output});
 
   auto axis = boost::get<std::vector<int>>(op_desc.GetAttr("axis"));
+  size_t axis_size = axis.size();
+  while (axis.size() < 4) {
+    axis.push_back(axis_size);
+    axis_size += 1;
+  }
   engine_->AddOpAttr<PTuple<int>>(op_name, "dims", axis);
 }
 
@@ -47,4 +52,4 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(transpose2, TransposeOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h
index 6b52b7607ac578..815a88fb4898c6 100644
--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@@ -127,6 +127,9 @@ class AnakinConvertValidation {
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_,
                                                                         input);
       auto t_shape = framework::vectorize2int(t.dims());
+      while (t_shape.size() < 4) {
+        t_shape.push_back(1);
+      }
       engine_->SetInputShape(input, t_shape);
     }
     engine_->Optimize();
diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc
index 01ceef696ddc9b..3166f68b67af22 100644
--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ b/paddle/fluid/inference/anakin/op_teller.cc
@@ -31,8 +31,13 @@ struct SimpleOpTypeSetTeller : public Teller {
     teller_set.insert("concat");
     teller_set.insert("tanh");
     teller_set.insert("conv2d");
-    teller_set.insert("detection_out");
+    teller_set.insert("batch_norm");
+    teller_set.insert("softmax");
+    teller_set.insert("flatten2");
+    teller_set.insert("reshape2");
+    teller_set.insert("transpose2");
     teller_set.insert("density_prior_box");
+    teller_set.insert("detection_out");
   }
 
   bool operator()(const std::string& op_type,
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
index f57b89baab7582..1a4c24e800bc83 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -45,7 +45,7 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
     return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
   };
 
-  SubGraphFuser fuser(graph.get(), teller, 0);
+  SubGraphFuser fuser(graph.get(), teller, 3 /* min_subgraph_size */);
   fuser();
 
   for (auto *node : graph->Nodes()) {
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index fa64786d706075..ac17c3f6beb63e 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -779,5 +779,11 @@ USE_ANAKIN_CONVERTER(sigmoid);
 USE_ANAKIN_CONVERTER(tanh);
 USE_ANAKIN_CONVERTER(pool2d);
 USE_ANAKIN_CONVERTER(elementwise_add);
+USE_ANAKIN_CONVERTER(batch_norm);
+USE_ANAKIN_CONVERTER(flatten2);
+USE_ANAKIN_CONVERTER(reshape2);
+USE_ANAKIN_CONVERTER(transpose2);
+USE_ANAKIN_CONVERTER(softmax);
+
 USE_ANAKIN_CONVERTER(detection_out);
 USE_ANAKIN_CONVERTER(density_prior_box);
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index f723f0ef0605b0..57c0b8546b4931 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -72,6 +72,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
   passes_.assign({
       "infer_clean_graph_pass",  //
       "simplify_anakin_detection_pattern_pass3", "fc_fuse_pass",
+      "conv_elementwise_add_fuse_pass", "conv_bn_fuse_pass",
       "conv_elementwise_add_fuse_pass", "anakin_subgraph_pass",
       //     "identity_scale_op_clean_pass",              //
       /*

From 5dc3f81a035b5dec01b3267974b78b148741e685 Mon Sep 17 00:00:00 2001
From: nhzlx <nhzlx.dragon@gmail.com>
Date: Thu, 7 Mar 2019 17:13:30 +0000
Subject: [PATCH 4/5] refine anakin op converter

---
 paddle/fluid/inference/anakin/convert/batch_norm.cc  | 2 +-
 paddle/fluid/inference/anakin/convert/flatten.cc     | 2 +-
 paddle/fluid/inference/anakin/convert/op_converter.h | 5 ++++-
 paddle/fluid/inference/anakin/convert/reshape.cc     | 2 +-
 paddle/fluid/inference/anakin/convert/transpose.cc   | 2 +-
 paddle/fluid/inference/api/CMakeLists.txt            | 5 +++++
 paddle/fluid/inference/api/analysis_predictor.cc     | 6 +++---
 7 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc
index bb5d406f981d40..ebe81dabcbc71f 100644
--- a/paddle/fluid/inference/anakin/convert/batch_norm.cc
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/inference/anakin/convert/batch_norm.h"
 #include <math.h>
+#include <algorithm>
 #include <map>
 #include <string>
 #include <vector>
@@ -41,7 +42,6 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,
 
   auto output = op_desc.Output("Y").front();
   auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
-  bool is_test = boost::get<bool>(op_desc.GetAttr("is_test"));
   auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));
 
   auto bn_op_name = op_name + ":bn";
diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc
index 5a1b81bf5841cd..c6c372bbef87de 100644
--- a/paddle/fluid/inference/anakin/convert/flatten.cc
+++ b/paddle/fluid/inference/anakin/convert/flatten.cc
@@ -48,4 +48,4 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(flatten2, FlattenOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index c29be735223a4a..eb75f1f62ab92f 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -47,7 +47,10 @@ class AnakinOpConverter {
     std::string op_type = op_desc.Type();
     AnakinOpConverter *it = nullptr;
 
-    std::cout << op_type << std::endl;
+    if (op_type == "reshape2") op_type = "reshape";
+    if (op_type == "transpose2") op_type = "transpose";
+    if (op_type == "flatten2") op_type = "flatten";
+
     if (!it) {
       it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
     }
diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc
index faea5261f3ffa9..eee36d2f37ea79 100644
--- a/paddle/fluid/inference/anakin/convert/reshape.cc
+++ b/paddle/fluid/inference/anakin/convert/reshape.cc
@@ -49,4 +49,4 @@ void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op,
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(reshape2, ReshapeOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc
index 74016cba7430e0..6a887401034f9d 100644
--- a/paddle/fluid/inference/anakin/convert/transpose.cc
+++ b/paddle/fluid/inference/anakin/convert/transpose.cc
@@ -52,4 +52,4 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
 }  // namespace inference
 }  // namespace paddle
 
-REGISTER_ANAKIN_OP_CONVERTER(transpose2, TransposeOpConverter);
+REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter);
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
index 38313754ea98dc..29d21dc171c6d7 100644
--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -64,3 +64,8 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
     anakin_target(inference_anakin_api)
     anakin_target(inference_anakin_api_shared)
 endif()
+if (WITH_ANAKIN_SUBGRAPH)
+   inference_analysis_test(test_anakin_model SRCS mobilenet_test.cc EXTRA_DEPS paddle_fluid) 
+   inference_analysis_test(anakin_conv_model SRCS conv_anakin_test.cc EXTRA_DEPS paddle_fluid) 
+   inference_analysis_test(life_feature_test SRCS life_feature_test.cc EXTRA_DEPS paddle_fluid) 
+endif()
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index ac17c3f6beb63e..49025e9269c30c 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -780,9 +780,9 @@ USE_ANAKIN_CONVERTER(tanh);
 USE_ANAKIN_CONVERTER(pool2d);
 USE_ANAKIN_CONVERTER(elementwise_add);
 USE_ANAKIN_CONVERTER(batch_norm);
-USE_ANAKIN_CONVERTER(flatten2);
-USE_ANAKIN_CONVERTER(reshape2);
-USE_ANAKIN_CONVERTER(transpose2);
+USE_ANAKIN_CONVERTER(flatten);
+USE_ANAKIN_CONVERTER(reshape);
+USE_ANAKIN_CONVERTER(transpose);
 USE_ANAKIN_CONVERTER(softmax);
 
 USE_ANAKIN_CONVERTER(detection_out);

From 59ac472e1dea5b622b607c2fc251e072d5d42f66 Mon Sep 17 00:00:00 2001
From: nhzlx <nhzlx.dragon@gmail.com>
Date: Fri, 8 Mar 2019 03:06:23 +0000
Subject: [PATCH 5/5] add trt update for this branch

---
 Dockerfile                                    |   5 +-
 paddle/fluid/framework/ir/fuse_pass_base.h    |   5 +
 paddle/fluid/inference/analysis/argument.h    |   6 +
 paddle/fluid/inference/analysis/helper.h      |  31 ++
 .../inference/analysis/ir_pass_manager.cc     |   3 +
 .../inference/analysis/ir_pass_manager.h      |   3 +
 .../ir_passes/tensorrt_subgraph_pass.cc       | 267 +++++++++++++-----
 .../ir_passes/tensorrt_subgraph_pass.h        |  12 +-
 .../ir_params_sync_among_devices_pass.cc      |  11 +
 .../ir_params_sync_among_devices_pass.h       |   1 +
 paddle/fluid/inference/api/analysis_config.cc |   4 +-
 .../fluid/inference/api/analysis_predictor.cc |  32 +++
 .../fluid/inference/api/analysis_predictor.h  |   9 +
 .../inference/api/details/zero_copy_tensor.cc |  60 +++-
 .../api/details/zero_copy_tensor_dummy.cc     |   2 +-
 paddle/fluid/inference/api/helper.h           |   5 +
 .../inference/api/paddle_analysis_config.h    |   4 +-
 paddle/fluid/inference/api/paddle_api.h       |  22 +-
 paddle/fluid/inference/engine.h               |   5 -
 .../inference/tensorrt/convert/conv2d_op.cc   |  21 +-
 .../tensorrt/convert/elementwise_op.cc        |   3 +-
 .../fluid/inference/tensorrt/convert/fc_op.cc |   4 +-
 .../inference/tensorrt/convert/op_converter.h |  61 ++++
 .../inference/tensorrt/convert/prelu_op.cc    |  19 +-
 .../inference/tensorrt/convert/ut_helper.h    |  85 +++---
 paddle/fluid/inference/tensorrt/engine.cc     | 143 +---------
 paddle/fluid/inference/tensorrt/engine.h      |  91 +++---
 paddle/fluid/inference/tensorrt/helper.h      |  29 ++
 .../inference/tensorrt/plugin/CMakeLists.txt  |   3 +-
 .../tensorrt/plugin/avg_pool_op_plugin.cu     |   7 +
 .../tensorrt/plugin/avg_pool_op_plugin.h      |  14 +-
 .../tensorrt/plugin/elementwise_op_plugin.cu  |  11 +-
 .../tensorrt/plugin/elementwise_op_plugin.h   |  20 +-
 .../tensorrt/plugin/prelu_op_plugin.cu        |  15 +-
 .../tensorrt/plugin/prelu_op_plugin.h         |  43 ++-
 .../tensorrt/plugin/split_op_plugin.cu        |   6 +
 .../tensorrt/plugin/split_op_plugin.h         |   9 +-
 .../inference/tensorrt/plugin/trt_plugin.h    |  10 +-
 .../tensorrt/plugin/trt_plugin_factory.cc     |  48 ++++
 .../tensorrt/plugin/trt_plugin_factory.h      |  78 +++++
 .../{serialize.h => trt_plugin_utils.h}       |   9 +-
 .../fluid/inference/tensorrt/test_engine.cc   | 134 ++++++---
 .../inference/tests/api/trt_models_tester.cc  |   3 +-
 .../operators/tensorrt/tensorrt_engine_op.cc  |   3 +
 .../operators/tensorrt/tensorrt_engine_op.h   | 179 +++++-------
 .../tensorrt/tensorrt_engine_op_test.cc       |   2 +
 paddle/fluid/pybind/inference_api.cc          |   3 +-
 47 files changed, 1005 insertions(+), 535 deletions(-)
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
 create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
 rename paddle/fluid/inference/tensorrt/plugin/{serialize.h => trt_plugin_utils.h} (96%)

diff --git a/Dockerfile b/Dockerfile
index fe0721e9b99b5e..c248ac119caa1f 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh
 #    and its size is only one-third of the official one.
 # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle.
 #    See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
-RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \
-    tar -xz -C /usr/local && \
+
+RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \
+    tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \
     cp -rf /usr/local/TensorRT/include /usr && \
     cp -rf /usr/local/TensorRT/lib /usr
 
diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h
index c53b2a6186741d..3a1022bbcbd671 100644
--- a/paddle/fluid/framework/ir/fuse_pass_base.h
+++ b/paddle/fluid/framework/ir/fuse_pass_base.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <string>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
 #include "paddle/fluid/framework/scope.h"
@@ -24,6 +25,10 @@ namespace ir {
 
 static const char kParamScopeAttr[] = "__param_scope__";
 static const char kFuseStatisAttr[] = "__fuse_statis__";
+// When we use trt or other third_party lib, the parameters are managed by
+// the lib, but not the fluid. So we need to record them to avoid duplicate
+// allocation.
+static const char kRepetitiveParamAttr[] = "__repetitive_param__";
 
 enum FuseOptions {
   DO_NOT_FUSE,  // fusing will not be done
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 2f31b182af7293..89e934ae27b931 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -23,8 +23,12 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_map>
+#include <unordered_set>
 #include <vector>
+
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
@@ -133,6 +137,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int);
   DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode,
                       AnalysisConfig::Precision);
+  DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
+                      bool);
 
   // Memory optimized related.
   DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool);
diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h
index 59107f28080dce..a48058400241b0 100644
--- a/paddle/fluid/inference/analysis/helper.h
+++ b/paddle/fluid/inference/analysis/helper.h
@@ -17,10 +17,12 @@ limitations under the License. */
 #include <sys/stat.h>
 #include <cstdio>
 #include <fstream>
+#include <memory>
 #include <set>
 #include <string>
 #include <typeindex>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
 #include "paddle/fluid/framework/framework.pb.h"
@@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir,
   return "";
 }
 
+static std::string GetTrtEngineSerializedPath(const std::string &model_root,
+                                              const std::string &engine_key) {
+  return model_root + "/trt_serialized_" + engine_key;
+}
+
+static std::string GetTrtEngineSerializedData(
+    const std::string &model_opt_cache_dir, const std::string &engine_key) {
+  std::string trt_serialized_path =
+      GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key);
+  if (FileExists(trt_serialized_path)) {
+    VLOG(3) << "Trt serialized file: " << trt_serialized_path
+            << "is found here";
+    std::ifstream infile(trt_serialized_path, std::ios::in);
+    std::stringstream buffer;
+    buffer << infile.rdbuf();
+    std::string trt_engine_serialized_data(buffer.str());
+    return trt_engine_serialized_data;
+  }
+  return "";
+}
+
+static void SaveTrtEngineSerializedDataToFile(
+    const std::string &trt_serialized_path,
+    const std::string &engine_serialized_data) {
+  std::ofstream outfile(trt_serialized_path);
+  outfile << engine_serialized_data;
+  outfile.close();
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 7eff099a8a26e0..638f3bc90875da 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -89,6 +89,9 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set(
           "model_opt_cache_dir",
           new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir)));
+      pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
+      pass->Set("use_static_engine",
+                new bool(argument->tensorrt_use_static_engine()));
     }
 
     pre_pass = pass_name;
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h
index 2a595cb36b8345..2d120679eedd39 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.h
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.h
@@ -22,7 +22,10 @@
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_set>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/pass.h"
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 69a9caec030600..d4e2da8957f205 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -14,13 +14,13 @@
 
 #include <algorithm>
 #include <set>
-#include <string>
-#include <vector>
 
 #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
 #include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h"
 #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h"
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/op_teller.h"
 #include "paddle/fluid/string/pretty_log.h"
 
@@ -33,8 +33,15 @@ using framework::ir::Node;
 std::vector<std::string> ExtractParameters(
     const std::unordered_set<Node *> &nodes);
 
-std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map);
 
+std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     std::unique_ptr<framework::ir::Graph> graph) const {
   framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get());
 
@@ -47,9 +54,16 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
                       Get<int>("min_subgraph_size") /*min subgraph size*/);
   fuser();
 
+  std::vector<std::string> graph_param_names =
+      ExtractParameters(graph->Nodes());
+  // those parameter already exist in trt, and should not have another copy in
+  // fluid.
+  std::vector<std::string> repetitive_params;
+
   for (auto *node : graph->Nodes()) {
     if (node->IsOp() && !Agent(node).subgraph()->empty()) {
-      CreateTensorRTOp(node, graph.get());
+      CreateTensorRTOp(node, graph.get(), graph_param_names,
+                       &repetitive_params);
 
       std::unordered_set<const Node *> nodes2remove(
           Agent(node).subgraph()->begin(), Agent(node).subgraph()->end());
@@ -64,12 +78,15 @@ std::unique_ptr<framework::ir::Graph> analysis::TensorRtSubgraphPass::ApplyImpl(
     }
   }
   framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove);
+  graph->Set(framework::ir::kRepetitiveParamAttr,
+             new std::vector<std::string>(repetitive_params));
 
   return graph;
 }
 
 std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
-                              const std::set<std::string> &engine_outputs) {
+                              const std::set<std::string> &engine_outputs,
+                              const std::string &predictor_id) {
   std::string engine_hash_key = "";
   for (auto name : engine_inputs) {
     engine_hash_key += name;
@@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set<std::string> &engine_inputs,
   for (auto name : engine_outputs) {
     engine_hash_key += name;
   }
+  engine_hash_key += predictor_id;
   auto engine_key = std::to_string(std::hash<std::string>()(engine_hash_key));
   return engine_key;
 }
 
-void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
-                                            Graph *graph) const {
+void TensorRtSubgraphPass::CreateTensorRTOp(
+    framework::ir::Node *node, Graph *graph,
+    const std::vector<std::string> &graph_params,
+    std::vector<std::string> *repetitive_params) const {
   auto *op_desc = node->Op();
   auto &subgraph = *Agent(node).subgraph();
   PADDLE_ENFORCE(!subgraph.empty());
@@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // is unique.
   std::set<std::string> input_names;
   std::set<std::string> input_names_with_id;
+  std::vector<std::string> params;
+
+  // The node->inputs containes input tensors and parameters.
   for (auto *x : node->inputs) {
     input_names.insert(x->Name());
     input_names_with_id.insert(x->Name() + std::to_string(x->id()));
+    if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) {
+      params.push_back(x->Name());
+    }
   }
-  op_desc->SetInput(
-      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
 
   std::set<std::string> output_names;
   std::set<std::string> output_names_with_id;
@@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
     output_names_with_id.insert(x->Name() + std::to_string(x->id()));
   }
 
-  op_desc->SetOutput(
-      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
-  op_desc->SetType("tensorrt_engine");
-
   std::unordered_map<std::string, std::string> output_name_map;
+  auto &subgraph_nodes = *Agent(node).subgraph();
 
   // The following procedure is used to rename all the intermediate
   // variables and the output variables of the subgraph.
@@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
   // input of a OP, but also the output of a Op, there will be problems.
   // So we have to rename the variable in the subgraph to make sure
   // it is either an OP's input or an OP's output.
-
-  auto &subgraph_nodes = *Agent(node).subgraph();
-  for (size_t index = 0; index < block_desc.OpSize(); ++index) {
-    framework::proto::OpDesc *op = block_desc.Op(index)->Proto();
-    auto correspond_node = subgraph_nodes[index];
-    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
-
-    std::unordered_map<std::string, size_t> var2id;
-    for (auto *in_var : correspond_node->inputs) {
-      var2id[in_var->Name()] = in_var->id();
-    }
-    // rename for the input variables of op inside subgraph
-    for (int i = 0; i < op->inputs_size(); i++) {
-      // one input
-      auto *in_var = op->mutable_inputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
-        std::string arg_value = in_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (input_names_with_id.count(arg_value_with_id)) {
-          replaced_names.push_back(arg_value);
-        } else {
-          replaced_names.push_back(arg_value_with_id);
-        }
-      }
-      in_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        in_var->add_arguments(replaced_names[k]);
-      }
-    }
-    var2id.clear();
-    for (auto out_var : correspond_node->outputs) {
-      var2id[out_var->Name()] = out_var->id();
-    }
-
-    // rename for the output variables of op inside subgraph
-    for (int i = 0; i < op->outputs_size(); i++) {
-      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
-      std::vector<std::string> replaced_names;
-      for (int k = 0; k < out_var->arguments_size(); k++) {
-        std::string arg_value = out_var->arguments(k);
-        std::string arg_value_with_id =
-            arg_value + std::to_string(var2id[arg_value]);
-        if (output_names_with_id.count(arg_value_with_id)) {
-          output_name_map[arg_value] = arg_value_with_id;
-        }
-        replaced_names.push_back(arg_value_with_id);
-      }
-      out_var->clear_arguments();
-      for (size_t k = 0; k < replaced_names.size(); k++) {
-        out_var->add_arguments(replaced_names[k]);
-      }
-    }
-  }
+  RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id,
+                      &output_names_with_id, &output_names, &output_name_map);
 
   // When tensorrt engine runs at the end of the operation,
   // output_mapping help us copy the data from the renamed ITensor
@@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
     PADDLE_ENFORCE(output_name_map.count(name) != 0);
     output_mapping.push_back(output_name_map[name]);
   }
+  PADDLE_ENFORCE(!output_mapping.empty());
 
   auto *vars = block_desc.Proto()->mutable_vars();
   for (framework::ir::Node *node : graph->Nodes()) {
@@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node,
 
   PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(),
                  "the block has no var-desc");
-  PADDLE_ENFORCE(!output_mapping.empty());
+
+  // Set attrs
+  op_desc->SetType("tensorrt_engine");
+  op_desc->SetInput(
+      "Xs", std::vector<std::string>(input_names.begin(), input_names.end()));
+
+  op_desc->SetOutput(
+      "Ys", std::vector<std::string>(output_names.begin(), output_names.end()));
+
   op_desc->SetBlockAttr("sub_block", new_block);
   SetAttr(op_desc->Proto(), "subgraph",
           block_desc.Proto()->SerializeAsString());
-  // Set attrs
   SetAttr(op_desc->Proto(), "max_batch_size", Get<int>("max_batch_size"));
   SetAttr(op_desc->Proto(), "workspace_size", Get<int>("workspace_size"));
-  SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes()));
   SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping);
+  SetAttr(op_desc->Proto(), "parameters", params);
 
   auto enable_int8 = Get<bool>("enable_int8");
-  auto engine_key =
-      GenerateEngineKey(input_names_with_id, output_names_with_id);
+  auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
+                                      std::to_string(0));
 
+  // Get "" when there is no cached calibration table data.
   std::string calibration_data = GetTrtCalibTableData(
       Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
   SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
 
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
+  SetAttr(op_desc->Proto(), "engine_serialized_data", std::string(""));
+
+  std::unique_ptr<tensorrt::TRTInt8Calibrator> calibrator;
+  if (enable_int8 && calibration_data.size() != 0) {
+    calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data));
+  }
+
+  bool use_static_engine = Get<bool>("use_static_engine");
+  // When in int8 mode and calibration_mode, the program just produce the
+  // calibration table data.
+  bool calibration_mode = (enable_int8 && calibration_data.size() == 0);
+  if (!calibration_mode && use_static_engine) {
+    std::copy(params.begin(), params.end(),
+              std::back_inserter(*repetitive_params));
+    std::string trt_engine_serialized_data = GetTrtEngineSerializedData(
+        Get<std::string>("model_opt_cache_dir"), engine_key);
+
+    if (trt_engine_serialized_data.empty()) {
+      LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
+                   "kernel etc). This process may cost a lot of time.";
+      std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
+          new tensorrt::TensorRTEngine(
+              Get<int>("max_batch_size"), Get<int>("workspace_size"),
+              enable_int8, calibrator.get(), Get<int>("gpu_device_id")));
+      auto *scope = param_scope();
+      framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
+      std::unordered_set<std::string> param_set(params.begin(), params.end());
+      inference::Singleton<inference::tensorrt::OpConverter>::Global()
+          .ConvertBlockToTRTEngine(
+              &block_desc_temp, *scope,
+              std::vector<std::string>(input_names.begin(), input_names.end()),
+              param_set, output_mapping, trt_engine.get());
+      nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
+      trt_engine_serialized_data =
+          std::string((const char *)serialized_engine_data->data(),
+                      serialized_engine_data->size());
+      SaveTrtEngineSerializedDataToFile(
+          GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
+                                     engine_key),
+          trt_engine_serialized_data);
+    } else {
+      LOG(INFO) << "Load TRT Optimized Info from "
+                << GetTrtEngineSerializedPath(
+                       Get<std::string>("model_opt_cache_dir"), engine_key);
+    }
+
+    SetAttr(op_desc->Proto(), "engine_serialized_data",
+            trt_engine_serialized_data);
+  }
 }
 
 std::vector<std::string> ExtractParameters(
@@ -253,7 +279,7 @@ std::vector<std::string> ExtractParameters(
   for (const auto &node : nodes) {
     if (!node->IsOp()) continue;
     std::string op_type = node->Op()->Type();
-    if (op_type == "feed") {
+    if (op_type == "feed" || op_type == "fetch") {
       std::vector<std::string> output_names = node->Op()->OutputArgumentNames();
       std::copy(output_names.begin(), output_names.end(),
                 std::back_inserter(feed_outputs));
@@ -272,6 +298,99 @@ std::vector<std::string> ExtractParameters(
   return parameters;
 }
 
+void RenameAndGetOutputs(
+    const std::vector<framework::ir::Node *> &subgraph_nodes,
+    framework::BlockDesc *block_desc,
+    const std::set<std::string> &input_names_with_id,
+    std::set<std::string> *output_names_with_id,
+    std::set<std::string> *output_names,
+    std::unordered_map<std::string, std::string> *output_name_map) {
+  //// In the normal case, the paddle-trt exists bug when runing the googlenet.
+  // When there are more than two convolutions of 1 * 1 with the same input, the
+  // paddle-tensorrt will do the merging optimization, which fuse those conv
+  // into one conv, and then trigger bug. So,  We should use strategy to avoid
+  // this optimization for the time being. This bug will be fixed in the future.
+  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
+      same_hierarchy_conv2d_num_map;
+
+  for (size_t index = 0; index < block_desc->OpSize(); ++index) {
+    framework::proto::OpDesc *op = block_desc->Op(index)->Proto();
+    framework::OpDesc op_desc(*op, nullptr);
+    auto correspond_node = subgraph_nodes[index];
+    PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type());
+
+    std::unordered_map<std::string, size_t> var2id;
+    std::unordered_map<std::string, framework::ir::Node *> in_vars;
+    for (auto *in_var : correspond_node->inputs) {
+      var2id[in_var->Name()] = in_var->id();
+      in_vars[in_var->Name()] = in_var;
+    }
+    // rename for the input variables of op inside subgraph
+    for (int i = 0; i < op->inputs_size(); i++) {
+      // one input
+      auto *in_var = op->mutable_inputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < in_var->arguments_size(); k++) {  // all the arguments
+        std::string arg_value = in_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (input_names_with_id.count(arg_value_with_id)) {
+          replaced_names.push_back(arg_value);
+        } else {
+          replaced_names.push_back(arg_value_with_id);
+        }
+      }
+      in_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        in_var->add_arguments(replaced_names[k]);
+      }
+    }
+    var2id.clear();
+    for (auto out_var : correspond_node->outputs) {
+      var2id[out_var->Name()] = out_var->id();
+    }
+
+    if (op_desc.Type() == "conv2d") {
+      auto input_var_name = op_desc.Input("Input").front();
+      auto filter_var_name = op_desc.Input("Filter").front();
+      auto out_var_name = op_desc.Output("Output").front();
+      auto filter_shape = in_vars[filter_var_name]->Var()->GetShape();
+      const std::vector<int> strides =
+          boost::get<std::vector<int>>(op_desc.GetAttr("strides"));
+      const std::vector<int> paddings =
+          boost::get<std::vector<int>>(op_desc.GetAttr("paddings"));
+      if (same_hierarchy_conv2d_num_map[input_var_name] > 0) {
+        (*output_names_with_id)
+            .insert(out_var_name + std::to_string(var2id[out_var_name]));
+        (*output_names).insert(out_var_name);
+      } else if (filter_shape[2] == 1 && filter_shape[3] == 1 &&
+                 strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 &&
+                 paddings[1] == 0) {
+        same_hierarchy_conv2d_num_map[input_var_name] += 1;
+      }
+    }
+
+    // rename for the output variables of op inside subgraph
+    for (int i = 0; i < op->outputs_size(); i++) {
+      framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i);
+      std::vector<std::string> replaced_names;
+      for (int k = 0; k < out_var->arguments_size(); k++) {
+        std::string arg_value = out_var->arguments(k);
+        std::string arg_value_with_id =
+            arg_value + std::to_string(var2id[arg_value]);
+        if (output_names_with_id->count(arg_value_with_id)) {
+          (*output_name_map)[arg_value] = arg_value_with_id;
+        }
+        replaced_names.push_back(arg_value_with_id);
+      }
+      out_var->clear_arguments();
+      for (size_t k = 0; k < replaced_names.size(); k++) {
+        out_var->add_arguments(replaced_names[k]);
+      }
+    }
+  }
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
index 502353b95fc15e..6689a668fc9313 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h
@@ -13,7 +13,12 @@
 // limitations under the License.
 
 #pragma once
-#include <paddle/fluid/framework/ir/fuse_pass_base.h>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/ir/pass.h"
 
 namespace paddle {
@@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase {
       std::unique_ptr<framework::ir::Graph> graph) const override;
 
  private:
-  void CreateTensorRTOp(framework::ir::Node *x,
-                        framework::ir::Graph *graph) const;
+  void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph,
+                        const std::vector<std::string> &graph_params,
+                        std::vector<std::string> *repetitive_params) const;
   void CleanIntermediateOutputs(framework::ir::Node *node);
 };
 
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 50ffb6e9410933..8360963f7366e7 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -32,6 +32,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   if (!argument->use_gpu()) return;
   return;
 
+  auto &graph = argument->main_graph();
+  std::vector<std::string> repetitive_params;
+
+  if (graph.Has(framework::ir::kRepetitiveParamAttr))
+    repetitive_params = graph.Get<std::vector<std::string>>(
+        framework::ir::kRepetitiveParamAttr);
+
   LOG(INFO) << "Sync params from CPU to GPU";
 
   PADDLE_ENFORCE(argument->gpu_device_id_valid());
@@ -44,6 +51,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // Because there exists the case that new parameter variables are not added to
   // the program in the analysis pass.
   for (auto &var_name : all_vars) {
+    if (std::count(repetitive_params.begin(), repetitive_params.end(),
+                   var_name)) {
+      continue;
+    }
     auto *var = scope->FindLocalVar(var_name);
     PADDLE_ENFORCE(var != nullptr);
     if (var->IsType<framework::LoDTensor>() ||
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
index a95f460df6f963..61990150a30db1 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/analysis/analysis_pass.h"
 #include "paddle/fluid/platform/place.h"
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 522ab495227e9b..77411112220dcb 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
   CP_MEMBER(tensorrt_max_batchsize_);
   CP_MEMBER(tensorrt_min_subgraph_size_);
   CP_MEMBER(tensorrt_precision_mode_);
+  CP_MEMBER(trt_use_static_engine_);
   // MKLDNN related.
   CP_MEMBER(use_mkldnn_);
   CP_MEMBER(mkldnn_enabled_op_types_);
@@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() {
 
 void AnalysisConfig::EnableTensorRtEngine(
     int workspace_size, int max_batch_size, int min_subgraph_size,
-    AnalysisConfig::Precision precision_mode) {
+    AnalysisConfig::Precision precision_mode, bool use_static) {
 #ifdef PADDLE_WITH_CUDA
   if (!use_gpu()) {
     LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first";
@@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine(
   tensorrt_max_batchsize_ = max_batch_size;
   tensorrt_min_subgraph_size_ = min_subgraph_size;
   tensorrt_precision_mode_ = precision_mode;
+  trt_use_static_engine_ = use_static;
 
   Update();
 #else
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 49025e9269c30c..801363a532bff4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -364,6 +364,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
     argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_);
     argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_);
     argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
+    argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
   }
 
   if (config_.use_mkldnn_) {
@@ -437,12 +438,14 @@ void AnalysisPredictor::PrepareFeedFetch() {
       }
       feeds_[idx] = op;
       feed_names_[op->Output("Out")[0]] = idx;
+      idx2feeds_[idx] = op->Output("Out")[0];
     } else if (op->Type() == "fetch") {
       int idx = boost::get<int>(op->GetAttr("col"));
       if (fetches_.size() <= static_cast<size_t>(idx)) {
         fetches_.resize(idx + 1);
       }
       fetches_[idx] = op;
+      idx2fetches_[idx] = op->Input("X")[0];
     }
   }
 }
@@ -455,6 +458,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) {
   var->GetMutable<framework::FeedFetchList>();
 }
 
+std::vector<std::string> AnalysisPredictor::GetInputNames() {
+  std::vector<std::string> input_names;
+  for (auto &item : idx2feeds_) {
+    input_names.push_back(item.second);
+  }
+  return input_names;
+}
+
+std::vector<std::string> AnalysisPredictor::GetOutputNames() {
+  std::vector<std::string> output_names;
+  for (auto &item : idx2fetches_) {
+    output_names.push_back(item.second);
+  }
+  return output_names;
+}
+
 std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
     const std::string &name) {
   PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name);
@@ -462,6 +481,13 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetInputTensor(
       new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
   res->input_or_output_ = true;
   res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
+
   return res;
 }
 
@@ -472,6 +498,12 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(
       new ZeroCopyTensor(static_cast<void *>(executor_->scope())));
   res->input_or_output_ = false;
   res->SetName(name);
+  if (platform::is_cpu_place(place_)) {
+    res->SetPlace(PaddlePlace::kCPU);
+  } else {
+    auto gpu_place = boost::get<platform::CUDAPlace>(place_);
+    res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId());
+  }
   return res;
 }
 
diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
index d5445c58e45ae6..5c0535d63e00c3 100644
--- a/paddle/fluid/inference/api/analysis_predictor.h
+++ b/paddle/fluid/inference/api/analysis_predictor.h
@@ -15,12 +15,14 @@
 #pragma once
 #include <algorithm>
 #include <map>
+#include <memory>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/inference/analysis/analyzer.h"
 #include "paddle/fluid/inference/api/api_impl.h"
 #include "paddle/fluid/inference/api/details/reset_tensor_array.h"
+#include "paddle/fluid/inference/api/helper.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/string/printf.h"
 #ifdef PADDLE_WITH_TESTING
@@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor {
            std::vector<PaddleTensor> *output_data,
            int batch_size = -1) override;
 
+  std::vector<std::string> GetInputNames();
+  std::vector<std::string> GetOutputNames();
+
   std::unique_ptr<ZeroCopyTensor> GetInputTensor(
       const std::string &name) override;
   std::unique_ptr<ZeroCopyTensor> GetOutputTensor(
@@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor {
   std::shared_ptr<framework::ProgramDesc> inference_program_;
   std::vector<framework::OpDesc *> feeds_;
   std::map<std::string, size_t> feed_names_;
+  // Sorted according to the idx.
+  std::map<size_t, std::string> idx2feeds_;
   std::vector<framework::OpDesc *> fetches_;
+  std::map<size_t, std::string> idx2fetches_;
+
   // Memory buffer for feed inputs. The temporary LoDTensor will cause serious
   // concurrency problems, wrong results and memory leak, so cache them.
   std::vector<framework::LoDTensor> feed_tensors_;
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
index f60ff40c5da3e9..cf02901d963858 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
+#include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
 
 namespace paddle {
@@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const {
   return res;
 }
 
+template <typename T>
+void ZeroCopyTensor::copy_from_cpu(const T *data) {
+  EAGER_GET_TENSOR;
+  PADDLE_ENFORCE_GE(
+      tensor->numel(), 0,
+      "You should call ZeroCopyTensor::Reshape(const std::vector<int> &shape)"
+      "function before copy data from cpu.");
+  size_t ele_size = tensor->numel() * sizeof(T);
+
+  if (place_ == PaddlePlace::kCPU) {
+    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
+    std::memcpy(static_cast<void *>(t_data), data, ele_size);
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    platform::CUDAPlace gpu_place(device_);
+    auto *t_data = tensor->mutable_data<T>(gpu_place);
+    auto *dev_ctx =
+        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+
+    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
+                 data, ele_size, dev_ctx->stream());
+#else
+    PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+  }
+}
+
+template <typename T>
+void ZeroCopyTensor::copy_to_cpu(T *data) {
+  EAGER_GET_TENSOR;
+  auto ele_num = tensor->numel();
+  auto *t_data = tensor->data<T>();
+  auto t_place = tensor->place();
+
+  if (platform::is_cpu_place(t_place)) {
+    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+  } else {
+#ifdef PADDLE_WITH_CUDA
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto gpu_place = boost::get<platform::CUDAPlace>(t_place);
+    auto *dev_ctx =
+        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
+                 t_data, ele_num * sizeof(T), dev_ctx->stream());
+#else
+    PADDLE_THROW("Not compile with CUDA, should not reach here.");
+#endif
+  }
+}
+template void ZeroCopyTensor::copy_from_cpu<float>(const float *data);
+template void ZeroCopyTensor::copy_from_cpu<int64_t>(const int64_t *data);
+template void ZeroCopyTensor::copy_to_cpu<float>(float *data);
+template void ZeroCopyTensor::copy_to_cpu<int64_t>(int64_t *data);
+
 template float *ZeroCopyTensor::data<float>(PaddlePlace *place,
                                             int *size) const;
 template int64_t *ZeroCopyTensor::data<int64_t>(PaddlePlace *place,
@@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const {
   return tensor;
 }
 
-std::vector<int64_t> ZeroCopyTensor::shape() const {
+std::vector<int> ZeroCopyTensor::shape() const {
   EAGER_GET_TENSOR;
   PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_);
-  return framework::vectorize(tensor->dims());
+  return framework::vectorize2int(tensor->dims());
 }
 
 void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
index 12071e09f8442f..cbbb3ea2d1395a 100644
--- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
+++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc
@@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place);
 
 void *ZeroCopyTensor::FindTensor() const { return nullptr; }
 
-std::vector<int64_t> ZeroCopyTensor::shape() const { return {}; }
+std::vector<int> ZeroCopyTensor::shape() const { return {}; }
 
 void ZeroCopyTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {}
 
diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h
index b92781e4f2c612..ec3bef42fd91ce 100644
--- a/paddle/fluid/inference/api/helper.h
+++ b/paddle/fluid/inference/api/helper.h
@@ -50,6 +50,11 @@ class Timer {
   }
 };
 
+static int GetUniqueId() {
+  static int id = 0;
+  return id++;
+}
+
 static void split(const std::string &str, char sep,
                   std::vector<std::string> *pieces) {
   pieces->clear();
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index c1c6227cdd8b20..9b05c335047d7f 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -135,7 +135,8 @@ struct AnalysisConfig {
    */
   void EnableTensorRtEngine(int workspace_size = 1 << 20,
                             int max_batch_size = 1, int min_subgraph_size = 3,
-                            Precision precision = Precision::kFloat32);
+                            Precision precision = Precision::kFloat32,
+                            bool use_static = true);
   /** A boolean state telling whether the TensorRT engine is used.
    */
   bool tensorrt_engine_enabled() const { return use_tensorrt_; }
@@ -233,6 +234,7 @@ struct AnalysisConfig {
   //  subgraph, 3 as default value.
   int tensorrt_min_subgraph_size_{3};
   Precision tensorrt_precision_mode_;
+  bool trt_use_static_engine_;
 
   // memory reuse related.
   bool enable_memory_optim_{false};
diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h
index c9a45b4aa3b403..f807289f6aee06 100644
--- a/paddle/fluid/inference/api/paddle_api.h
+++ b/paddle/fluid/inference/api/paddle_api.h
@@ -160,11 +160,21 @@ class ZeroCopyTensor {
   template <typename T>
   T* data(PaddlePlace* place, int* size) const;
 
-  std::vector<int64_t> shape() const;
+  template <typename T>
+  void copy_from_cpu(const T* data);
+
+  template <typename T>
+  void copy_to_cpu(T* data);
+
+  std::vector<int> shape() const;
 
   void SetLoD(const std::vector<std::vector<size_t>>& x);
   std::vector<std::vector<size_t>> lod() const;
   const std::string& name() const { return name_; }
+  void SetPlace(PaddlePlace place, int device = -1) {
+    place_ = place;
+    device_ = device;
+  }
 
  protected:
   explicit ZeroCopyTensor(void* scope) : scope_{scope} {}
@@ -179,6 +189,8 @@ class ZeroCopyTensor {
   // The corresponding tensor pointer inside Paddle workspace is cached for
   // performance.
   mutable void* tensor_{nullptr};
+  PaddlePlace place_;
+  int device_;
 };
 
 /** A simple Inference API for Paddle.
@@ -200,6 +212,14 @@ class PaddlePredictor {
                    std::vector<PaddleTensor>* output_data,
                    int batch_size = -1) = 0;
 
+  /** \brief Get input names of the model
+   */
+  virtual std::vector<std::string> GetInputNames() { return {}; }
+
+  /** \brief Get output names of the model
+   */
+  virtual std::vector<std::string> GetOutputNames() { return {}; }
+
   /** \brief Get a mutable tensor directly.
    *
    * NOTE Only works in AnalysisPredictor.
diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h
index ce2b8161715a3f..1a13ba510384c0 100644
--- a/paddle/fluid/inference/engine.h
+++ b/paddle/fluid/inference/engine.h
@@ -49,11 +49,6 @@ class EngineBase {
   // Execute the engine, that will run the inference network.
   virtual void Execute(int batch_size) = 0;
 
-  // Return the IO buffer that allocated in engine. One can read/write directly
-  // on the buffer. If the buffer's buffer is nullptr, one can also allocate
-  // memory and maintain it outside the engine.
-  virtual Buffer& buffer(const std::string& name) = 0;
-
   virtual ~EngineBase() {}
 };  // class EngineBase
 
diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
index 7900f56c9ce17f..39a99a21ea7020 100644
--- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc
@@ -18,21 +18,6 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
-bool to_skip_merging_optimize(TensorRTEngine* engine,
-                              const std::vector<int>& filters,
-                              const std::vector<int>& strides,
-                              const std::vector<int>& paddings,
-                              std::string input_name) {
-  if (engine->itensor_quote_num[input_name] > 0) {
-    return true;
-  }
-  if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 &&
-      strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0)
-    engine->itensor_quote_num[input_name] += 1;
-
-  return false;
-}
-
 template <typename RegistFunc, typename SetDilationFunc>
 void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
                    const framework::Scope& scope, bool test_mode,
@@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   weight_tensor->Resize(Y_t->dims());
   TensorCopySync((*Y_t), cpu_place, weight_tensor.get());
 
-  auto* weight_data = weight_tensor->mutable_data<float>(platform::CPUPlace());
+  auto* weight_data = weight_tensor->mutable_data<float>(cpu_place);
 
   PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL);
   const int n_output = weight_tensor->dims()[0];
@@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op,
   layer->getOutput(0)->setName(output_name.c_str());
   engine->SetITensor(output_name, layer->getOutput(0));
 
-  if (test_mode ||
-      to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings,
-                               op_desc.Input("Input").front())) {
+  if (test_mode) {
     engine->DeclareOutput(output_name);
   }
 }
diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
index 79362f96770102..0c5a1a6ef16f05 100644
--- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc
@@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter {
     if (CheckDims(dims_x, dims_y)) {
       // The two input tensor should have the same dims
       VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer";
-
       nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER(
           engine_, ElementWise, *const_cast<nvinfer1::ITensor*>(X),
           *const_cast<nvinfer1::ITensor*>(Y), op_pair->second);
@@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter {
                  "ElementWisePluginLayer";
 
       plugin::ElementWisePlugin* plugin =
-          new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis);
+          new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis);
       plugin->AddInput(X);
       plugin->AddInput(Y);
       nvinfer1::IPluginLayer* layer = engine_->AddPlugin(
diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
index eef4fab4e86f05..42dcd68e40e04e 100644
--- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc
@@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter {
            Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float));
     TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT,
                                   static_cast<void*>(weight_data),
-                                  Y_t->memory_size() / sizeof(float)};
+                                  static_cast<size_t>(Y_t->numel())};
     TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT,
                                       static_cast<void*>(tmp->data<float>()),
-                                      Y_t->memory_size() / sizeof(float));
+                                      static_cast<size_t>(Y_t->numel()));
     weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]});
     tmp_weight.dims = weight.dims;
 
diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h
index cc9a370e7db570..55515569ead6e4 100644
--- a/paddle/fluid/inference/tensorrt/convert/op_converter.h
+++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -17,9 +17,11 @@ limitations under the License. */
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
+#include <vector>
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
+#include "paddle/fluid/inference/analysis/helper.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 
@@ -27,6 +29,37 @@ namespace paddle {
 namespace inference {
 namespace tensorrt {
 
+using FluidDT = framework::proto::VarType_Type;
+using TRT_DT = nvinfer1::DataType;
+
+namespace {  // NOLINT
+
+TRT_DT FluidDataType2TRT(FluidDT type) {
+  switch (type) {
+    case FluidDT::VarType_Type_FP32:
+      return TRT_DT::kFLOAT;
+    case FluidDT::VarType_Type_INT32:
+      return TRT_DT::kINT32;
+    default:
+      return TRT_DT::kINT32;
+  }
+  PADDLE_THROW("unkown type");
+  return TRT_DT::kINT32;
+}
+
+nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t>& shape) {
+  PADDLE_ENFORCE_GT(shape.size(), 1UL,
+                    "TensorRT' tensor input requires at least 2 dimensions");
+  PADDLE_ENFORCE_LE(shape.size(), 4UL,
+                    "TensorRT' tensor input requires at most 4 dimensions");
+  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
+  if (shape.size() == 4UL)
+    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
+  return nvinfer1::DimsCHW(shape[1], 1, 1);
+}
+
+}  // namespace // NOLINT
+
 /*
  * Convert Op from Fluid to TensorRT Engine.
  */
@@ -111,6 +144,34 @@ class OpConverter {
     }
   }
 
+  // The scope  here should be inited with the parameter vars.
+  void ConvertBlockToTRTEngine(
+      framework::BlockDesc* block_desc, const framework::Scope& scope,
+      const std::vector<std::string>& inputs,
+      const std::unordered_set<std::string>& parameters,
+      const std::vector<std::string>& outputs, TensorRTEngine* engine) {
+    engine->InitNetwork();
+    for (auto& input : inputs) {
+      if (parameters.count(input)) continue;
+      auto* var = block_desc->FindVar(input);
+      PADDLE_ENFORCE(var, "no variable called %s", input);
+      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
+                        "TensorRT engine only takes LoDTensor as input");
+      auto var_shape = var->GetShape();
+
+      engine->DeclareInput(
+          input, FluidDataType2TRT(
+                     var->Proto()->type().lod_tensor().tensor().data_type()),
+          Vec2TRT_Dims(var_shape));
+    }
+    framework::proto::BlockDesc* block_proto = block_desc->Proto();
+    ConvertBlock(*block_proto, parameters, scope, engine);
+    for (auto& output : outputs) {
+      engine->DeclareOutput(output);
+    }
+    engine->FreezeNetwork();
+  }
+
   void SetEngine(TensorRTEngine* engine) { engine_ = engine; }
 
   virtual ~OpConverter() {}
diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
index dbdff85ddebc85..2ae804106e5f7b 100644
--- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc
@@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter {
     PADDLE_ENFORCE_NOT_NULL(alpha_var);
     auto* alpha_tensor = alpha_var->GetMutable<framework::LoDTensor>();
 
-    platform::CUDAPlace place;
-    std::unique_ptr<framework::LoDTensor> alpha_tensor_device(
+    platform::CPUPlace cpu_place;
+    std::unique_ptr<framework::LoDTensor> alpha_tensor_temp(
         new framework::LoDTensor());
-    alpha_tensor_device->Resize(alpha_tensor->dims());
-    TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get());
-    float* alpha_data = alpha_tensor_device->mutable_data<float>(place);
+    alpha_tensor_temp->Resize(alpha_tensor->dims());
+    TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get());
+    float* alpha_data = alpha_tensor_temp->mutable_data<float>(cpu_place);
 
-    // Transform alpha to TensorRTEngine::Weight
-    TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT,
-                                    static_cast<void*>(alpha_data),
-                                    alpha_tensor_device->numel());
-    plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode);
+    plugin::PReluPlugin* plugin =
+        new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode);
     nvinfer1::IPluginLayer* layer =
         engine_->AddPlugin(&input, input_num, plugin);
     // keep alpha tensor to avoid release it's memory
     engine_->weight_map[op_desc.Input("Alpha")[0]] =
-        std::move(alpha_tensor_device);
+        std::move(alpha_tensor_temp);
 
     std::string layer_name = "prelu (Output: ";
     auto output_name = op_desc.Output("Out")[0];
diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
index e83961f3d7bda0..2571abbf69892d 100644
--- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h
+++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h
@@ -19,7 +19,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <string>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/lod_tensor.h"
@@ -79,7 +81,8 @@ class TRTConvertValidation {
         if_add_batch_(if_add_batch),
         max_batch_size_(max_batch_size) {
     PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0);
-    engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_));
+    engine_.reset(
+        new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0));
     engine_->InitNetwork();
   }
 
@@ -114,13 +117,12 @@ class TRTConvertValidation {
   }
 
   void DeclVar(const std::string& name, const std::vector<int> dim_vec) {
-    platform::CUDAPlace place;
-    platform::CUDADeviceContext ctx(place);
+    platform::CUDADeviceContext ctx(place_);
 
     auto* x = scope_.Var(name);
     auto* x_tensor = x->GetMutable<framework::LoDTensor>();
     x_tensor->Resize(framework::make_ddim(dim_vec));
-    RandomizeTensor(x_tensor, place, ctx);
+    RandomizeTensor(x_tensor, place_, ctx);
   }
   // Declare a variable in a fluid Scope.
   void DeclVar(const std::string& name, const nvinfer1::Dims& dims,
@@ -146,19 +148,6 @@ class TRTConvertValidation {
 
     // Declare outputs.
     op_desc_.reset(new framework::OpDesc(desc, nullptr));
-
-    // Set Inputs.
-    for (const auto& input : op_desc_->InputArgumentNames()) {
-      if (parameters_.count(input)) continue;
-      auto* var = scope_.FindVar(input);
-      PADDLE_ENFORCE(var);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
-
-      engine_->SetInputFromGPU(
-          input, static_cast<void*>(tensor->data<void>()),
-          sizeof(float) *
-              analysis::AccuDims(tensor->dims(), tensor->dims().size()));
-    }
   }
 
   // We use the set 'neglected_output' here, because some Ops like batch norm,
@@ -168,43 +157,71 @@ class TRTConvertValidation {
                std::unordered_set<std::string> neglected_output = {}) {
     // Execute Fluid Op
     PADDLE_ENFORCE_LE(batch_size, max_batch_size_);
-    platform::CUDAPlace place;
-    platform::CUDADeviceContext ctx(place);
-    op_->Run(scope_, place);
-    // Execute TRT.
-    engine_->Execute(batch_size);
-    cudaStreamSynchronize(engine_->stream());
+    platform::CUDADeviceContext ctx(place_);
+    op_->Run(scope_, place_);
 
-    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
-    const size_t output_space_size = 3000;
+    std::vector<std::string> input_output_names;
+
+    // Note: we need filter the parameter
+    for (const auto& input : op_desc_->InputArgumentNames()) {
+      if (parameters_.count(input)) continue;
+      input_output_names.push_back(input);
+    }
+
+    // Collect the fluid outputs.
+    std::vector<std::vector<float>> fluid_outs;
     for (const auto& output : op_desc_->OutputArgumentNames()) {
       if (neglected_output.count(output)) continue;
+      input_output_names.push_back(output);
       std::vector<float> fluid_out;
-      std::vector<float> trt_out(output_space_size);
-      engine_->GetOutputInCPU(output, &trt_out[0], output_space_size);
-      cudaStreamSynchronize(engine_->stream());
-
       auto* var = scope_.FindVar(output);
-      auto tensor = var->GetMutable<framework::LoDTensor>();
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
       framework::TensorToVector(*tensor, ctx, &fluid_out);
+      fluid_outs.push_back(fluid_out);
+    }
+
+    // Bind input and output for TRT.
+    const int num_bindings = input_output_names.size();
+    std::vector<void*> buffers(num_bindings);
+
+    for (const std::string& name : input_output_names) {
+      auto* var = scope_.FindVar(name);
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      const int bind_index = engine_->engine()->getBindingIndex(name.c_str());
+      buffers[bind_index] =
+          static_cast<void*>(tensor->mutable_data<float>(place_));
+    }
+
+    // Execute TRT.
+    engine_->Execute(batch_size, &buffers, stream_);
 
-      size_t fluid_out_size = fluid_out.size();
+    ASSERT_FALSE(op_desc_->OutputArgumentNames().empty());
+    int index = 0;
+    for (const auto& output : op_desc_->OutputArgumentNames()) {
+      if (neglected_output.count(output)) continue;
+      std::vector<float> trt_out;
+      auto* var = scope_.FindVar(output);
+      auto* tensor = var->GetMutable<framework::LoDTensor>();
+      framework::TensorToVector(*tensor, ctx, &trt_out);
+
+      size_t fluid_out_size = fluid_outs[index].size();
       if (if_add_batch_ == true) {
         fluid_out_size =
             batch_size * (framework::product(tensor->dims()) / max_batch_size_);
       }
-      // Compare two output
-      ASSERT_FALSE(fluid_out.empty());
+
       for (size_t i = 0; i < fluid_out_size; i++) {
         // Loose the threshold for CI in different machine model.
-        EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5);
+        EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5);
       }
+      index += 1;
     }
   }
 
   framework::Scope& scope() { return scope_; }
 
  private:
+  platform::CUDAPlace place_;
   std::unique_ptr<TensorRTEngine> engine_;
   cudaStream_t stream_;
   std::unique_ptr<framework::OperatorBase> op_;
diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc
index 10f48462cfaf80..fddf5f11c285da 100644
--- a/paddle/fluid/inference/tensorrt/engine.cc
+++ b/paddle/fluid/inference/tensorrt/engine.cc
@@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) {
   PADDLE_ENFORCE(false, "not implemented");
 }
 
-void TensorRTEngine::Execute(int batch_size) {
+void TensorRTEngine::Execute(int batch_size, std::vector<void *> *buffers,
+                             cudaStream_t stream) {
   freshDeviceId();
   batch_size_ = batch_size;
-  std::vector<void *> buffers;
-  for (auto &buf : buffers_) {
-    PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated");
-    PADDLE_ENFORCE_GT(buf.max_size, 0);
-    PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-    buffers.push_back(buf.buffer);
-  }
-  infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr);
-  cudaStreamSynchronize(stream_);
+  infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr);
+  cudaStreamSynchronize(stream);
   SetRuntimeBatch(batch_size);
 }
 
-TensorRTEngine::~TensorRTEngine() {
-  cudaStreamSynchronize(stream_);
-  // clean buffer
-  for (auto &buf : buffers_) {
-    if (buf.device == DeviceType::GPU && buf.buffer != nullptr) {
-      PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer));
-      buf.buffer = nullptr;
-      buf.max_size = 0;
-    }
-  }
-}
-
 void TensorRTEngine::FreezeNetwork() {
-  VLOG(3) << "TRT to freeze network";
   freshDeviceId();
+  VLOG(3) << "TRT to freeze network";
   PADDLE_ENFORCE(infer_builder_ != nullptr,
                  "Call InitNetwork first to initialize network.");
   PADDLE_ENFORCE(infer_network_ != nullptr,
@@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() {
   PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!");
 
   infer_context_.reset(infer_engine_->createExecutionContext());
-
-  // allocate GPU buffers.
-  buffers_.resize(buffer_sizes_.size());
-  for (auto &item : buffer_sizes_) {
-    // The output buffers are not set in the network building phrase, need to
-    // infer from the TesorRT network.
-    if (item.second == 0) {
-      auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str());
-      auto dims = infer_engine_->getBindingDimensions(slot_offset);
-      item.second = kDataTypeSize[static_cast<int>(
-                        infer_engine_->getBindingDataType(slot_offset))] *
-                    analysis::AccuDims(dims.d, dims.nbDims) * max_batch_;
-      PADDLE_ENFORCE_GT(item.second, 0);
-    }
-
-    auto &buf = buffer(item.first);
-    buf.max_size = item.second * max_batch_;
-    CHECK(buf.buffer == nullptr);  // buffer should be allocated only once.
-
-    PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_));
-    buf.size = 0;
-    PADDLE_ENFORCE_LE(buf.max_size, 1 << 30);  // 10G
-    buf.device = DeviceType::GPU;
-  }
 }
 
 nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name,
@@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) {
   buffer_sizes_[name] = 0;
 }
 
-void *TensorRTEngine::GetOutputInGPU(const std::string &name) {
-  return buffer(name).buffer;
-}
-
-void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst,
-                                    size_t max_size) {
-  // determine data size
-  auto *output = TensorRTEngine::GetITensor(name);
-  nvinfer1::Dims dims = output->getDimensions();
-  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-  size_t dst_size = dim_size * runtime_batch_ *
-                    kDataTypeSize[static_cast<int>(output->getType())];
-
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_LE(dst_size, it->second);
-  PADDLE_ENFORCE_GE(max_size, dst_size);
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                    cudaMemcpyDeviceToDevice, stream_),
-                    0);
-}
-
-void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst,
-                                    size_t max_size) {
-  // determine data size
-
-  auto *output = TensorRTEngine::GetITensor(name);
-  nvinfer1::Dims dims = output->getDimensions();
-  auto dim_size = analysis::AccuDims(dims.d, dims.nbDims);
-  size_t dst_size = dim_size * runtime_batch_ *
-                    kDataTypeSize[static_cast<int>(output->getType())];
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end());
-  PADDLE_ENFORCE_GT(it->second, 0);
-  PADDLE_ENFORCE_LE(dst_size, it->second);
-  PADDLE_ENFORCE_GE(max_size, dst_size);
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before");
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size,
-                                       cudaMemcpyDeviceToHost, stream_));
-}
-
-Buffer &TensorRTEngine::buffer(const std::string &name) {
-  PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first.");
-  auto it = buffer_sizes_.find(name);
-  PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s",
-                 name);
-  auto slot_offset = infer_engine_->getBindingIndex(name.c_str());
-  return buffers_[slot_offset];
-}
-
-void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data,
-                                     size_t size) {
-  auto &buf = buffer(name);
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-  PADDLE_ENFORCE_NOT_NULL(data);
-  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-  buf.size = size;
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyHostToDevice, stream_));
-}
-
-void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data,
-                                     size_t size) {
-  auto &buf = buffer(name);
-  buf.size = size;
-  PADDLE_ENFORCE_NOT_NULL(buf.buffer);
-  PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small");
-  PADDLE_ENFORCE(buf.device == DeviceType::GPU);
-  PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size,
-                                       cudaMemcpyDeviceToDevice, stream_));
-}
-
 void TensorRTEngine::SetITensor(const std::string &name,
                                 nvinfer1::ITensor *tensor) {
   PADDLE_ENFORCE(tensor != nullptr);
@@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) {
 
 int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; }
 
-void TensorRTEngine::freshDeviceId() {
-  int count;
-  cudaGetDeviceCount(&count);
-  PADDLE_ENFORCE_LT(device_, count);
-  cudaSetDevice(device_);
-}
-
 nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
     nvinfer1::ITensor *const *inputs, int num_inputs,
     plugin::PluginTensorRT *plugin) {
@@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin(
   return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin);
 }
 
+void TensorRTEngine::freshDeviceId() {
+  int count;
+  cudaGetDeviceCount(&count);
+  PADDLE_ENFORCE_LT(device_id_, count);
+  cudaSetDevice(device_id_);
+}
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h
index cdfe09b5a7fd2d..657dfd9355f9e3 100644
--- a/paddle/fluid/inference/tensorrt/engine.h
+++ b/paddle/fluid/inference/tensorrt/engine.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/engine.h"
 #include "paddle/fluid/inference/tensorrt/helper.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h"
 #include "paddle/fluid/inference/utils/singleton.h"
 
@@ -37,7 +38,9 @@ class TRTInt8Calibrator;
  * There are two alternative ways to use it, one is  to build from a paddle
  * protobuf model, another way is to manully construct the network.
  */
-class TensorRTEngine : public EngineBase {
+class TensorRTEngine {
+  using DescType = ::paddle::framework::proto::BlockDesc;
+
  public:
   // Weight is model parameter.
   class Weight {
@@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase {
     nvinfer1::Weights w_;
   };
 
-  TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream,
-                 int device = 0, bool enable_int8 = false,
-                 TRTInt8Calibrator* calibrator = nullptr,
+  TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false,
+                 TRTInt8Calibrator* calibrator = nullptr, int device_id = 0,
                  nvinfer1::ILogger& logger = NaiveLogger::Global())
       : max_batch_(max_batch),
         max_workspace_(max_workspace),
-        stream_(stream),
-        device_(device),
         enable_int8_(enable_int8),
         calibrator_(calibrator),
+        device_id_(device_id),
         logger_(logger) {}
 
-  virtual ~TensorRTEngine();
+  ~TensorRTEngine() {}
 
   // TODO(Superjomn) implement it later when graph segmentation is supported.
-  void Build(const DescType& paddle_model) override;
+  void Build(const DescType& paddle_model);
 
-  void Execute(int batch_size) override;
+  void Execute(int batch_size, std::vector<void*>* buffers,
+               cudaStream_t stream);
 
   // Initialize the inference network, so that TensorRT layers can add to this
   // network.
   void InitNetwork() {
+    freshDeviceId();
     infer_builder_.reset(createInferBuilder(&logger_));
     infer_network_.reset(infer_builder_->createNetwork());
   }
@@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase {
   // Check if the ITensor has been declared
   bool HasDeclared(const std::string& name);
 
-  // GPU memory address for an ITensor with specific name. One can operate on
-  // these memory directly for acceleration, for example, output the converted
-  // data directly to the buffer to save data copy overhead.
-  // NOTE this should be used after calling `FreezeNetwork`.
-  Buffer& buffer(const std::string& name) override;
-
-  cudaStream_t stream() { return stream_; }
-
-  // Fill an input from CPU memory with name and size.
-  void SetInputFromCPU(const std::string& name, const void* data, size_t size);
-  // TODO(Superjomn) is this method necessary given that buffer(xxx) can be
-  // accessed directly. Fill an input from GPU memory with name and size.
-  void SetInputFromGPU(const std::string& name, const void* data, size_t size);
-  // Get an output called name, the output of tensorrt is in GPU, so this method
-  // Return the output's GPU memory address without copy.
-  void* GetOutputInGPU(const std::string& name);
-  // Copy data into dst inside the GPU device.
-  void GetOutputInGPU(const std::string& name, void* dst, size_t max_size);
-  // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU
-  // to CPU.
-  void GetOutputInCPU(const std::string& name, void* dst, size_t max_size);
-  // Fill an ITensor into map itensor_map_.
   void SetITensor(const std::string& name, nvinfer1::ITensor* tensor);
   // Get an ITensor called name.
   nvinfer1::ITensor* GetITensor(const std::string& name);
 
   nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); }
   nvinfer1::INetworkDefinition* network() { return infer_network_.get(); }
+
+  nvinfer1::IHostMemory* Serialize() {
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "You should build engine first and then serialize");
+    ihost_memory_.reset(infer_engine_->serialize());
+    return ihost_memory_.get();
+  }
+
+  void Deserialize(const std::string& engine_serialized_data) {
+    freshDeviceId();
+    infer_ptr<nvinfer1::IRuntime> runtime(createInferRuntime(&logger_));
+    infer_engine_.reset(runtime->deserializeCudaEngine(
+        engine_serialized_data.c_str(), engine_serialized_data.size(),
+        &inference::Singleton<plugin::PluginFactoryTensorRT>::Global()));
+    PADDLE_ENFORCE(infer_engine_ != nullptr,
+                   "build cuda engine failed when deserialize engine info.!");
+    infer_context_.reset(infer_engine_->createExecutionContext());
+  }
+
   void SetRuntimeBatch(size_t batch_size);
   int GetRuntimeBatch();
-  int GetDevice() { return device_; }
+  int GetDeviceId() { return device_id_; }
   nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs,
                                     int num_inputs, plugin::PluginTensorRT*);
 
@@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase {
   std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
       weight_map;
 
-  // TODO(NHZLX)
-  // In the normal case, the paddle-trt exists bug when runing the googlenet.
-  // When there are more than two convolutions of 1 * 1 with the same input, the
-  // paddle-tensorrt will do the merging optimization, which fuse those conv
-  // into one conv, and then trigger bug. So,  We should use strategy to avoid
-  // this
-  // optimization for the time being. This bug will be fixed in the future.
-  std::unordered_map<std::string /*name*/, int /*ITensor_quote_num*/>
-      itensor_quote_num;
-
  private:
+  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
+  // ensure that the thread is associated with the correct device by calling
+  // freshDeviceId().
+  void freshDeviceId();
+
   // the max batch size
   int max_batch_;
   // the runtime batch size
@@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase {
   // the max memory size the engine uses
   int max_workspace_;
 
-  cudaStream_t stream_;
-  // The specific GPU id that the TensorRTEngine bounded to.
-  int device_;
-
   bool enable_int8_;
   TRTInt8Calibrator* calibrator_;
   // batch size of the current data, will be updated each Executation.
   int batch_size_{-1};
 
+  int device_id_;
   nvinfer1::ILogger& logger_;
 
-  std::vector<Buffer> buffers_;
   // max data size for the buffers.
   std::unordered_map<std::string /*name*/, size_t /*max size*/> buffer_sizes_;
   std::unordered_map<std::string /*name*/, nvinfer1::ITensor* /*ITensor*/>
@@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase {
   infer_ptr<nvinfer1::INetworkDefinition> infer_network_;
   infer_ptr<nvinfer1::ICudaEngine> infer_engine_;
   infer_ptr<nvinfer1::IExecutionContext> infer_context_;
-  // Each ICudaEngine object is bound to a specific GPU when it is instantiated,
-  // ensure that the thread is associated with the correct device by calling
-  // freshDeviceId().
-  void freshDeviceId();
+  infer_ptr<nvinfer1::IHostMemory> ihost_memory_;
 };  // class TensorRTEngine
 
 // Add an layer__ into engine__ with args ARGS.
 // For example:
-//   TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias)
 //
 // Reference
 // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network
diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h
index fc7ca7714e9325..010942a0678fe9 100644
--- a/paddle/fluid/inference/tensorrt/helper.h
+++ b/paddle/fluid/inference/tensorrt/helper.h
@@ -17,6 +17,9 @@
 #include <NvInfer.h>
 #include <cuda.h>
 #include <glog/logging.h>
+#include <string>
+#include <utility>
+#include <vector>
 #include "paddle/fluid/platform/dynload/tensorrt.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger {
   ~NaiveLogger() override {}
 };
 
+class NaiveProfiler : public nvinfer1::IProfiler {
+ public:
+  typedef std::pair<std::string, float> Record;
+  std::vector<Record> mProfile;
+
+  virtual void reportLayerTime(const char* layerName, float ms) {
+    auto record =
+        std::find_if(mProfile.begin(), mProfile.end(),
+                     [&](const Record& r) { return r.first == layerName; });
+    if (record == mProfile.end())
+      mProfile.push_back(std::make_pair(layerName, ms));
+    else
+      record->second += ms;
+  }
+
+  void printLayerTimes() {
+    float totalTime = 0;
+    for (size_t i = 0; i < mProfile.size(); i++) {
+      printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(),
+             mProfile[i].second);
+      totalTime += mProfile[i].second;
+    }
+    printf("Time over all layers: %4.3f\n", totalTime);
+  }
+};
+
 }  // namespace tensorrt
 }  // namespace inference
 }  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
index 95443e813327c1..709aa103d1b668 100644
--- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
+++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt
@@ -1,4 +1,5 @@
 nv_library(tensorrt_plugin
-           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu
+           SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu
+           prelu_op_plugin.cu  trt_plugin_factory.cc
            avg_pool_op_plugin.cu
            DEPS enforce tensorrt_engine prelu)
diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
index 5d747af8c55d71..f27a838162c89b 100644
--- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/operators/math/pooling.h"
 
 namespace paddle {
@@ -20,6 +21,12 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer,
+                                              size_t length) {
+  return new AvgPoolPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize);
+
 nvinfer1::Dims AvgPoolPlugin::getOutputDimensions(
     int index, const nvinfer1::Dims* inputDims, int nbInputs) {
   assert(nbInputs == 1);
diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
index b5e4ece0fba446..a7c0aa5794e6bb 100644
--- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h
@@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT {
 
  protected:
   size_t getSerializationSize() override {
-    return SerializedSize(ceil_mode_) + SerializedSize(ksize_) +
-           SerializedSize(strides_) + SerializedSize(paddings_) +
-           SerializedSize(input_shape_) + getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) +
+           SerializedSize(ksize_) + SerializedSize(strides_) +
+           SerializedSize(paddings_) + SerializedSize(input_shape_) +
+           SerializedSize(output_shape_) + getBaseSerializationSize();
   }
 
   // TRT will call this func when we need to serialize the configuration of
   // tensorrt.
-  // It should not be called by users.
   void serialize(void *buffer) override {
+    SerializeValue(&buffer, getPluginType());
     serializeBase(buffer);
     SerializeValue(&buffer, ceil_mode_);
     SerializeValue(&buffer, ksize_);
     SerializeValue(&buffer, strides_);
     SerializeValue(&buffer, paddings_);
     SerializeValue(&buffer, input_shape_);
+    SerializeValue(&buffer, output_shape_);
   }
 
  public:
+  AvgPoolPlugin() {}
   AvgPoolPlugin(bool ceil_mode, std::vector<int> ksize,
                 std::vector<int> strides, std::vector<int> paddings,
                 std::vector<int> input_shape)
@@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT {
     DeserializeValue(&serialData, &serialLength, &strides_);
     DeserializeValue(&serialData, &serialLength, &paddings_);
     DeserializeValue(&serialData, &serialLength, &input_shape_);
+    DeserializeValue(&serialData, &serialLength, &output_shape_);
   }
 
   AvgPoolPlugin *clone() const override {
@@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT {
                              input_shape_);
   }
 
-  const char *getPluginType() const override { return "avg_pool"; }
+  const char *getPluginType() const override { return "avg_pool_plugin"; }
   int getNbOutputs() const override { return 1; }
   nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
                                      int nbInputDims) override;
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
index 9cd9026b732808..9aed3ddab1448f 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu
@@ -14,12 +14,19 @@ limitations under the License. */
 
 #include <glog/logging.h>
 #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer,
+                                                      size_t length) {
+  return new ElementWisePlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize);
+
 namespace details {
 
 template <typename T>
@@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs,
   const float* y = reinterpret_cast<const float*>(inputs[1]);
   float* out = reinterpret_cast<float*>(outputs[0]);
 
-  if (type_ == nvinfer1::ElementWiseOperation::kSUM) {
+  if (type_ == "add") {
     details::ElementWise(details::Add<float>(), x, y, out, batch_size,
                          prev_size_, midd_size_, post_size_, stream);
-  } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) {
+  } else if (type_ == "mul") {
     details::ElementWise(details::Mul<float>(), x, y, out, batch_size,
                          prev_size_, midd_size_, post_size_, stream);
   } else {
diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
index 9c461f7a5c44eb..3b040f14c531c5 100644
--- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <string>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
@@ -24,9 +25,8 @@ namespace plugin {
 
 class ElementWisePlugin : public PluginTensorRT {
  public:
-  ElementWisePlugin(nvinfer1::ElementWiseOperation type,
-                    nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y,
-                    int axis)
+  ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x,
+                    nvinfer1::Dims const &dims_y, int axis)
       : type_(type),
         dims_x_(dims_x),
         dims_y_(dims_y),
@@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT {
 
   ElementWisePlugin(void const *serial_data, size_t serial_length) {
     deserializeBase(serial_data, serial_length);
+    const char *elementwise_type;
+    DeserializeValue(&serial_data, &serial_length, &elementwise_type);
+    type_ = std::string(elementwise_type);
     DeserializeValue(&serial_data, &serial_length, &axis_);
     DeserializeValue(&serial_data, &serial_length, &dims_x_);
     DeserializeValue(&serial_data, &serial_length, &dims_y_);
@@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT {
     return nullptr;
   }
 
-  const char *getPluginType() const override { return "elementwise"; }
+  const char *getPluginType() const override { return "elementwise_plugin"; }
 
   nvinfer1::Dims getOutputDimensions(int index,
                                      const nvinfer1::Dims *input_dims,
@@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT {
 
  protected:
   size_t getSerializationSize() override {
-    return SerializedSize(axis_) + SerializedSize(dims_x_) +
-           SerializedSize(dims_y_) + getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
+           SerializedSize(dims_x_) + SerializedSize(dims_y_) +
+           getBaseSerializationSize();
   }
 
   void serialize(void *buffer) override {
+    SerializeValue(&buffer, getPluginType());
     serializeBase(buffer);
+    SerializeValue(&buffer, type_.c_str());
     SerializeValue(&buffer, axis_);
     SerializeValue(&buffer, dims_x_);
     SerializeValue(&buffer, dims_y_);
   }
 
-  nvinfer1::ElementWiseOperation type_;
+  std::string type_;
   nvinfer1::Dims dims_x_;
   nvinfer1::Dims dims_y_;
   int axis_;
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
index 3075e87ea6d719..b8a044fe99b918 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu
@@ -17,6 +17,7 @@
 #include <vector>
 #include "glog/logging.h"
 #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 #include "paddle/fluid/operators/math/prelu.h"
 
 namespace paddle {
@@ -24,6 +25,17 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) {
+  return new PReluPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize);
+
+int PReluPlugin::initialize() {
+  cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
+  cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
+             cudaMemcpyHostToDevice);
+}
+
 nvinfer1::Dims PReluPlugin::getOutputDimensions(int index,
                                                 const nvinfer1::Dims *inputDims,
                                                 int nbInputs) {
@@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
   // input dims is CHW.
   const auto &input_dims = this->getInputDims(0);
   const float *input = reinterpret_cast<const float *>(inputs[0]);
-  const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
+  // const float *alpha = reinterpret_cast<const float *>(alpha_.get().values);
+  const float *alpha = p_gpu_weight_;
   float *output = reinterpret_cast<float **>(outputs)[0];
 
   std::vector<int> input_shape;
diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
index 0db56a310b072e..a96649503f1c76 100644
--- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h
@@ -14,7 +14,12 @@
 
 #pragma once
 
+#include <algorithm>
 #include <string>
+#include <vector>
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
+
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
@@ -24,39 +29,51 @@ namespace tensorrt {
 namespace plugin {
 
 class PReluPlugin : public PluginTensorRT {
-  TensorRTEngine::Weight alpha_;
+  std::vector<float> weight_;
+  float *p_gpu_weight_;
   std::string mode_;
 
  protected:
   size_t getSerializationSize() override {
-    // return getBaseSerializationSize(alpha_) + SerializedSize(mode_);
-    return 0;
+    return getBaseSerializationSize() + SerializedSize(mode_.c_str()) +
+           SerializedSize(weight_) + SerializedSize(getPluginType());
   }
 
   // TRT will call this func when we need to serialize the configuration of
   // tensorrt.
   // It should not be called by users.
   void serialize(void *buffer) override {
-    // serializeBase(buffer);
-    // SerializeValue(&buffer, alpha_);
-    // SerializeValue(&buffer, mode_);
+    SerializeValue(&buffer, getPluginType());
+    serializeBase(buffer);
+    SerializeValue(&buffer, weight_);
+    SerializeValue(&buffer, mode_.c_str());
   }
 
  public:
-  PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode)
-      : alpha_(alpha), mode_(mode) {}
+  PReluPlugin(const float *weight, const int weight_num,
+              std::string const &mode)
+      : mode_(mode) {
+    weight_.resize(weight_num);
+    std::copy(weight, weight + weight_num, weight_.data());
+  }
 
   // It was used for tensorrt deserialization.
   // It should not be called by users.
   PReluPlugin(void const *serialData, size_t serialLength) {
-    // deserializeBase(serialData, serialLength);
-    // DeserializeValue(&serialData, &serialLength, &alpha_);
-    // DeserializeValue(&serialData, &serialLength, &mode_);
+    deserializeBase(serialData, serialLength);
+    DeserializeValue(&serialData, &serialLength, &weight_);
+    const char *prelu_mode;
+    DeserializeValue(&serialData, &serialLength, &prelu_mode);
+    mode_ = std::string(prelu_mode);
   }
+  ~PReluPlugin() { cudaFree(p_gpu_weight_); }
+  int initialize() override;
 
-  PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); }
+  PReluPlugin *clone() const override {
+    return new PReluPlugin(weight_.data(), weight_.size(), mode_);
+  }
 
-  const char *getPluginType() const override { return "prelu"; }
+  const char *getPluginType() const override { return "prelu_plugin"; }
   int getNbOutputs() const override { return 1; }
   nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs,
                                      int nbInputDims) override;
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
index de61ace59e299a..b5503c3b95ee24 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu
@@ -15,12 +15,18 @@
 #include <cuda_fp16.h>
 #include <algorithm>
 #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
 
 namespace paddle {
 namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) {
+  return new SplitPlugin(buffer, length);
+}
+REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize);
+
 // copied from operators::math::SplitFunctor
 template <typename T>
 __global__ void SplitKernel(const T* input_data, const int in_row,
diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
index 6f028d3d72ae3c..cbb72590567a35 100644
--- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <thrust/device_vector.h>
+#include <utility>
 #include <vector>
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
 
@@ -25,6 +26,7 @@ namespace plugin {
 
 class SplitPlugin : public PluginTensorRT {
  public:
+  SplitPlugin() {}
   SplitPlugin(int axis, std::vector<int> const &output_lengths)
       : axis_(axis), same_shape_(true), output_length_(output_lengths) {}
 
@@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT {
     return new SplitPlugin(axis_, output_length_);
   }
 
-  const char *getPluginType() const override { return "split"; }
+  const char *getPluginType() const override { return "split_plugin"; }
   int getNbOutputs() const override { return output_length_.size(); }
   nvinfer1::Dims getOutputDimensions(int index,
                                      const nvinfer1::Dims *input_dims,
@@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT {
 
  protected:
   size_t getSerializationSize() override {
-    return SerializedSize(axis_) + SerializedSize(output_length_) +
-           getBaseSerializationSize();
+    return SerializedSize(getPluginType()) + SerializedSize(axis_) +
+           SerializedSize(output_length_) + getBaseSerializationSize();
   }
 
   void serialize(void *buffer) override {
+    SerializeValue(&buffer, getPluginType());
     serializeBase(buffer);
     SerializeValue(&buffer, axis_);
     SerializeValue(&buffer, output_length_);
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
index 86084829e150f8..3b737bd726ad09 100644
--- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h
@@ -17,9 +17,10 @@
 #include <NvInfer.h>
 #include <cstring>
 #include <unordered_map>
+#include <utility>
 #include <vector>
 
-#include "paddle/fluid/inference/tensorrt/plugin/serialize.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -30,6 +31,13 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+class PluginTensorRT;
+
+typedef std::function<PluginTensorRT*(const void*, size_t)>
+    PluginDeserializeFunc;
+
+typedef std::function<PluginTensorRT*(void)> PluginConstructFunc;
+
 class PluginTensorRT : public nvinfer1::IPluginExt {
  public:
   PluginTensorRT() {}
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
new file mode 100644
index 00000000000000..3c20b6d1e72527
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc
@@ -0,0 +1,48 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name,
+                                                    const void* serial_data,
+                                                    size_t serial_length) {
+  const char* plugin_type;
+  DeserializeValue(&serial_data, &serial_length, &plugin_type);
+
+  PADDLE_ENFORCE(Has(plugin_type),
+                 "trt plugin type %s does not exists, check it.", plugin_type);
+  auto plugin = plugin_registry_[plugin_type](serial_data, serial_length);
+  owned_plugins_.emplace_back(plugin);
+
+  return plugin;
+}
+
+bool PluginFactoryTensorRT::RegisterPlugin(
+    const std::string& op_name, PluginDeserializeFunc deserialize_func) {
+  if (Has(op_name)) return false;
+  auto ret = plugin_registry_.emplace(op_name, deserialize_func);
+  return ret.second;
+}
+
+void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); }
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
new file mode 100644
index 00000000000000..139c75595f9f44
--- /dev/null
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <NvInfer.h>
+#include <cstring>
+#include <list>
+#include <memory>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h"
+#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
+#include "paddle/fluid/inference/utils/singleton.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+class PluginFactoryTensorRT : public nvinfer1::IPluginFactory,
+                              public DeleteHelper {
+ public:
+  // Deserialization method
+  PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data,
+                               size_t serial_length) override;
+
+  bool RegisterPlugin(const std::string& op_name,
+                      PluginDeserializeFunc deserialize_func);
+
+  bool Has(const std::string& op_name) {
+    return plugin_registry_.find(op_name) != plugin_registry_.end();
+  }
+
+  void DestroyPlugins();
+
+ protected:
+  std::unordered_map<std::string, PluginDeserializeFunc> plugin_registry_;
+
+  std::list<std::unique_ptr<PluginTensorRT>> owned_plugins_;
+};
+
+class TrtPluginRegistrar {
+ public:
+  TrtPluginRegistrar(const std::string& name,
+                     PluginDeserializeFunc deserialize_func) {
+    inference::Singleton<PluginFactoryTensorRT>::Global().RegisterPlugin(
+        name, deserialize_func);
+  }
+};
+
+#define REGISTER_TRT_PLUGIN(name, deserialize_func) \
+  REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func)
+
+#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func)      \
+  static paddle::inference::tensorrt::plugin::TrtPluginRegistrar   \
+      trt_plugin_registrar##ctr __attribute__((unused)) =          \
+          paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \
+              name, deserialize_func)
+
+}  // namespace plugin
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
similarity index 96%
rename from paddle/fluid/inference/tensorrt/plugin/serialize.h
rename to paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
index ce859f16fc8747..1cae4ccae4cc59 100644
--- a/paddle/fluid/inference/tensorrt/plugin/serialize.h
+++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h
@@ -13,8 +13,8 @@
 // limitations under the License.
 
 #pragma once
-
 #include <cstring>
+#include <string>
 #include <type_traits>
 #include <vector>
 #include "paddle/fluid/platform/enforce.h"
@@ -24,6 +24,13 @@ namespace inference {
 namespace tensorrt {
 namespace plugin {
 
+// Some trt base classes lack of the destructor.
+// We use a assisted class to fix this.
+struct DeleteHelper {
+ protected:
+  virtual ~DeleteHelper() {}
+};
+
 template <typename T>
 inline void SerializeValue(void** buffer, T const& value);
 
diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc
index 9eed0f6ee9ce4d..a03dd45db0f804 100644
--- a/paddle/fluid/inference/tensorrt/test_engine.cc
+++ b/paddle/fluid/inference/tensorrt/test_engine.cc
@@ -17,6 +17,8 @@ limitations under the License. */
 #include <glog/logging.h>
 #include <gtest/gtest.h>
 
+#include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/inference/tensorrt/engine.h"
 #include "paddle/fluid/platform/enforce.h"
 
@@ -27,19 +29,34 @@ namespace tensorrt {
 class TensorRTEngineTest : public ::testing::Test {
  protected:
   void SetUp() override {
-    ASSERT_EQ(0, cudaStreamCreate(&stream_));
-    engine_ = new TensorRTEngine(10, 1 << 10, stream_);
+    ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0));
+
+    engine_ = new TensorRTEngine(10, 1 << 10);
     engine_->InitNetwork();
   }
 
   void TearDown() override {
-    delete engine_;
-    cudaStreamDestroy(stream_);
+    if (engine_) {
+      delete engine_;
+      engine_ = nullptr;
+    }
+  }
+
+  void PrepareInputOutput(const std::vector<float> &input,
+                          std::vector<int> output_shape) {
+    TensorFromVector(input, *ctx_, &input_);
+    output_.Resize(framework::make_ddim(output_shape));
+  }
+
+  void GetOutput(std::vector<float> *output) {
+    TensorToVector(output_, *ctx_, output);
   }
 
  protected:
-  TensorRTEngine* engine_;
-  cudaStream_t stream_;
+  framework::Tensor input_;
+  framework::Tensor output_;
+  TensorRTEngine *engine_;
+  platform::CUDADeviceContext *ctx_;
 };
 
 TEST_F(TensorRTEngineTest, add_layer) {
@@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) {
   float raw_weight[size] = {2.};  // Weight in CPU memory.
   float raw_bias[size] = {3.};
 
+  std::vector<void *> buffers(2);  // TRT binded inputs
+
   LOG(INFO) << "create weights";
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size);
-  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::DimsCHW{1, 1, 1});
-  auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
+  auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size,
                                         weight.get(), bias.get());
   PADDLE_ENFORCE(fc_layer != nullptr);
 
@@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) {
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
   // fill in real data
-  float x_v = 1234;
-  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
-                           1 * sizeof(float));
+  std::vector<float> x_v = {1234};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {1});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
   LOG(INFO) << "to execute";
-  engine_->Execute(1);
+  engine_->Execute(1, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float y_cpu;
-  engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float));
+  GetOutput(&y_cpu);
 
   LOG(INFO) << "to checkout output";
-  ASSERT_EQ(y_cpu, x_v * 2 + 3);
+  ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3);
 }
 
 TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
@@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]]
   float raw_weight[4] = {1.0, 1.1, 3.3, 4.4};
   float raw_bias[2] = {1.3, 2.4};
+  std::vector<void *> buffers(2);  // TRT binded inputs
 
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2);
-  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::DimsCHW{1, 2, 1});
-  auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
+  auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2,
                                         weight.get(), bias.get());
   PADDLE_ENFORCE(fc_layer != nullptr);
 
@@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[2] = {1.0, 2.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
-                           2 * sizeof(float));
-  engine_->Execute(1);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 2.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {2});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(1, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float y_cpu[2] = {-1., -1.};
+  GetOutput(&y_cpu);
 
   auto dims = engine_->GetITensor("y")->getDimensions();
   ASSERT_EQ(dims.nbDims, 3);
   ASSERT_EQ(dims.d[0], 2);
   ASSERT_EQ(dims.d[1], 1);
-  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
+
   ASSERT_EQ(y_cpu[0], 4.5);
   ASSERT_EQ(y_cpu[1], 14.5);
 }
@@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   // Weight in CPU memory.
   float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
   float raw_bias[1] = {0};
+  std::vector<void *> buffers(2);  // TRT binded inputs
 
   TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9);
   TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1);
-  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::Dims3{1, 3, 3});
-  auto* conv_layer =
+  auto *conv_layer =
       TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3},
                            weight.get(), bias.get());
   PADDLE_ENFORCE(conv_layer != nullptr);
@@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
-                   1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
-                           18 * sizeof(float));
-  engine_->Execute(2);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0,
+                            1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {18});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(2, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float* y_cpu = new float[18];
-  engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float));
+  GetOutput(&y_cpu);
+
   ASSERT_EQ(y_cpu[0], 4.0);
   ASSERT_EQ(y_cpu[1], 6.0);
 }
 
 TEST_F(TensorRTEngineTest, test_pool2d) {
   // Weight in CPU memory.
-  auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
+  auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT,
                                   nvinfer1::Dims3{1, 2, 2});
 
+  std::vector<void *> buffers(2);  // TRT binded inputs
   nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE;
-  auto* pool_layer =
-      TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast<nvinfer1::ITensor*>(x),
-                           pool_t, nvinfer1::DimsHW{2, 2});
+  auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t,
+                                          nvinfer1::DimsHW{2, 2});
 
   PADDLE_ENFORCE(pool_layer != nullptr);
   pool_layer->setStride(nvinfer1::DimsHW{1, 1});
@@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) {
   engine_->FreezeNetwork();
   ASSERT_EQ(engine_->engine()->getNbBindings(), 2);
 
-  float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
-  engine_->SetInputFromCPU("x", reinterpret_cast<void*>(&x_v),
-                           8 * sizeof(float));
-  engine_->Execute(2);
+  // fill in real data
+  std::vector<float> x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0};
+  std::vector<float> y_cpu;
+  PrepareInputOutput(x_v, {2});
+
+  auto *x_v_gpu_data = input_.mutable_data<float>(ctx_->GetPlace());
+  auto *y_gpu_data = output_.mutable_data<float>(ctx_->GetPlace());
+
+  buffers[0] = reinterpret_cast<void *>(x_v_gpu_data);
+  buffers[1] = reinterpret_cast<void *>(y_gpu_data);
+
+  engine_->Execute(2, &buffers, ctx_->stream());
 
   LOG(INFO) << "to get output";
-  float* y_cpu = new float[2];
-  engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float));
+  GetOutput(&y_cpu);
 
   ASSERT_EQ(y_cpu[0], 2.0);
   ASSERT_EQ(y_cpu[1], 5.0);
diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc
index 17a433c9d98768..cb668a4174134b 100644
--- a/paddle/fluid/inference/tests/api/trt_models_tester.cc
+++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc
@@ -54,7 +54,8 @@ void SetConfig<AnalysisConfig>(AnalysisConfig* config, std::string model_dir,
   if (use_gpu) {
     config->EnableUseGpu(100, 0);
     if (use_tensorrt) {
-      config->EnableTensorRtEngine(1 << 10, batch_size);
+      config->EnableTensorRtEngine(1 << 10, batch_size, 3,
+                                   AnalysisConfig::Precision::kFloat32, false);
       config->pass_builder()->DeletePass("conv_bn_fuse_pass");
       config->pass_builder()->DeletePass("fc_fuse_pass");
       config->pass_builder()->TurnOnDebug();
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
index 031335009b692f..a8c86de9f9a1ae 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc
@@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker {
     AddOutput("Ys", "A list of outputs").AsDuplicable();
     AddAttr<std::string>("subgraph", "the subgraph.");
     AddAttr<std::string>("calibration_data", "the calibration data for int8");
+    AddAttr<std::string>(
+        "engine_serialized_data",
+        "the serialized data contains the all info of the ICUDAEngine");
     AddAttr<std::string>(
         "engine_key",
         "The engine_key here is used to distinguish different TRT Engines");
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
index 2ff35c7c6ac640..c3667331248973 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h
@@ -16,8 +16,10 @@
 
 #ifdef PADDLE_WITH_CUDA
 
+#include <memory>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <vector>
 
 #include "paddle/fluid/framework/executor.h"
@@ -31,37 +33,6 @@ namespace paddle {
 
 namespace operators {
 
-using FluidDT = framework::proto::VarType_Type;
-using TRT_DT = nvinfer1::DataType;
-
-namespace {  // NOLINT
-
-TRT_DT FluidDataType2TRT(FluidDT type) {
-  switch (type) {
-    case FluidDT::VarType_Type_FP32:
-      return TRT_DT::kFLOAT;
-    case FluidDT::VarType_Type_INT32:
-      return TRT_DT::kINT32;
-    default:
-      return TRT_DT::kINT32;
-  }
-  PADDLE_THROW("unkown type");
-  return TRT_DT::kINT32;
-}
-
-nvinfer1::Dims Vec2TRT_Dims(const std::vector<int64_t> &shape) {
-  PADDLE_ENFORCE_GT(shape.size(), 1UL,
-                    "TensorRT' tensor input requires at least 2 dimensions");
-  PADDLE_ENFORCE_LE(shape.size(), 4UL,
-                    "TensorRT' tensor input requires at most 4 dimensions");
-  PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL);
-  if (shape.size() == 4UL)
-    return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]);
-  return nvinfer1::DimsCHW(shape[1], 1, 1);
-}
-
-}  // namespace // NOLINT
-
 using inference::Singleton;
 using inference::tensorrt::TensorRTEngine;
 using inference::tensorrt::TRTInt8Calibrator;
@@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
   bool enable_int8_;
   std::string calibration_data_;
   std::string engine_key_;
+  std::string engine_serialized_data_;
   bool calibration_mode_;
 
  public:
@@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase {
     enable_int8_ = Attr<bool>("enable_int8");
     calibration_data_ = Attr<std::string>("calibration_data");
     engine_key_ = Attr<std::string>("engine_key");
+    engine_serialized_data_ = Attr<std::string>("engine_serialized_data");
 
     auto params = Attr<std::vector<std::string>>("parameters");
     for (const auto &param : params) {
@@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase {
       RunCalibration(scope, dev_place);
       return;
     }
-    RunTrt(scope, dev_place);
+    auto *trt_engine = GetEngine(scope, dev_place);
+    RunTrt(scope, dev_place, trt_engine);
   }
 
   void RunCalibration(const framework::Scope &scope,
@@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase {
     LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_
                          << " is running calibration trt int8... ";
     int runtime_batch = 1;
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    auto &dev_ctx = *pool.Get(dev_place);
-    auto stream =
-        reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
     if (!Singleton<TRTCalibratorEngineManager>::Global().Has(engine_key_)) {
       TRTCalibratorEngine *calib_res =
           Singleton<TRTCalibratorEngineManager>::Global().Create(engine_key_);
@@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase {
           calib_buffers, runtime_batch, engine_key_, dev_place));
       calib_res->thr_.reset(new std::thread([&]() {
         calib_res->engine_.reset(new TensorRTEngine(
-            max_batch_size_, workspace_size_, stream,
-            boost::get<platform::CUDAPlace>(dev_place).device, enable_int8_,
-            calib_res->calib_.get()));
+            max_batch_size_, workspace_size_, enable_int8_,
+            calib_res->calib_.get(),
+            boost::get<platform::CUDAPlace>(dev_place).device));
         VLOG(3) << "start the calib trt engine thread";
-        Prepare(scope, dev_place, calib_res->engine_.get());
+        PrepareTRTEngine(scope, calib_res->engine_.get());
       }));
     }
 
@@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase {
     RunNativeImpl(scope, dev_place);
   }
 
-  void RunTrt(const framework::Scope &scope,
-              const platform::Place &dev_place) const {
+  void RunTrt(const framework::Scope &scope, const platform::Place &dev_place,
+              TensorRTEngine *engine) const {
     int runtime_batch = 1;
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(dev_place);
     auto stream =
         reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx).stream();
-    if (trt_engine_.get() == nullptr) {
-      trt_engine_.reset(
-          new TensorRTEngine(max_batch_size_, workspace_size_, stream,
-                             boost::get<platform::CUDAPlace>(dev_place).device,
-                             enable_int8_, calibrator_.get()));
-      Prepare(scope, dev_place, trt_engine_.get());
-    }
 
-    auto *engine = trt_engine_.get();
     PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs");
 
     std::vector<std::string> output_maps =
         Attr<std::vector<std::string>>("output_name_mapping");
 
-    // Convert input tensor from fluid to engine.
+    int num_inputs = 0;
+
+    for (const auto &x : Inputs("Xs")) {
+      if (param_names_.count(x)) continue;
+      num_inputs += 1;
+    }
+    const int num_bindings = num_inputs + Outputs("Ys").size();
+    std::vector<void *> buffers(num_bindings);
+
+    // Bind input tensor to TRT.
     for (const auto &x : Inputs("Xs")) {
       if (param_names_.count(x)) continue;
       // convert input and copy to TRT engine's buffer
@@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase {
           inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
       auto t_shape = framework::vectorize(t.dims());
       runtime_batch = t_shape[0];
-      if (platform::is_cpu_place(t.place())) {
-        engine->SetInputFromCPU(x, static_cast<const void *>(t.data<void>()),
-                                t.memory_size());
-      } else {
-        engine->SetInputFromGPU(x, static_cast<const void *>(t.data<void>()),
-                                t.memory_size());
-      }
-    }
 
-    cudaStreamSynchronize(stream);
-    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
-    // Execute the engine.
-    engine->Execute(runtime_batch);
+      const int bind_index = engine->engine()->getBindingIndex(x.c_str());
+      PADDLE_ENFORCE(bind_index < num_bindings,
+                     "The bind index should be less than num_bindings");
+      buffers[bind_index] = static_cast<void *>(t.data<float>());
+    }
 
-    // Convert output tensor from engine to fluid
+    // Bind output tensor to TRT.
     int output_index = 0;
     VLOG(4) << "TensorRT Engine Op Outputs:";
     for (const auto &y : Outputs("Ys")) {
-      VLOG(4) << y;
-      // convert output and copy to fluid.
-      nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]);
-      auto dims = trt_t->getDimensions();
+      const int bind_index =
+          engine->engine()->getBindingIndex(output_maps[output_index].c_str());
+      auto dims = engine->engine()->getBindingDimensions(bind_index);
       // Use the output ITensor's dims to reshape the Fluid Tensor.
       // The ITensor doesn't contain the batch size dim.
       std::vector<int> ddim;
@@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase {
       for (int i = 0; i < dims.nbDims; i++) {
         ddim.push_back(dims.d[i]);
       }
-
       auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-
       fluid_t->Resize(framework::make_ddim(ddim));
 
-      // TODO(Superjomn) change this float to dtype size.
-      auto size =
-          inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch;
-      engine->GetOutputInGPU(
-          output_maps[output_index],
-          fluid_t->mutable_data<float>(platform::CUDAPlace(
-              boost::get<platform::CUDAPlace>(dev_place).device)),
-          size * sizeof(float));
+      PADDLE_ENFORCE(bind_index < num_bindings,
+                     "The bind index should be less than num_bindings");
+      buffers[bind_index] = static_cast<void *>(fluid_t->mutable_data<float>(
+          boost::get<platform::CUDAPlace>(dev_place)));
+
       output_index += 1;
     }
 
+    PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_);
+    // Execute the engine.
+    engine->Execute(runtime_batch, &buffers, stream);
     cudaStreamSynchronize(stream);
   }
 
-  void Prepare(const framework::Scope &scope, const platform::Place &dev_place,
-               TensorRTEngine *engine) const {
+  TensorRTEngine *GetEngine(const framework::Scope &scope,
+                            const platform::Place &dev_place) const {
+    if (!trt_engine_) {
+      trt_engine_.reset(new inference::tensorrt::TensorRTEngine(
+          max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(),
+          boost::get<platform::CUDAPlace>(dev_place).device));
+      if (!engine_serialized_data_.empty()) {
+        trt_engine_->Deserialize(engine_serialized_data_);
+      } else {
+        PrepareTRTEngine(scope, trt_engine_.get());
+      }
+    }
+    return trt_engine_.get();
+  }
+
+  void PrepareTRTEngine(const framework::Scope &scope,
+                        TensorRTEngine *engine) const {
     LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
                  "kernel etc). This process may cost a lot of time.";
-    framework::proto::BlockDesc block_desc;
-    block_desc.ParseFromString(Attr<std::string>("subgraph"));
+    framework::proto::BlockDesc block_proto;
+    block_proto.ParseFromString(Attr<std::string>("subgraph"));
+    framework::BlockDesc block_desc(nullptr, &block_proto);
 
-    std::vector<std::string> output_maps =
+    std::vector<std::string> inputs = Inputs("Xs");
+    std::vector<std::string> outputs =
         Attr<std::vector<std::string>>("output_name_mapping");
 
-    engine->InitNetwork();
-
-    framework::BlockDesc block(nullptr /*programdesc*/, &block_desc);
-    VLOG(4) << "parsed var size " << block.AllVars().size();
-    // Add inputs
-    VLOG(4) << "declare inputs";
-    for (auto &input : Inputs("Xs")) {
-      if (param_names_.count(input)) continue;
-      VLOG(4) << "declare input " << input;
-
-      auto &t =
-          inference::analysis::GetFromScope<framework::LoDTensor>(scope, input);
-      auto t_shape = framework::vectorize(t.dims());
-
-      auto *var = block.FindVar(input);
-      // TensorRT engine need to create parameters. The parameter's description
-      // should be set in
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-      PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR,
-                        "TensorRT engine only takes LoDTensor as input");
-
-      engine->DeclareInput(
-          input, FluidDataType2TRT(
-                     var->Proto()->type().lod_tensor().tensor().data_type()),
-          Vec2TRT_Dims(t_shape));
-    }
     inference::Singleton<inference::tensorrt::OpConverter>::Global()
-        .ConvertBlock(block_desc, param_names_, scope, engine);
-
-    // Add outputs
-    for (auto &output : output_maps) {
-      engine->DeclareOutput(output);
-    }
-    engine->FreezeNetwork();
+        .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_,
+                                 outputs, engine);
   }
 };
 
diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
index 5a3d9d2c1a3e81..e7ad2f4fe0c654 100644
--- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
+++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc
@@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) {
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z0"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
+  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
 
   LOG(INFO) << "create engine op";
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
@@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) {
   engine_op_desc.SetAttr("output_name_mapping",
                          std::vector<std::string>({"z3"}));
   engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString()));
+  engine_op_desc.SetAttr("engine_serialized_data", std::string(""));
 
   auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc);
 
diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc
index 7db2bb451b4991..03c1b0bd092181 100644
--- a/paddle/fluid/pybind/inference_api.cc
+++ b/paddle/fluid/pybind/inference_api.cc
@@ -221,7 +221,8 @@ void BindAnalysisConfig(py::module *m) {
       .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine,
            py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1,
            py::arg("min_subgraph_size") = 3,
-           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32)
+           py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32,
+           py::arg("use_static") = true)
       .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled)
       .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug,
            py::arg("x") = true)