From 84eb23f3cbca3e88fb1bfdc5ca6e020acfcb8298 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 7 Mar 2019 05:43:58 +0000 Subject: [PATCH 1/5] add simplify detection patten pass for anakin add density prior box op converter add detection out op converter --- paddle/fluid/framework/ir/CMakeLists.txt | 5 + .../framework/ir/graph_pattern_detector.cc | 130 ++++++++++ .../framework/ir/graph_pattern_detector.h | 18 ++ .../simplify_anakin_detection_pattern_pass.cc | 233 ++++++++++++++++++ .../simplify_anakin_detection_pattern_pass.h | 41 +++ .../inference/anakin/convert/CMakeLists.txt | 2 +- .../anakin/convert/density_prior_box.cc | 75 ++++++ .../anakin/convert/density_prior_box.h | 37 +++ .../inference/anakin/convert/detection_out.cc | 65 +++++ .../inference/anakin/convert/detection_out.h | 37 +++ paddle/fluid/inference/anakin/op_teller.cc | 6 +- .../fluid/inference/api/analysis_predictor.cc | 5 +- .../inference/api/paddle_pass_builder.cc | 3 +- 13 files changed, 652 insertions(+), 5 deletions(-) create mode 100644 paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc create mode 100644 paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h create mode 100644 paddle/fluid/inference/anakin/convert/density_prior_box.cc create mode 100644 paddle/fluid/inference/anakin/convert/density_prior_box.h create mode 100644 paddle/fluid/inference/anakin/convert/detection_out.cc create mode 100644 paddle/fluid/inference/anakin/convert/detection_out.h diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index ca6b0229e906c0..9085deebca54d8 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -66,6 +66,7 @@ pass_library(conv_elementwise_add_fuse_pass inference) pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) +pass_library(simplify_anakin_detection_pattern_pass inference) # There may be many transpose-flatten structures in a model, and the output of # these structures will be used as inputs to the concat Op. This pattern will @@ -76,6 +77,10 @@ foreach (index RANGE 3 6) file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n") endforeach() +foreach (index RANGE 3 6) + file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n") +endforeach() + if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base mkldnn) pass_library(depthwise_conv_mkldnn_pass base mkldnn) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index c0c34d186b0081..9baf777ff59aea 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1364,6 +1364,136 @@ PDNode *patterns::TransposeFlattenConcat::operator()( return concat_out; } +PDNode *patterns::AnakinDetectionPattern::operator()( + std::vector conv_in, int times) { + // The times represents the repeat times of the + // {prior_box, prior_box_loc_out, flatten, prior_box_var_out, reshape} + const int kNumFields = 7; + const int kPriorBoxLocOffset = 1; + const int kReshape1Offset = 2; + const int kReshape1OutOffset = 3; + const int kPriorBoxVarOffset = 4; + const int kReshape2Offset = 5; + const int kReshape2OutOffset = 6; + + const int kBoxCoderThirdInputOffset = times; + const int kMultiClassSecondInputNmsOffset = times + 1; + + std::vector nodes; + + for (int i = 0; i < times; i++) { + nodes.push_back( + pattern->NewNode(GetNodeName("prior_box" + std::to_string(i))) + ->assert_is_op("density_prior_box")); + nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i))) + ->assert_is_op_output("density_prior_box", "Boxes") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate()); + nodes.push_back( + pattern->NewNode(GetNodeName("reshape1" + std::to_string(i))) + ->assert_is_op("reshape2")); + + nodes.push_back( + pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i))) + ->assert_is_op_output("reshape2") + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + + nodes.push_back( + pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i))) + ->assert_is_op_output("density_prior_box", "Variances") + ->assert_is_op_input("reshape2", "X") + ->AsIntermediate()); + nodes.push_back( + pattern->NewNode(GetNodeName("reshape2" + std::to_string(i))) + ->assert_is_op("reshape2")); + + nodes.push_back( + pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i))) + ->assert_is_op_output("reshape2") + ->assert_is_op_nth_input("concat", "X", i) + ->AsIntermediate()); + } + + auto concat_op1 = pattern->NewNode(GetNodeName("concat1")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out")) + ->assert_is_op_output("concat") + ->AsIntermediate(); + + auto concat_op2 = pattern->NewNode(GetNodeName("concat2")) + ->assert_is_op("concat") + ->assert_op_has_n_inputs("concat", times); + auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out")) + ->assert_is_op_output("concat") + ->AsIntermediate(); + + auto box_coder_op = pattern->NewNode(GetNodeName("box_coder")) + ->assert_is_op("box_coder") + ->assert_op_has_n_inputs("box_coder", 3); + + auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out")) + ->assert_is_op_output("box_coder") + ->AsIntermediate(); + + auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms")) + ->assert_is_op("multiclass_nms") + ->assert_op_has_n_inputs("multiclass_nms", 2); + + auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out")) + ->assert_is_op_output("multiclass_nms") + ->AsOutput(); + + std::vector reshape1_outs; + std::vector reshape2_outs; + + for (int i = 0; i < times; i++) { + conv_in[i]->AsInput(); + // prior_box + nodes[i * kNumFields]->LinksFrom({conv_in[i]}); + // prior_box box out + nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom( + {nodes[i * kNumFields]}); + // reshape + nodes[i * kNumFields + kReshape1Offset]->LinksFrom( + {nodes[i * kNumFields + kPriorBoxLocOffset]}); + // reshape_out + nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom( + {nodes[i * kNumFields + kReshape1Offset]}); + + nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom( + {nodes[i * kNumFields]}); + // reshape + nodes[i * kNumFields + kReshape2Offset]->LinksFrom( + {nodes[i * kNumFields + kPriorBoxVarOffset]}); + // reshape_out + nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom( + {nodes[i * kNumFields + kReshape2Offset]}); + + reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]); + reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]); + } + + concat_op1->LinksFrom(reshape1_outs); + concat_op2->LinksFrom(reshape2_outs); + concat_out1->LinksFrom({concat_op1}); + concat_out2->LinksFrom({concat_op2}); + + conv_in[kBoxCoderThirdInputOffset]->AsInput(); + conv_in[kMultiClassSecondInputNmsOffset]->AsInput(); + + box_coder_op->LinksFrom( + {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]}); + box_coder_out->LinksFrom({box_coder_op}); + + multiclass_nms_op + ->LinksFrom({box_coder_out, conv_in[kMultiClassSecondInputNmsOffset]}) + .LinksTo({multiclass_nms_out}); + + return multiclass_nms_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index c8be586f546dc6..06df547d4e6ac7 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -18,8 +18,11 @@ #include #endif +#include #include #include +#include +#include #include #include #include "paddle/fluid/framework/ir/graph.h" @@ -781,6 +784,21 @@ struct TransposeFlattenConcat : public PatternBase { } }; +struct AnakinDetectionPattern : public PatternBase { + AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "anakin_detect_pattern") {} + + PDNode* operator()(std::vector conv_inputs, int times); + + std::string GetNodeName(const std::string& op_type) { + return PDNodeName(name_scope_, repr_, id_, op_type); + } + + PDNode* GetPDNode(const std::string& op_type) { + return pattern->RetrieveNode(GetNodeName(op_type)); + } +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc new file mode 100644 index 00000000000000..462f077eb7a829 --- /dev/null +++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc @@ -0,0 +1,233 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +template +std::unique_ptr SimplifyAnakinDetectionPatternPass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = + "simplify_anakin_detection_pattern_pass" + std::to_string(times); + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + std::vector input_nodes; + for (int i = 0; i < times; i++) { + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(i)) + ->assert_is_op_input("density_prior_box", "Input") + ->AsInput()); + } + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(times)) + ->assert_is_op_input("box_coder", "TargetBox") + ->AsInput()); + + input_nodes.push_back(gpd.mutable_pattern() + ->NewNode("x" + std::to_string(times + 1)) + ->assert_is_op_input("multiclass_nms", "Scores") + ->AsInput()); + + patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(input_nodes, times); + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + const int kNumFields = 7; + const int kPriorBoxLocOffset = 1; + const int kReshape1Offset = 2; + const int kReshape1OutOffset = 3; + const int kPriorBoxVarOffset = 4; + const int kReshape2Offset = 5; + const int kReshape2OutOffset = 6; + std::vector nodes; + + for (int i = 0; i < times; i++) { + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i)))); + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i)))); + + PADDLE_ENFORCE( + subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i)))); + + nodes.push_back( + subgraph.at(pattern.GetPDNode("prior_box" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("box_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape1" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape1_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("box_var_out" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape2" + std::to_string(i)))); + nodes.push_back( + subgraph.at(pattern.GetPDNode("reshape2_out" + std::to_string(i)))); + } + + Node *concat_op1 = subgraph.at(pattern.GetPDNode("concat1")); + Node *concat_out1 = subgraph.at(pattern.GetPDNode("concat1_out")); + + Node *concat_op2 = subgraph.at(pattern.GetPDNode("concat2")); + Node *concat_out2 = subgraph.at(pattern.GetPDNode("concat2_out")); + + Node *box_coder_third_input = subgraph.at(input_nodes[times]); + Node *box_coder_op = subgraph.at(pattern.GetPDNode("box_coder")); + Node *box_coder_out = subgraph.at(pattern.GetPDNode("box_coder_out")); + + Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]); + Node *multiclass_nms = subgraph.at(pattern.GetPDNode("multiclass_nms")); + Node *multiclass_nms_out = + subgraph.at(pattern.GetPDNode("multiclass_nms_out")); + + std::string code_type = + boost::get(box_coder_op->Op()->GetAttr("code_type")); + bool box_normalized = + boost::get(box_coder_op->Op()->GetAttr("box_normalized")); + // auto variance = + // boost::get>(box_coder_op->Op()->GetAttr("variance")); + int background_label = + boost::get(multiclass_nms->Op()->GetAttr("background_label")); + float score_threshold = + boost::get(multiclass_nms->Op()->GetAttr("score_threshold")); + int nms_top_k = boost::get(multiclass_nms->Op()->GetAttr("nms_top_k")); + float nms_threshold = + boost::get(multiclass_nms->Op()->GetAttr("nms_threshold")); + float nms_eta = boost::get(multiclass_nms->Op()->GetAttr("nms_eta")); + int keep_top_k = + boost::get(multiclass_nms->Op()->GetAttr("keep_top_k")); + + std::vector concat1_input_names; + for (int i = 0; i < times; i++) { + concat1_input_names.push_back( + nodes[i * kNumFields + kPriorBoxLocOffset]->Name()); + } + + int axis = boost::get(concat_op1->Op()->GetAttr("axis")); + framework::OpDesc concat1_desc; + concat1_desc.SetType("concat"); + concat1_desc.SetInput("X", concat1_input_names); + concat1_desc.SetAttr("axis", axis); + concat1_desc.SetOutput("Out", {concat_out1->Name()}); + + auto *new_add_concat_op = graph->CreateOpNode(&concat1_desc); + + for (int i = 0; i < times; i++) { + nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back( + new_add_concat_op); + new_add_concat_op->inputs.push_back( + nodes[i * kNumFields + kPriorBoxLocOffset]); + } + + framework::OpDesc new_op_desc; + new_op_desc.SetType("detection_out"); + new_op_desc.SetInput("PriorBox", {concat_out1->Name()}); + new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()}); + new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()}); + new_op_desc.SetAttr("code_type", code_type); + new_op_desc.SetAttr("box_normalized", box_normalized); + new_op_desc.SetAttr("background_label", background_label); + new_op_desc.SetAttr("score_threshold", score_threshold); + new_op_desc.SetAttr("nms_top_k", nms_top_k); + new_op_desc.SetAttr("nms_threshold", nms_threshold); + new_op_desc.SetAttr("nms_eta", nms_eta); + new_op_desc.SetAttr("keep_top_k", keep_top_k); + new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()}); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto *detection_out_op = graph->CreateOpNode(&new_op_desc); + + std::unordered_set delete_nodes; + + for (int i = 0; i < times; i++) { + nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(concat_op1); + delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]); + delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]); + delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]); + delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]); + delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]); + } + + delete_nodes.insert(concat_op1); + delete_nodes.insert(concat_op2); + delete_nodes.insert(concat_out2); + delete_nodes.insert(box_coder_op); + delete_nodes.insert(box_coder_out); + delete_nodes.insert(multiclass_nms); + + new_add_concat_op->outputs.push_back(concat_out1); + concat_out1->inputs.push_back(new_add_concat_op); + + detection_out_op->inputs.push_back(concat_out1); + detection_out_op->inputs.push_back(box_coder_third_input); + detection_out_op->inputs.push_back(multiclass_nms_second_input); + detection_out_op->outputs.push_back(multiclass_nms_out); + + concat_out1->outputs.push_back(detection_out_op); + box_coder_third_input->outputs.push_back(detection_out_op); + multiclass_nms_second_input->outputs.push_back(detection_out_op); + multiclass_nms_out->inputs.push_back(detection_out_op); + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), delete_nodes); + }; + + gpd(graph.get(), handler); + return graph; +} + +template class SimplifyAnakinDetectionPatternPass<1>; +template class SimplifyAnakinDetectionPatternPass<3>; +template class SimplifyAnakinDetectionPatternPass<4>; +template class SimplifyAnakinDetectionPatternPass<5>; +template class SimplifyAnakinDetectionPatternPass<6>; + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(simplify_anakin_detection_pattern_pass, + paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>); + +REGISTER_PASS(simplify_anakin_detection_pattern_pass3, + paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>); + +REGISTER_PASS(simplify_anakin_detection_pattern_pass4, + paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>); + +REGISTER_PASS(simplify_anakin_detection_pattern_pass5, + paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>); + +REGISTER_PASS(simplify_anakin_detection_pattern_pass6, + paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>); diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h new file mode 100644 index 00000000000000..2338e4c38b253e --- /dev/null +++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h @@ -0,0 +1,41 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +// There may be many transpose-flatten structures in a model, and the output of +// these structures will be used as inputs to the concat Op. This pattern will +// be detected by our pass. The times here represents the repeat times of this +// structure. +template +class SimplifyAnakinDetectionPatternPass : public FusePassBase { + public: + virtual ~SimplifyAnakinDetectionPatternPass() {} + + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt index dc7fb7f79c0437..4f32d22cfa7996 100644 --- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt +++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt @@ -1,5 +1,5 @@ cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc -elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc DEPS anakin_engine framework_proto scope op_registry) + elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc density_prior_box.cc detection_out.cc DEPS anakin_engine framework_proto scope op_registry) cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op) cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv SERIAL) cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter SERIAL) diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc new file mode 100644 index 00000000000000..9fdea12dcf8edc --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc @@ -0,0 +1,75 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/density_prior_box.h" +#include +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; +using anakin::PTuple; + +namespace paddle { +namespace inference { +namespace anakin { + +void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + auto input_name = op_desc.Input("Input").front(); + auto image_name = op_desc.Input("Image").front(); + auto output_name = op_desc.Output("Boxes").front(); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + auto fixed_sizes = + boost::get>(op_desc.GetAttr("fixed_sizes")); + auto fixed_ratios = + boost::get>(op_desc.GetAttr("fixed_ratios")); + auto densities = boost::get>(op_desc.GetAttr("densities")); + + // lack flip + auto clip = boost::get(op_desc.GetAttr("clip")); + auto variances = boost::get>(op_desc.GetAttr("variances")); + + // lack img_h, img_w + auto step_h = boost::get(op_desc.GetAttr("step_h")); + auto step_w = boost::get(op_desc.GetAttr("step_w")); + auto offset = boost::get(op_desc.GetAttr("offset")); + std::vector order = {"MIN", "COM", "MAX"}; + + engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name}); + engine_->AddOpAttr>(op_name, "fixed_sizes", fixed_sizes); + engine_->AddOpAttr>(op_name, "fixed_ratios", fixed_ratios); + engine_->AddOpAttr>(op_name, "density", densities); + engine_->AddOpAttr(op_name, "flip", false); + engine_->AddOpAttr(op_name, "clip", clip); + engine_->AddOpAttr>(op_name, "variances", variances); + engine_->AddOpAttr(op_name, "img_h", static_cast(0)); + engine_->AddOpAttr(op_name, "img_w", static_cast(0)); + engine_->AddOpAttr(op_name, "step_h", step_h); + engine_->AddOpAttr(op_name, "step_w", step_w); + engine_->AddOpAttr(op_name, "offset", offset); + engine_->AddOpAttr>(op_name, "order", order); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.h b/paddle/fluid/inference/anakin/convert/density_prior_box.h new file mode 100644 index 00000000000000..44265cbf2e968e --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class DensityPriorBoxOpConverter : public AnakinOpConverter { + public: + DensityPriorBoxOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~DensityPriorBoxOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc new file mode 100644 index 00000000000000..1dae346a02443b --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/detection_out.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/anakin/convert/detection_out.h" +#include +#include + +using anakin::graph::GraphGlobalMem; +using anakin::AK_FLOAT; +using anakin::saber::NV; +using anakin::saber::Shape; + +namespace paddle { +namespace inference { +namespace anakin { + +void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) { + framework::OpDesc op_desc(op, nullptr); + auto target_name = op_desc.Input("TargetBox").front(); + auto prior_box_name = op_desc.Input("PriorBox").front(); + auto scores_name = op_desc.Input("Scores").front(); + auto output_name = op_desc.Output("Out").front(); + + auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + + auto code_type = boost::get(op_desc.GetAttr("code_type")); + auto background_label = boost::get(op_desc.GetAttr("background_label")); + auto score_threshold = boost::get(op_desc.GetAttr("score_threshold")); + auto nms_top_k = boost::get(op_desc.GetAttr("nms_top_k")); + auto nms_threshold = boost::get(op_desc.GetAttr("nms_threshold")); + auto nms_eta = boost::get(op_desc.GetAttr("nms_eta")); + auto keep_top_k = boost::get(op_desc.GetAttr("keep_top_k")); + + engine_->AddOp(op_name, "DetectionOutput", + {target_name, scores_name, prior_box_name}, {output_name}); + engine_->AddOpAttr(op_name, "share_location", true); + engine_->AddOpAttr(op_name, "variance_encode_in_target", false); + engine_->AddOpAttr(op_name, "class_num", static_cast(0)); + engine_->AddOpAttr(op_name, "background_id", background_label); + engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k); + engine_->AddOpAttr(op_name, "code_type", code_type); + engine_->AddOpAttr(op_name, "conf_thresh", score_threshold); + engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k); + engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold); + engine_->AddOpAttr(op_name, "nms_eta", nms_eta); +} + +} // namespace anakin +} // namespace inference +} // namespace paddle + +REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/detection_out.h b/paddle/fluid/inference/anakin/convert/detection_out.h new file mode 100644 index 00000000000000..5bf1c3ecbc8979 --- /dev/null +++ b/paddle/fluid/inference/anakin/convert/detection_out.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/inference/anakin/convert/op_converter.h" + +namespace paddle { +namespace inference { +namespace anakin { + +class DetectionOutOpConverter : public AnakinOpConverter { + public: + DetectionOutOpConverter() = default; + + virtual void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override; + virtual ~DetectionOutOpConverter() {} +}; + +} // namespace anakin +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc index 015bcc066e1621..01ceef696ddc9b 100644 --- a/paddle/fluid/inference/anakin/op_teller.cc +++ b/paddle/fluid/inference/anakin/op_teller.cc @@ -21,7 +21,7 @@ namespace anakin { // Just tell by the op_types. struct SimpleOpTypeSetTeller : public Teller { SimpleOpTypeSetTeller() { - // teller_set.insert("mul"); + teller_set.insert("mul"); teller_set.insert("fc"); teller_set.insert("conv2d_fusion"); teller_set.insert("split"); @@ -30,7 +30,9 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("elementwise_add"); teller_set.insert("concat"); teller_set.insert("tanh"); - // teller_set.insert("conv2d"); + teller_set.insert("conv2d"); + teller_set.insert("detection_out"); + teller_set.insert("density_prior_box"); } bool operator()(const std::string& op_type, diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 87964d6b52728b..fa64786d706075 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -768,13 +768,16 @@ USE_TRT_CONVERTER(conv2d_transpose); USE_TRT_CONVERTER(leaky_relu); #endif +USE_ANAKIN_CONVERTER(mul); USE_ANAKIN_CONVERTER(fc); USE_ANAKIN_CONVERTER(conv2d); +USE_ANAKIN_CONVERTER(conv2d_fusion); USE_ANAKIN_CONVERTER(concat); USE_ANAKIN_CONVERTER(split); USE_ANAKIN_CONVERTER(relu); USE_ANAKIN_CONVERTER(sigmoid); USE_ANAKIN_CONVERTER(tanh); USE_ANAKIN_CONVERTER(pool2d); -USE_ANAKIN_CONVERTER(conv2d_fusion); USE_ANAKIN_CONVERTER(elementwise_add); +USE_ANAKIN_CONVERTER(detection_out); +USE_ANAKIN_CONVERTER(density_prior_box); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index b9615392c5b819..f723f0ef0605b0 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -71,7 +71,8 @@ void GpuPassStrategy::EnableMKLDNN() { GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ "infer_clean_graph_pass", // - "fc_fuse_pass", "conv_elementwise_add_fuse_pass", "anakin_subgraph_pass", + "simplify_anakin_detection_pattern_pass3", "fc_fuse_pass", + "conv_elementwise_add_fuse_pass", "anakin_subgraph_pass", // "identity_scale_op_clean_pass", // /* "conv_affine_channel_fuse_pass", // From f8dcbd568e473646cc98a2f9aa8571f69017141d Mon Sep 17 00:00:00 2001 From: flame Date: Thu, 7 Mar 2019 20:39:06 +0800 Subject: [PATCH 2/5] batch norm (#16110) * use anakin batch norm and scale implement fluid batch norm --- .../inference/anakin/convert/batch_norm.cc | 97 ++++++++++--------- .../anakin/convert/test_batch_norm_op.cc | 1 - 2 files changed, 50 insertions(+), 48 deletions(-) diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc index 5ebcefb8a2dc5b..bb5d406f981d40 100644 --- a/paddle/fluid/inference/anakin/convert/batch_norm.cc +++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc @@ -41,16 +41,15 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op, auto output = op_desc.Output("Y").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front(); - engine_->AddOp(op_name, "Scale", {inputs["X"]}, {output}); - engine_->AddOpAttr(op_name, "bias_term", true); - engine_->AddOpAttr(op_name, "axis", 1); - engine_->AddOpAttr(op_name, "num_axes", 1); - bool is_test = boost::get(op_desc.GetAttr("is_test")); - PADDLE_ENFORCE(is_test); - float epsilon = boost::get(op_desc.GetAttr("epsilon")); - engine_->AddOpAttr(op_name, "epsilon", epsilon); + auto epsilon = boost::get(op_desc.GetAttr("epsilon")); + + auto bn_op_name = op_name + ":bn"; + auto bn_output = bn_op_name + "_output"; + engine_->AddOp(bn_op_name, "BatchNorm", {inputs["X"]}, {bn_output}); + engine_->AddOpAttr(bn_op_name, "epsilon", epsilon); + auto scale_op_name = op_name + ":scale"; auto get_lod_tensor = [this, &scope, &op_name](const std::string &var_name, framework::LoDTensor *tensor) { auto *v = scope.FindVar(var_name); @@ -69,50 +68,54 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op, get_lod_tensor(inputs["Scale"], &scale_t); get_lod_tensor(inputs["Variance"], &variance_t); - auto *bias = bias_t.mutable_data(platform::CPUPlace()); - auto *mean = mean_t.mutable_data(platform::CPUPlace()); - auto *scale = scale_t.mutable_data(platform::CPUPlace()); - auto *variance = variance_t.mutable_data(platform::CPUPlace()); - - framework::LoDTensor combile_scale_t; - framework::LoDTensor combile_bias_t; - combile_scale_t.Resize(scale_t.dims()); - combile_bias_t.Resize(bias_t.dims()); - - auto *combile_scale = - combile_scale_t.mutable_data(platform::CPUPlace()); - auto *combile_bias = combile_bias_t.mutable_data(platform::CPUPlace()); - - size_t elem_num = combile_scale_t.memory_size() / sizeof(float); - for (size_t i = 0; i < elem_num; i++) { - combile_scale[i] = scale[i] / sqrtf(variance[i] + epsilon); - combile_bias[i] = bias[i] - mean[i] * combile_scale[i]; - } - - auto fill_shape = [](size_t n, std::vector *shape) { - shape->insert(shape->begin(), 1); - if (shape->size() < n) { - shape->insert(shape->end(), n - shape->size(), 1); + auto fill_shape = [](size_t n, std::vector shape) { + shape.insert(shape.begin(), 1); + if (shape.size() < n) { + shape.insert(shape.end(), n - shape.size(), 1); } + return shape; }; - auto scale_shape = framework::vectorize2int(combile_scale_t.dims()); - auto bias_shape = framework::vectorize2int(combile_bias_t.dims()); - fill_shape(4, &scale_shape); - fill_shape(4, &bias_shape); - Shape weight1_shape(scale_shape); - Shape weight2_shape(bias_shape); + Shape shape1(fill_shape(4, framework::vectorize2int(mean_t.dims()))); + Shape shape2(fill_shape(4, framework::vectorize2int(variance_t.dims()))); auto *weight1 = - GraphGlobalMem::Global().template new_block(weight1_shape); - auto *scale_data = static_cast(weight1->h_tensor().mutable_data()); - std::copy_n(combile_scale_t.data(), combile_scale_t.numel(), - scale_data); - engine_->AddOpAttr(op_name, "weight_1", *weight1); + GraphGlobalMem::Global().template new_block(shape1); + auto *mean_data = static_cast(weight1->h_tensor().mutable_data()); + std::copy_n(mean_t.data(), mean_t.numel(), mean_data); + engine_->AddOpAttr(bn_op_name, "weight_1", *weight1); auto *weight2 = - GraphGlobalMem::Global().template new_block(weight2_shape); - auto *bias_data = static_cast(weight2->h_tensor().mutable_data()); - std::copy_n(combile_bias_t.data(), combile_bias_t.numel(), bias_data); - engine_->AddOpAttr(op_name, "weight_2", *weight2); + GraphGlobalMem::Global().template new_block(shape2); + auto *variance_data = + static_cast(weight2->h_tensor().mutable_data()); + std::copy_n(variance_t.data(), variance_t.numel(), variance_data); + engine_->AddOpAttr(bn_op_name, "weight_2", *weight2); + + Shape shape3(std::vector({1, 1, 1, 1})); + auto *weight3 = + GraphGlobalMem::Global().template new_block(shape3); + auto *alpha_data = static_cast(weight3->h_tensor().mutable_data()); + float weight3_data[] = {1}; + std::copy(std::begin(weight3_data), std::end(weight3_data), alpha_data); + engine_->AddOpAttr(bn_op_name, "weight_3", *weight3); + + Shape scale_shape(fill_shape(4, framework::vectorize2int(scale_t.dims()))); + auto *scale = + GraphGlobalMem::Global().template new_block(scale_shape); + auto *scale_data = static_cast(scale->h_tensor().mutable_data()); + std::copy_n(scale_t.data(), scale_t.numel(), scale_data); + + Shape bias_shape(fill_shape(4, framework::vectorize2int(bias_t.dims()))); + auto *bias = + GraphGlobalMem::Global().template new_block(bias_shape); + auto *bias_data = static_cast(bias->h_tensor().mutable_data()); + std::copy_n(bias_t.data(), bias_t.numel(), bias_data); + + engine_->AddOp(scale_op_name, "Scale", {bn_output}, {output}); + engine_->AddOpAttr(scale_op_name, "axis", 1); + engine_->AddOpAttr(scale_op_name, "num_axes", 1); + engine_->AddOpAttr(scale_op_name, "bias_term", true); + engine_->AddOpAttr(scale_op_name, "weight_1", *scale); + engine_->AddOpAttr(scale_op_name, "weight_2", *bias); } } // namespace anakin diff --git a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc index 9504b2bf894c19..c6eebf5d0c425a 100644 --- a/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_batch_norm_op.cc @@ -54,7 +54,6 @@ TEST(batch_norm_op, test) { float eps = 1e-5f; desc.SetAttr("epsilon", eps); desc.SetAttr("is_test", true); - // desc.SetAttr("momentum", 0.8f); validator.SetOp(*desc.Proto()); From 09b71f9ddd5e36a35cef92f59425429bff7c44d8 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 7 Mar 2019 12:48:22 +0000 Subject: [PATCH 3/5] refine anakin op converter add density_prior_box op converter add detection output op converter --- .../fluid/inference/anakin/convert/concat.cc | 4 ++-- .../anakin/convert/density_prior_box.cc | 12 ++++++---- .../inference/anakin/convert/detection_out.cc | 11 +++++++-- .../fluid/inference/anakin/convert/flatten.cc | 17 ++++---------- .../inference/anakin/convert/op_converter.h | 1 + .../fluid/inference/anakin/convert/reshape.cc | 2 +- .../anakin/convert/test_concat_op.cc | 23 +++++++++++++++++++ .../anakin/convert/test_flatten_op.cc | 6 ++--- .../anakin/convert/test_reshape_op.cc | 21 +++++++++++++++++ .../anakin/convert/test_softmax_op.cc | 5 ++-- .../anakin/convert/test_transpose_op.cc | 22 ++++++++++++++++++ .../inference/anakin/convert/transpose.cc | 7 +++++- .../inference/anakin/convert/ut_helper.h | 3 +++ paddle/fluid/inference/anakin/op_teller.cc | 7 +++++- .../ir_passes/anakin_subgraph_pass.cc | 2 +- .../fluid/inference/api/analysis_predictor.cc | 6 +++++ .../inference/api/paddle_pass_builder.cc | 1 + 17 files changed, 119 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/inference/anakin/convert/concat.cc b/paddle/fluid/inference/anakin/convert/concat.cc index 4225307137302d..e2d1111acbb606 100644 --- a/paddle/fluid/inference/anakin/convert/concat.cc +++ b/paddle/fluid/inference/anakin/convert/concat.cc @@ -34,8 +34,8 @@ void ConcatOpConverter::operator()(const framework::proto::OpDesc &op, framework::OpDesc op_desc(op, nullptr); int axis = boost::get(op_desc.GetAttr("axis")); auto input_names = op_desc.Input("X"); - PADDLE_ENFORCE(axis > 0, - "The axis attr of Concat op should be large than 0 for trt"); + // PADDLE_ENFORCE(axis > 0, + // "The axis attr of Concat op should be large than 0 for trt"); auto y_name = op_desc.Output("Out").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); diff --git a/paddle/fluid/inference/anakin/convert/density_prior_box.cc b/paddle/fluid/inference/anakin/convert/density_prior_box.cc index 9fdea12dcf8edc..565a95f17b12fc 100644 --- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc +++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc @@ -35,7 +35,7 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op, auto image_name = op_desc.Input("Image").front(); auto output_name = op_desc.Output("Boxes").front(); - auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); + auto op_name = op_desc.Type() + ":" + op_desc.Output("Boxes").front(); auto fixed_sizes = boost::get>(op_desc.GetAttr("fixed_sizes")); @@ -52,14 +52,18 @@ void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op, auto step_w = boost::get(op_desc.GetAttr("step_w")); auto offset = boost::get(op_desc.GetAttr("offset")); std::vector order = {"MIN", "COM", "MAX"}; + std::vector temp_v = {}; engine_->AddOp(op_name, "PriorBox", {input_name, image_name}, {output_name}); + engine_->AddOpAttr>(op_name, "min_size", temp_v); + engine_->AddOpAttr>(op_name, "max_size", temp_v); + engine_->AddOpAttr>(op_name, "aspect_ratio", temp_v); engine_->AddOpAttr>(op_name, "fixed_sizes", fixed_sizes); engine_->AddOpAttr>(op_name, "fixed_ratios", fixed_ratios); engine_->AddOpAttr>(op_name, "density", densities); - engine_->AddOpAttr(op_name, "flip", false); - engine_->AddOpAttr(op_name, "clip", clip); - engine_->AddOpAttr>(op_name, "variances", variances); + engine_->AddOpAttr(op_name, "is_flip", false); + engine_->AddOpAttr(op_name, "is_clip", clip); + engine_->AddOpAttr>(op_name, "variance", variances); engine_->AddOpAttr(op_name, "img_h", static_cast(0)); engine_->AddOpAttr(op_name, "img_w", static_cast(0)); engine_->AddOpAttr(op_name, "step_h", step_h); diff --git a/paddle/fluid/inference/anakin/convert/detection_out.cc b/paddle/fluid/inference/anakin/convert/detection_out.cc index 1dae346a02443b..67636651017cfb 100644 --- a/paddle/fluid/inference/anakin/convert/detection_out.cc +++ b/paddle/fluid/inference/anakin/convert/detection_out.cc @@ -39,10 +39,17 @@ void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op, auto code_type = boost::get(op_desc.GetAttr("code_type")); auto background_label = boost::get(op_desc.GetAttr("background_label")); auto score_threshold = boost::get(op_desc.GetAttr("score_threshold")); - auto nms_top_k = boost::get(op_desc.GetAttr("nms_top_k")); + auto nms_top_k = boost::get(op_desc.GetAttr("nms_top_k")); auto nms_threshold = boost::get(op_desc.GetAttr("nms_threshold")); auto nms_eta = boost::get(op_desc.GetAttr("nms_eta")); auto keep_top_k = boost::get(op_desc.GetAttr("keep_top_k")); + std::string anakin_code_type; + if (code_type == "decode_center_size") { + anakin_code_type = "CENTER_SIZE"; + } else if (code_type == "encode_center_size") { + PADDLE_THROW( + "Not support encode_center_size code_type in DetectionOut of anakin"); + } engine_->AddOp(op_name, "DetectionOutput", {target_name, scores_name, prior_box_name}, {output_name}); @@ -51,7 +58,7 @@ void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op, engine_->AddOpAttr(op_name, "class_num", static_cast(0)); engine_->AddOpAttr(op_name, "background_id", background_label); engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k); - engine_->AddOpAttr(op_name, "code_type", code_type); + engine_->AddOpAttr(op_name, "code_type", anakin_code_type); engine_->AddOpAttr(op_name, "conf_thresh", score_threshold); engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k); engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold); diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc index 8314a72129cd48..5a1b81bf5841cd 100644 --- a/paddle/fluid/inference/anakin/convert/flatten.cc +++ b/paddle/fluid/inference/anakin/convert/flatten.cc @@ -34,20 +34,11 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op, auto input = op_desc.Input("X").front(); auto output = op_desc.Output("Out").front(); - auto in_dims = scope.FindVar(input)->Get().dims(); int axis = boost::get(op_desc.GetAttr("axis")); + PADDLE_ENFORCE(axis == 1, + "the anakin flatten op converter now only support aixs == 1."); - int inner = 1; - int outer = 1; - for (int i = 0; i < in_dims.size(); i++) { - if (i < axis) { - outer *= in_dims[i]; - } else { - inner *= in_dims[i]; - } - } - - std::vector out_dims = {1, outer, inner, 1}; + std::vector out_dims = {0, -1, 1, 1}; auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front(); engine_->AddOp(op_name, "Reshape", {input}, {output}); engine_->AddOpAttr>(op_name, "dims", out_dims); @@ -57,4 +48,4 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op, } // namespace inference } // namespace paddle -REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(flatten2, FlattenOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index 0d214d82eb88b0..c29be735223a4a 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -47,6 +47,7 @@ class AnakinOpConverter { std::string op_type = op_desc.Type(); AnakinOpConverter *it = nullptr; + std::cout << op_type << std::endl; if (!it) { it = Registry::Global().Lookup(op_type); } diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc index eee36d2f37ea79..faea5261f3ffa9 100644 --- a/paddle/fluid/inference/anakin/convert/reshape.cc +++ b/paddle/fluid/inference/anakin/convert/reshape.cc @@ -49,4 +49,4 @@ void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op, } // namespace inference } // namespace paddle -REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(reshape2, ReshapeOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/test_concat_op.cc b/paddle/fluid/inference/anakin/convert/test_concat_op.cc index 0944e8dbdc90c3..8c6949cb2914bd 100644 --- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc @@ -44,6 +44,29 @@ TEST(concat_op, test) { validator.Execute(1); } +TEST(concat_op, test2) { + std::unordered_set parameters({""}); + framework::Scope scope; + AnakinConvertValidation validator(parameters, scope); + validator.DeclInputVar("concat_x1", {1, 4}); + validator.DeclInputVar("concat_x2", {3, 4}); + validator.DeclInputVar("concat_x3", {2, 4}); + validator.DeclOutputVar("concat_out", {6, 4}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("concat"); + desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"}); + desc.SetOutput("Out", {"concat_out"}); + + int axis = 0; + desc.SetAttr("axis", axis); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc index 61f03819407fb1..381fab786509a2 100644 --- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc @@ -27,13 +27,13 @@ TEST(flatten_op, test) { std::unordered_set parameters; framework::Scope scope; AnakinConvertValidation validator(parameters, scope); - validator.DeclInputVar("flatten-X", {3, 100, 100, 4}); - validator.DeclOutputVar("flatten-Out", {1, 300, 400, 1}); + validator.DeclInputVar("flatten-X", {3, 10, 10, 4}); + validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1}); framework::OpDesc desc; desc.SetType("flatten"); desc.SetInput("X", {"flatten-X"}); desc.SetOutput("Out", {"flatten-Out"}); - desc.SetAttr("axis", 2); + desc.SetAttr("axis", 1); LOG(INFO) << "set OP"; validator.SetOp(*desc.Proto()); diff --git a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc index 2c26cdc525f2fe..a60544e6507e67 100644 --- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc @@ -45,6 +45,27 @@ TEST(reshape, test) { validator.Execute(1); } +TEST(reshape, test2) { + framework::Scope scope; + std::unordered_set parameters; + AnakinConvertValidation validator(parameters, scope); + + validator.DeclInputVar("reshape-X", {1, 2, 4}); + validator.DeclOutputVar("reshape-Out", {1, 4, 2}); + + framework::OpDesc desc; + desc.SetType("reshape"); + desc.SetInput("X", {"reshape-X"}); + desc.SetOutput("Out", {"reshape-Out"}); + // desc.SetAttr("shape", std::vector({3, 2, 1, 3})); + desc.SetAttr("shape", std::vector({0, -1, 2})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + validator.Execute(1); +} + } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc index 40186915886bec..933e3eb2b24682 100644 --- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc @@ -27,9 +27,8 @@ TEST(softmax, test) { std::unordered_set parameters; AnakinConvertValidation validator(parameters, scope); - std::vector tensor_shape{8, 10}; - validator.DeclInputVar("softmax-X", {1, 10, 1, 1}); - validator.DeclOutputVar("softmax-Out", {1, 10, 1, 1}); + validator.DeclInputVar("softmax-X", {1, 10}); + validator.DeclOutputVar("softmax-Out", {1, 10}); framework::OpDesc desc; desc.SetType("softmax"); diff --git a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc index 20ef00b5aa2eca..67c9c23f1fdc94 100644 --- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc +++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc @@ -43,6 +43,28 @@ TEST(transpose_op, test) { validator.Execute(3); } +// test input shape's dims < 4 +TEST(transpose_op, test2) { + std::unordered_set parameters; + framework::Scope scope; + AnakinConvertValidation validator(parameters, scope); + validator.DeclInputVar("transpose-X", {3, 4, 5}); + validator.DeclOutputVar("transpose-Out", {3, 5, 4}); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("transpose"); + desc.SetInput("X", {"transpose-X"}); + desc.SetOutput("Out", {"transpose-Out"}); + desc.SetAttr("axis", std::vector({0, 2, 1})); + + LOG(INFO) << "set OP"; + validator.SetOp(*desc.Proto()); + LOG(INFO) << "execute"; + + validator.Execute(1); +} + } // namespace anakin } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc index 6333f5ef06a26b..74016cba7430e0 100644 --- a/paddle/fluid/inference/anakin/convert/transpose.cc +++ b/paddle/fluid/inference/anakin/convert/transpose.cc @@ -40,6 +40,11 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op, engine_->AddOp(op_name, "Permute", {input}, {output}); auto axis = boost::get>(op_desc.GetAttr("axis")); + size_t axis_size = axis.size(); + while (axis.size() < 4) { + axis.push_back(axis_size); + axis_size += 1; + } engine_->AddOpAttr>(op_name, "dims", axis); } @@ -47,4 +52,4 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op, } // namespace inference } // namespace paddle -REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(transpose2, TransposeOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h index 6b52b7607ac578..815a88fb4898c6 100644 --- a/paddle/fluid/inference/anakin/convert/ut_helper.h +++ b/paddle/fluid/inference/anakin/convert/ut_helper.h @@ -127,6 +127,9 @@ class AnakinConvertValidation { auto& t = inference::analysis::GetFromScope(scope_, input); auto t_shape = framework::vectorize2int(t.dims()); + while (t_shape.size() < 4) { + t_shape.push_back(1); + } engine_->SetInputShape(input, t_shape); } engine_->Optimize(); diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc index 01ceef696ddc9b..3166f68b67af22 100644 --- a/paddle/fluid/inference/anakin/op_teller.cc +++ b/paddle/fluid/inference/anakin/op_teller.cc @@ -31,8 +31,13 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("concat"); teller_set.insert("tanh"); teller_set.insert("conv2d"); - teller_set.insert("detection_out"); + teller_set.insert("batch_norm"); + teller_set.insert("softmax"); + teller_set.insert("flatten2"); + teller_set.insert("reshape2"); + teller_set.insert("transpose2"); teller_set.insert("density_prior_box"); + teller_set.insert("detection_out"); } bool operator()(const std::string& op_type, diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc index f57b89baab7582..1a4c24e800bc83 100644 --- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc @@ -45,7 +45,7 @@ std::unique_ptr analysis::AnakinSubgraphPass::ApplyImpl( return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op()); }; - SubGraphFuser fuser(graph.get(), teller, 0); + SubGraphFuser fuser(graph.get(), teller, 3 /* min_subgraph_size */); fuser(); for (auto *node : graph->Nodes()) { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index fa64786d706075..ac17c3f6beb63e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -779,5 +779,11 @@ USE_ANAKIN_CONVERTER(sigmoid); USE_ANAKIN_CONVERTER(tanh); USE_ANAKIN_CONVERTER(pool2d); USE_ANAKIN_CONVERTER(elementwise_add); +USE_ANAKIN_CONVERTER(batch_norm); +USE_ANAKIN_CONVERTER(flatten2); +USE_ANAKIN_CONVERTER(reshape2); +USE_ANAKIN_CONVERTER(transpose2); +USE_ANAKIN_CONVERTER(softmax); + USE_ANAKIN_CONVERTER(detection_out); USE_ANAKIN_CONVERTER(density_prior_box); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index f723f0ef0605b0..57c0b8546b4931 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -72,6 +72,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ "infer_clean_graph_pass", // "simplify_anakin_detection_pattern_pass3", "fc_fuse_pass", + "conv_elementwise_add_fuse_pass", "conv_bn_fuse_pass", "conv_elementwise_add_fuse_pass", "anakin_subgraph_pass", // "identity_scale_op_clean_pass", // /* From 5dc3f81a035b5dec01b3267974b78b148741e685 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Thu, 7 Mar 2019 17:13:30 +0000 Subject: [PATCH 4/5] refine anakin op converter --- paddle/fluid/inference/anakin/convert/batch_norm.cc | 2 +- paddle/fluid/inference/anakin/convert/flatten.cc | 2 +- paddle/fluid/inference/anakin/convert/op_converter.h | 5 ++++- paddle/fluid/inference/anakin/convert/reshape.cc | 2 +- paddle/fluid/inference/anakin/convert/transpose.cc | 2 +- paddle/fluid/inference/api/CMakeLists.txt | 5 +++++ paddle/fluid/inference/api/analysis_predictor.cc | 6 +++--- 7 files changed, 16 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/inference/anakin/convert/batch_norm.cc b/paddle/fluid/inference/anakin/convert/batch_norm.cc index bb5d406f981d40..ebe81dabcbc71f 100644 --- a/paddle/fluid/inference/anakin/convert/batch_norm.cc +++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/inference/anakin/convert/batch_norm.h" #include +#include #include #include #include @@ -41,7 +42,6 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op, auto output = op_desc.Output("Y").front(); auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front(); - bool is_test = boost::get(op_desc.GetAttr("is_test")); auto epsilon = boost::get(op_desc.GetAttr("epsilon")); auto bn_op_name = op_name + ":bn"; diff --git a/paddle/fluid/inference/anakin/convert/flatten.cc b/paddle/fluid/inference/anakin/convert/flatten.cc index 5a1b81bf5841cd..c6c372bbef87de 100644 --- a/paddle/fluid/inference/anakin/convert/flatten.cc +++ b/paddle/fluid/inference/anakin/convert/flatten.cc @@ -48,4 +48,4 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op, } // namespace inference } // namespace paddle -REGISTER_ANAKIN_OP_CONVERTER(flatten2, FlattenOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(flatten, FlattenOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h index c29be735223a4a..eb75f1f62ab92f 100644 --- a/paddle/fluid/inference/anakin/convert/op_converter.h +++ b/paddle/fluid/inference/anakin/convert/op_converter.h @@ -47,7 +47,10 @@ class AnakinOpConverter { std::string op_type = op_desc.Type(); AnakinOpConverter *it = nullptr; - std::cout << op_type << std::endl; + if (op_type == "reshape2") op_type = "reshape"; + if (op_type == "transpose2") op_type = "transpose"; + if (op_type == "flatten2") op_type = "flatten"; + if (!it) { it = Registry::Global().Lookup(op_type); } diff --git a/paddle/fluid/inference/anakin/convert/reshape.cc b/paddle/fluid/inference/anakin/convert/reshape.cc index faea5261f3ffa9..eee36d2f37ea79 100644 --- a/paddle/fluid/inference/anakin/convert/reshape.cc +++ b/paddle/fluid/inference/anakin/convert/reshape.cc @@ -49,4 +49,4 @@ void ReshapeOpConverter::operator()(const framework::proto::OpDesc &op, } // namespace inference } // namespace paddle -REGISTER_ANAKIN_OP_CONVERTER(reshape2, ReshapeOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(reshape, ReshapeOpConverter); diff --git a/paddle/fluid/inference/anakin/convert/transpose.cc b/paddle/fluid/inference/anakin/convert/transpose.cc index 74016cba7430e0..6a887401034f9d 100644 --- a/paddle/fluid/inference/anakin/convert/transpose.cc +++ b/paddle/fluid/inference/anakin/convert/transpose.cc @@ -52,4 +52,4 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op, } // namespace inference } // namespace paddle -REGISTER_ANAKIN_OP_CONVERTER(transpose2, TransposeOpConverter); +REGISTER_ANAKIN_OP_CONVERTER(transpose, TransposeOpConverter); diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 38313754ea98dc..29d21dc171c6d7 100644 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -64,3 +64,8 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI anakin_target(inference_anakin_api) anakin_target(inference_anakin_api_shared) endif() +if (WITH_ANAKIN_SUBGRAPH) + inference_analysis_test(test_anakin_model SRCS mobilenet_test.cc EXTRA_DEPS paddle_fluid) + inference_analysis_test(anakin_conv_model SRCS conv_anakin_test.cc EXTRA_DEPS paddle_fluid) + inference_analysis_test(life_feature_test SRCS life_feature_test.cc EXTRA_DEPS paddle_fluid) +endif() diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index ac17c3f6beb63e..49025e9269c30c 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -780,9 +780,9 @@ USE_ANAKIN_CONVERTER(tanh); USE_ANAKIN_CONVERTER(pool2d); USE_ANAKIN_CONVERTER(elementwise_add); USE_ANAKIN_CONVERTER(batch_norm); -USE_ANAKIN_CONVERTER(flatten2); -USE_ANAKIN_CONVERTER(reshape2); -USE_ANAKIN_CONVERTER(transpose2); +USE_ANAKIN_CONVERTER(flatten); +USE_ANAKIN_CONVERTER(reshape); +USE_ANAKIN_CONVERTER(transpose); USE_ANAKIN_CONVERTER(softmax); USE_ANAKIN_CONVERTER(detection_out); From 59ac472e1dea5b622b607c2fc251e072d5d42f66 Mon Sep 17 00:00:00 2001 From: nhzlx Date: Fri, 8 Mar 2019 03:06:23 +0000 Subject: [PATCH 5/5] add trt update for this branch --- Dockerfile | 5 +- paddle/fluid/framework/ir/fuse_pass_base.h | 5 + paddle/fluid/inference/analysis/argument.h | 6 + paddle/fluid/inference/analysis/helper.h | 31 ++ .../inference/analysis/ir_pass_manager.cc | 3 + .../inference/analysis/ir_pass_manager.h | 3 + .../ir_passes/tensorrt_subgraph_pass.cc | 267 +++++++++++++----- .../ir_passes/tensorrt_subgraph_pass.h | 12 +- .../ir_params_sync_among_devices_pass.cc | 11 + .../ir_params_sync_among_devices_pass.h | 1 + paddle/fluid/inference/api/analysis_config.cc | 4 +- .../fluid/inference/api/analysis_predictor.cc | 32 +++ .../fluid/inference/api/analysis_predictor.h | 9 + .../inference/api/details/zero_copy_tensor.cc | 60 +++- .../api/details/zero_copy_tensor_dummy.cc | 2 +- paddle/fluid/inference/api/helper.h | 5 + .../inference/api/paddle_analysis_config.h | 4 +- paddle/fluid/inference/api/paddle_api.h | 22 +- paddle/fluid/inference/engine.h | 5 - .../inference/tensorrt/convert/conv2d_op.cc | 21 +- .../tensorrt/convert/elementwise_op.cc | 3 +- .../fluid/inference/tensorrt/convert/fc_op.cc | 4 +- .../inference/tensorrt/convert/op_converter.h | 61 ++++ .../inference/tensorrt/convert/prelu_op.cc | 19 +- .../inference/tensorrt/convert/ut_helper.h | 85 +++--- paddle/fluid/inference/tensorrt/engine.cc | 143 +--------- paddle/fluid/inference/tensorrt/engine.h | 91 +++--- paddle/fluid/inference/tensorrt/helper.h | 29 ++ .../inference/tensorrt/plugin/CMakeLists.txt | 3 +- .../tensorrt/plugin/avg_pool_op_plugin.cu | 7 + .../tensorrt/plugin/avg_pool_op_plugin.h | 14 +- .../tensorrt/plugin/elementwise_op_plugin.cu | 11 +- .../tensorrt/plugin/elementwise_op_plugin.h | 20 +- .../tensorrt/plugin/prelu_op_plugin.cu | 15 +- .../tensorrt/plugin/prelu_op_plugin.h | 43 ++- .../tensorrt/plugin/split_op_plugin.cu | 6 + .../tensorrt/plugin/split_op_plugin.h | 9 +- .../inference/tensorrt/plugin/trt_plugin.h | 10 +- .../tensorrt/plugin/trt_plugin_factory.cc | 48 ++++ .../tensorrt/plugin/trt_plugin_factory.h | 78 +++++ .../{serialize.h => trt_plugin_utils.h} | 9 +- .../fluid/inference/tensorrt/test_engine.cc | 134 ++++++--- .../inference/tests/api/trt_models_tester.cc | 3 +- .../operators/tensorrt/tensorrt_engine_op.cc | 3 + .../operators/tensorrt/tensorrt_engine_op.h | 179 +++++------- .../tensorrt/tensorrt_engine_op_test.cc | 2 + paddle/fluid/pybind/inference_api.cc | 3 +- 47 files changed, 1005 insertions(+), 535 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h rename paddle/fluid/inference/tensorrt/plugin/{serialize.h => trt_plugin_utils.h} (96%) diff --git a/Dockerfile b/Dockerfile index fe0721e9b99b5e..c248ac119caa1f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -75,8 +75,9 @@ RUN curl -s -q https://glide.sh/get | sh # and its size is only one-third of the official one. # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - tar -xz -C /usr/local && \ + +RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz --no-check-certificate && \ + tar -zxf TensorRT-4.0.1.6-ubuntu14.04.x86_64-gnu.cuda.8.0.cudnn7.0.tar.gz -C /usr/local && \ cp -rf /usr/local/TensorRT/include /usr && \ cp -rf /usr/local/TensorRT/lib /usr diff --git a/paddle/fluid/framework/ir/fuse_pass_base.h b/paddle/fluid/framework/ir/fuse_pass_base.h index c53b2a6186741d..3a1022bbcbd671 100644 --- a/paddle/fluid/framework/ir/fuse_pass_base.h +++ b/paddle/fluid/framework/ir/fuse_pass_base.h @@ -14,6 +14,7 @@ #pragma once +#include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/scope.h" @@ -24,6 +25,10 @@ namespace ir { static const char kParamScopeAttr[] = "__param_scope__"; static const char kFuseStatisAttr[] = "__fuse_statis__"; +// When we use trt or other third_party lib, the parameters are managed by +// the lib, but not the fluid. So we need to record them to avoid duplicate +// allocation. +static const char kRepetitiveParamAttr[] = "__repetitive_param__"; enum FuseOptions { DO_NOT_FUSE, // fusing will not be done diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 2f31b182af7293..89e934ae27b931 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -23,8 +23,12 @@ #pragma once +#include #include +#include +#include #include + #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" @@ -133,6 +137,8 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_min_subgraph_size, TensorRtMinSubgraphSize, int); DECL_ARGUMENT_FIELD(tensorrt_precision_mode, TensorRtPrecisionMode, AnalysisConfig::Precision); + DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, + bool); // Memory optimized related. DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index 59107f28080dce..a48058400241b0 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -17,10 +17,12 @@ limitations under the License. */ #include #include #include +#include #include #include #include #include +#include #include #include "paddle/fluid/framework/framework.pb.h" @@ -217,6 +219,35 @@ static std::string GetTrtCalibTableData(const std::string &model_opt_cache_dir, return ""; } +static std::string GetTrtEngineSerializedPath(const std::string &model_root, + const std::string &engine_key) { + return model_root + "/trt_serialized_" + engine_key; +} + +static std::string GetTrtEngineSerializedData( + const std::string &model_opt_cache_dir, const std::string &engine_key) { + std::string trt_serialized_path = + GetTrtEngineSerializedPath(model_opt_cache_dir, engine_key); + if (FileExists(trt_serialized_path)) { + VLOG(3) << "Trt serialized file: " << trt_serialized_path + << "is found here"; + std::ifstream infile(trt_serialized_path, std::ios::in); + std::stringstream buffer; + buffer << infile.rdbuf(); + std::string trt_engine_serialized_data(buffer.str()); + return trt_engine_serialized_data; + } + return ""; +} + +static void SaveTrtEngineSerializedDataToFile( + const std::string &trt_serialized_path, + const std::string &engine_serialized_data) { + std::ofstream outfile(trt_serialized_path); + outfile << engine_serialized_data; + outfile.close(); +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 7eff099a8a26e0..638f3bc90875da 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -89,6 +89,9 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set( "model_opt_cache_dir", new std::string(GetOrCreateModelOptCacheDir(model_opt_cache_dir))); + pass->Set("gpu_device_id", new int(argument->gpu_device_id())); + pass->Set("use_static_engine", + new bool(argument->tensorrt_use_static_engine())); } pre_pass = pass_name; diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 2a595cb36b8345..2d120679eedd39 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -22,7 +22,10 @@ #pragma once +#include #include +#include +#include #include #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 69a9caec030600..d4e2da8957f205 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -14,13 +14,13 @@ #include #include -#include -#include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/ir_passes/subgraph_detector.h" #include "paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/string/pretty_log.h" @@ -33,8 +33,15 @@ using framework::ir::Node; std::vector ExtractParameters( const std::unordered_set &nodes); -std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map); +std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( std::unique_ptr graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph.get()); @@ -47,9 +54,16 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( Get("min_subgraph_size") /*min subgraph size*/); fuser(); + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in trt, and should not have another copy in + // fluid. + std::vector repetitive_params; + for (auto *node : graph->Nodes()) { if (node->IsOp() && !Agent(node).subgraph()->empty()) { - CreateTensorRTOp(node, graph.get()); + CreateTensorRTOp(node, graph.get(), graph_param_names, + &repetitive_params); std::unordered_set nodes2remove( Agent(node).subgraph()->begin(), Agent(node).subgraph()->end()); @@ -64,12 +78,15 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( } } framework::ir::GraphSafeRemoveNodes(graph.get(), nodes2remove); + graph->Set(framework::ir::kRepetitiveParamAttr, + new std::vector(repetitive_params)); return graph; } std::string GenerateEngineKey(const std::set &engine_inputs, - const std::set &engine_outputs) { + const std::set &engine_outputs, + const std::string &predictor_id) { std::string engine_hash_key = ""; for (auto name : engine_inputs) { engine_hash_key += name; @@ -77,12 +94,15 @@ std::string GenerateEngineKey(const std::set &engine_inputs, for (auto name : engine_outputs) { engine_hash_key += name; } + engine_hash_key += predictor_id; auto engine_key = std::to_string(std::hash()(engine_hash_key)); return engine_key; } -void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, - Graph *graph) const { +void TensorRtSubgraphPass::CreateTensorRTOp( + framework::ir::Node *node, Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { auto *op_desc = node->Op(); auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -116,12 +136,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // is unique. std::set input_names; std::set input_names_with_id; + std::vector params; + + // The node->inputs containes input tensors and parameters. for (auto *x : node->inputs) { input_names.insert(x->Name()); input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } } - op_desc->SetInput( - "Xs", std::vector(input_names.begin(), input_names.end())); std::set output_names; std::set output_names_with_id; @@ -130,11 +154,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, output_names_with_id.insert(x->Name() + std::to_string(x->id())); } - op_desc->SetOutput( - "Ys", std::vector(output_names.begin(), output_names.end())); - op_desc->SetType("tensorrt_engine"); - std::unordered_map output_name_map; + auto &subgraph_nodes = *Agent(node).subgraph(); // The following procedure is used to rename all the intermediate // variables and the output variables of the subgraph. @@ -148,61 +169,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, // input of a OP, but also the output of a Op, there will be problems. // So we have to rename the variable in the subgraph to make sure // it is either an OP's input or an OP's output. - - auto &subgraph_nodes = *Agent(node).subgraph(); - for (size_t index = 0; index < block_desc.OpSize(); ++index) { - framework::proto::OpDesc *op = block_desc.Op(index)->Proto(); - auto correspond_node = subgraph_nodes[index]; - PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); - - std::unordered_map var2id; - for (auto *in_var : correspond_node->inputs) { - var2id[in_var->Name()] = in_var->id(); - } - // rename for the input variables of op inside subgraph - for (int i = 0; i < op->inputs_size(); i++) { - // one input - auto *in_var = op->mutable_inputs(i); - std::vector replaced_names; - for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments - std::string arg_value = in_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (input_names_with_id.count(arg_value_with_id)) { - replaced_names.push_back(arg_value); - } else { - replaced_names.push_back(arg_value_with_id); - } - } - in_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - in_var->add_arguments(replaced_names[k]); - } - } - var2id.clear(); - for (auto out_var : correspond_node->outputs) { - var2id[out_var->Name()] = out_var->id(); - } - - // rename for the output variables of op inside subgraph - for (int i = 0; i < op->outputs_size(); i++) { - framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); - std::vector replaced_names; - for (int k = 0; k < out_var->arguments_size(); k++) { - std::string arg_value = out_var->arguments(k); - std::string arg_value_with_id = - arg_value + std::to_string(var2id[arg_value]); - if (output_names_with_id.count(arg_value_with_id)) { - output_name_map[arg_value] = arg_value_with_id; - } - replaced_names.push_back(arg_value_with_id); - } - out_var->clear_arguments(); - for (size_t k = 0; k < replaced_names.size(); k++) { - out_var->add_arguments(replaced_names[k]); - } - } - } + RenameAndGetOutputs(subgraph_nodes, &block_desc, input_names_with_id, + &output_names_with_id, &output_names, &output_name_map); // When tensorrt engine runs at the end of the operation, // output_mapping help us copy the data from the renamed ITensor @@ -212,6 +180,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, PADDLE_ENFORCE(output_name_map.count(name) != 0); output_mapping.push_back(output_name_map[name]); } + PADDLE_ENFORCE(!output_mapping.empty()); auto *vars = block_desc.Proto()->mutable_vars(); for (framework::ir::Node *node : graph->Nodes()) { @@ -222,26 +191,83 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); - PADDLE_ENFORCE(!output_mapping.empty()); + + // Set attrs + op_desc->SetType("tensorrt_engine"); + op_desc->SetInput( + "Xs", std::vector(input_names.begin(), input_names.end())); + + op_desc->SetOutput( + "Ys", std::vector(output_names.begin(), output_names.end())); + op_desc->SetBlockAttr("sub_block", new_block); SetAttr(op_desc->Proto(), "subgraph", block_desc.Proto()->SerializeAsString()); - // Set attrs SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); - SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); + SetAttr(op_desc->Proto(), "parameters", params); auto enable_int8 = Get("enable_int8"); - auto engine_key = - GenerateEngineKey(input_names_with_id, output_names_with_id); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + // Get "" when there is no cached calibration table data. std::string calibration_data = GetTrtCalibTableData( Get("model_opt_cache_dir"), engine_key, enable_int8); SetAttr(op_desc->Proto(), "calibration_data", calibration_data); SetAttr(op_desc->Proto(), "enable_int8", enable_int8); SetAttr(op_desc->Proto(), "engine_key", engine_key); + SetAttr(op_desc->Proto(), "engine_serialized_data", std::string("")); + + std::unique_ptr calibrator; + if (enable_int8 && calibration_data.size() != 0) { + calibrator.reset(new tensorrt::TRTInt8Calibrator(calibration_data)); + } + + bool use_static_engine = Get("use_static_engine"); + // When in int8 mode and calibration_mode, the program just produce the + // calibration table data. + bool calibration_mode = (enable_int8 && calibration_data.size() == 0); + if (!calibration_mode && use_static_engine) { + std::copy(params.begin(), params.end(), + std::back_inserter(*repetitive_params)); + std::string trt_engine_serialized_data = GetTrtEngineSerializedData( + Get("model_opt_cache_dir"), engine_key); + + if (trt_engine_serialized_data.empty()) { + LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " + "kernel etc). This process may cost a lot of time."; + std::unique_ptr trt_engine( + new tensorrt::TensorRTEngine( + Get("max_batch_size"), Get("workspace_size"), + enable_int8, calibrator.get(), Get("gpu_device_id"))); + auto *scope = param_scope(); + framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto()); + std::unordered_set param_set(params.begin(), params.end()); + inference::Singleton::Global() + .ConvertBlockToTRTEngine( + &block_desc_temp, *scope, + std::vector(input_names.begin(), input_names.end()), + param_set, output_mapping, trt_engine.get()); + nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize(); + trt_engine_serialized_data = + std::string((const char *)serialized_engine_data->data(), + serialized_engine_data->size()); + SaveTrtEngineSerializedDataToFile( + GetTrtEngineSerializedPath(Get("model_opt_cache_dir"), + engine_key), + trt_engine_serialized_data); + } else { + LOG(INFO) << "Load TRT Optimized Info from " + << GetTrtEngineSerializedPath( + Get("model_opt_cache_dir"), engine_key); + } + + SetAttr(op_desc->Proto(), "engine_serialized_data", + trt_engine_serialized_data); + } } std::vector ExtractParameters( @@ -253,7 +279,7 @@ std::vector ExtractParameters( for (const auto &node : nodes) { if (!node->IsOp()) continue; std::string op_type = node->Op()->Type(); - if (op_type == "feed") { + if (op_type == "feed" || op_type == "fetch") { std::vector output_names = node->Op()->OutputArgumentNames(); std::copy(output_names.begin(), output_names.end(), std::back_inserter(feed_outputs)); @@ -272,6 +298,99 @@ std::vector ExtractParameters( return parameters; } +void RenameAndGetOutputs( + const std::vector &subgraph_nodes, + framework::BlockDesc *block_desc, + const std::set &input_names_with_id, + std::set *output_names_with_id, + std::set *output_names, + std::unordered_map *output_name_map) { + //// In the normal case, the paddle-trt exists bug when runing the googlenet. + // When there are more than two convolutions of 1 * 1 with the same input, the + // paddle-tensorrt will do the merging optimization, which fuse those conv + // into one conv, and then trigger bug. So, We should use strategy to avoid + // this optimization for the time being. This bug will be fixed in the future. + std::unordered_map + same_hierarchy_conv2d_num_map; + + for (size_t index = 0; index < block_desc->OpSize(); ++index) { + framework::proto::OpDesc *op = block_desc->Op(index)->Proto(); + framework::OpDesc op_desc(*op, nullptr); + auto correspond_node = subgraph_nodes[index]; + PADDLE_ENFORCE_EQ(correspond_node->Name(), op->type()); + + std::unordered_map var2id; + std::unordered_map in_vars; + for (auto *in_var : correspond_node->inputs) { + var2id[in_var->Name()] = in_var->id(); + in_vars[in_var->Name()] = in_var; + } + // rename for the input variables of op inside subgraph + for (int i = 0; i < op->inputs_size(); i++) { + // one input + auto *in_var = op->mutable_inputs(i); + std::vector replaced_names; + for (int k = 0; k < in_var->arguments_size(); k++) { // all the arguments + std::string arg_value = in_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (input_names_with_id.count(arg_value_with_id)) { + replaced_names.push_back(arg_value); + } else { + replaced_names.push_back(arg_value_with_id); + } + } + in_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + in_var->add_arguments(replaced_names[k]); + } + } + var2id.clear(); + for (auto out_var : correspond_node->outputs) { + var2id[out_var->Name()] = out_var->id(); + } + + if (op_desc.Type() == "conv2d") { + auto input_var_name = op_desc.Input("Input").front(); + auto filter_var_name = op_desc.Input("Filter").front(); + auto out_var_name = op_desc.Output("Output").front(); + auto filter_shape = in_vars[filter_var_name]->Var()->GetShape(); + const std::vector strides = + boost::get>(op_desc.GetAttr("strides")); + const std::vector paddings = + boost::get>(op_desc.GetAttr("paddings")); + if (same_hierarchy_conv2d_num_map[input_var_name] > 0) { + (*output_names_with_id) + .insert(out_var_name + std::to_string(var2id[out_var_name])); + (*output_names).insert(out_var_name); + } else if (filter_shape[2] == 1 && filter_shape[3] == 1 && + strides[0] == 1 && strides[1] == 1 && paddings[0] == 0 && + paddings[1] == 0) { + same_hierarchy_conv2d_num_map[input_var_name] += 1; + } + } + + // rename for the output variables of op inside subgraph + for (int i = 0; i < op->outputs_size(); i++) { + framework::proto::OpDesc_Var *out_var = op->mutable_outputs(i); + std::vector replaced_names; + for (int k = 0; k < out_var->arguments_size(); k++) { + std::string arg_value = out_var->arguments(k); + std::string arg_value_with_id = + arg_value + std::to_string(var2id[arg_value]); + if (output_names_with_id->count(arg_value_with_id)) { + (*output_name_map)[arg_value] = arg_value_with_id; + } + replaced_names.push_back(arg_value_with_id); + } + out_var->clear_arguments(); + for (size_t k = 0; k < replaced_names.size(); k++) { + out_var->add_arguments(replaced_names[k]); + } + } + } +} + } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h index 502353b95fc15e..6689a668fc9313 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.h @@ -13,7 +13,12 @@ // limitations under the License. #pragma once -#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/pass.h" namespace paddle { @@ -26,8 +31,9 @@ class TensorRtSubgraphPass : public framework::ir::FusePassBase { std::unique_ptr graph) const override; private: - void CreateTensorRTOp(framework::ir::Node *x, - framework::ir::Graph *graph) const; + void CreateTensorRTOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; void CleanIntermediateOutputs(framework::ir::Node *node); }; diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 50ffb6e9410933..8360963f7366e7 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -32,6 +32,13 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { if (!argument->use_gpu()) return; return; + auto &graph = argument->main_graph(); + std::vector repetitive_params; + + if (graph.Has(framework::ir::kRepetitiveParamAttr)) + repetitive_params = graph.Get>( + framework::ir::kRepetitiveParamAttr); + LOG(INFO) << "Sync params from CPU to GPU"; PADDLE_ENFORCE(argument->gpu_device_id_valid()); @@ -44,6 +51,10 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { // Because there exists the case that new parameter variables are not added to // the program in the analysis pass. for (auto &var_name : all_vars) { + if (std::count(repetitive_params.begin(), repetitive_params.end(), + var_name)) { + continue; + } auto *var = scope->FindLocalVar(var_name); PADDLE_ENFORCE(var != nullptr); if (var->IsType() || diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h index a95f460df6f963..61990150a30db1 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h @@ -17,6 +17,7 @@ #include #include +#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/analysis_pass.h" #include "paddle/fluid/platform/place.h" diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 522ab495227e9b..77411112220dcb 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -103,6 +103,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_precision_mode_); + CP_MEMBER(trt_use_static_engine_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -144,7 +145,7 @@ void AnalysisConfig::EnableMKLDNN() { void AnalysisConfig::EnableTensorRtEngine( int workspace_size, int max_batch_size, int min_subgraph_size, - AnalysisConfig::Precision precision_mode) { + AnalysisConfig::Precision precision_mode, bool use_static) { #ifdef PADDLE_WITH_CUDA if (!use_gpu()) { LOG(ERROR) << "To use TensorRT engine, please call EnableGpu() first"; @@ -156,6 +157,7 @@ void AnalysisConfig::EnableTensorRtEngine( tensorrt_max_batchsize_ = max_batch_size; tensorrt_min_subgraph_size_ = min_subgraph_size; tensorrt_precision_mode_ = precision_mode; + trt_use_static_engine_ = use_static; Update(); #else diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 49025e9269c30c..801363a532bff4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -364,6 +364,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); argument_.SetTensorRtMinSubgraphSize(config_.tensorrt_min_subgraph_size_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); + argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_); } if (config_.use_mkldnn_) { @@ -437,12 +438,14 @@ void AnalysisPredictor::PrepareFeedFetch() { } feeds_[idx] = op; feed_names_[op->Output("Out")[0]] = idx; + idx2feeds_[idx] = op->Output("Out")[0]; } else if (op->Type() == "fetch") { int idx = boost::get(op->GetAttr("col")); if (fetches_.size() <= static_cast(idx)) { fetches_.resize(idx + 1); } fetches_[idx] = op; + idx2fetches_[idx] = op->Input("X")[0]; } } } @@ -455,6 +458,22 @@ void AnalysisPredictor::CreateFeedFetchVar(framework::Scope *scope) { var->GetMutable(); } +std::vector AnalysisPredictor::GetInputNames() { + std::vector input_names; + for (auto &item : idx2feeds_) { + input_names.push_back(item.second); + } + return input_names; +} + +std::vector AnalysisPredictor::GetOutputNames() { + std::vector output_names; + for (auto &item : idx2fetches_) { + output_names.push_back(item.second); + } + return output_names; +} + std::unique_ptr AnalysisPredictor::GetInputTensor( const std::string &name) { PADDLE_ENFORCE(executor_->scope()->FindVar(name), "no name called %s", name); @@ -462,6 +481,13 @@ std::unique_ptr AnalysisPredictor::GetInputTensor( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = true; res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } + return res; } @@ -472,6 +498,12 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( new ZeroCopyTensor(static_cast(executor_->scope()))); res->input_or_output_ = false; res->SetName(name); + if (platform::is_cpu_place(place_)) { + res->SetPlace(PaddlePlace::kCPU); + } else { + auto gpu_place = boost::get(place_); + res->SetPlace(PaddlePlace::kGPU, gpu_place.GetDeviceId()); + } return res; } diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index d5445c58e45ae6..5c0535d63e00c3 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -15,12 +15,14 @@ #pragma once #include #include +#include #include #include #include "paddle/fluid/framework/naive_executor.h" #include "paddle/fluid/inference/analysis/analyzer.h" #include "paddle/fluid/inference/api/api_impl.h" #include "paddle/fluid/inference/api/details/reset_tensor_array.h" +#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/string/printf.h" #ifdef PADDLE_WITH_TESTING @@ -53,6 +55,9 @@ class AnalysisPredictor : public PaddlePredictor { std::vector *output_data, int batch_size = -1) override; + std::vector GetInputNames(); + std::vector GetOutputNames(); + std::unique_ptr GetInputTensor( const std::string &name) override; std::unique_ptr GetOutputTensor( @@ -131,7 +136,11 @@ class AnalysisPredictor : public PaddlePredictor { std::shared_ptr inference_program_; std::vector feeds_; std::map feed_names_; + // Sorted according to the idx. + std::map idx2feeds_; std::vector fetches_; + std::map idx2fetches_; + // Memory buffer for feed inputs. The temporary LoDTensor will cause serious // concurrency problems, wrong results and memory leak, so cache them. std::vector feed_tensors_; diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index f60ff40c5da3e9..cf02901d963858 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -73,6 +74,61 @@ T *ZeroCopyTensor::data(PaddlePlace *place, int *size) const { return res; } +template +void ZeroCopyTensor::copy_from_cpu(const T *data) { + EAGER_GET_TENSOR; + PADDLE_ENFORCE_GE( + tensor->numel(), 0, + "You should call ZeroCopyTensor::Reshape(const std::vector &shape)" + "function before copy data from cpu."); + size_t ele_size = tensor->numel() * sizeof(T); + + if (place_ == PaddlePlace::kCPU) { + auto *t_data = tensor->mutable_data(platform::CPUPlace()); + std::memcpy(static_cast(t_data), data, ele_size); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + platform::CUDAPlace gpu_place(device_); + auto *t_data = tensor->mutable_data(gpu_place); + auto *dev_ctx = + static_cast(pool.Get(gpu_place)); + + memory::Copy(gpu_place, static_cast(t_data), platform::CPUPlace(), + data, ele_size, dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} + +template +void ZeroCopyTensor::copy_to_cpu(T *data) { + EAGER_GET_TENSOR; + auto ele_num = tensor->numel(); + auto *t_data = tensor->data(); + auto t_place = tensor->place(); + + if (platform::is_cpu_place(t_place)) { + std::memcpy(static_cast(data), t_data, ele_num * sizeof(T)); + } else { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto gpu_place = boost::get(t_place); + auto *dev_ctx = + static_cast(pool.Get(gpu_place)); + memory::Copy(platform::CPUPlace(), static_cast(data), gpu_place, + t_data, ele_num * sizeof(T), dev_ctx->stream()); +#else + PADDLE_THROW("Not compile with CUDA, should not reach here."); +#endif + } +} +template void ZeroCopyTensor::copy_from_cpu(const float *data); +template void ZeroCopyTensor::copy_from_cpu(const int64_t *data); +template void ZeroCopyTensor::copy_to_cpu(float *data); +template void ZeroCopyTensor::copy_to_cpu(int64_t *data); + template float *ZeroCopyTensor::data(PaddlePlace *place, int *size) const; template int64_t *ZeroCopyTensor::data(PaddlePlace *place, @@ -92,10 +148,10 @@ void *ZeroCopyTensor::FindTensor() const { return tensor; } -std::vector ZeroCopyTensor::shape() const { +std::vector ZeroCopyTensor::shape() const { EAGER_GET_TENSOR; PADDLE_ENFORCE(tensor_, "not found tensor called %s in the scope", name_); - return framework::vectorize(tensor->dims()); + return framework::vectorize2int(tensor->dims()); } void ZeroCopyTensor::SetLoD(const std::vector> &x) { diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 12071e09f8442f..cbbb3ea2d1395a 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -37,7 +37,7 @@ template int64_t *ZeroCopyTensor::mutable_data(PaddlePlace place); void *ZeroCopyTensor::FindTensor() const { return nullptr; } -std::vector ZeroCopyTensor::shape() const { return {}; } +std::vector ZeroCopyTensor::shape() const { return {}; } void ZeroCopyTensor::SetLoD(const std::vector> &x) {} diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index b92781e4f2c612..ec3bef42fd91ce 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -50,6 +50,11 @@ class Timer { } }; +static int GetUniqueId() { + static int id = 0; + return id++; +} + static void split(const std::string &str, char sep, std::vector *pieces) { pieces->clear(); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index c1c6227cdd8b20..9b05c335047d7f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -135,7 +135,8 @@ struct AnalysisConfig { */ void EnableTensorRtEngine(int workspace_size = 1 << 20, int max_batch_size = 1, int min_subgraph_size = 3, - Precision precision = Precision::kFloat32); + Precision precision = Precision::kFloat32, + bool use_static = true); /** A boolean state telling whether the TensorRT engine is used. */ bool tensorrt_engine_enabled() const { return use_tensorrt_; } @@ -233,6 +234,7 @@ struct AnalysisConfig { // subgraph, 3 as default value. int tensorrt_min_subgraph_size_{3}; Precision tensorrt_precision_mode_; + bool trt_use_static_engine_; // memory reuse related. bool enable_memory_optim_{false}; diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index c9a45b4aa3b403..f807289f6aee06 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -160,11 +160,21 @@ class ZeroCopyTensor { template T* data(PaddlePlace* place, int* size) const; - std::vector shape() const; + template + void copy_from_cpu(const T* data); + + template + void copy_to_cpu(T* data); + + std::vector shape() const; void SetLoD(const std::vector>& x); std::vector> lod() const; const std::string& name() const { return name_; } + void SetPlace(PaddlePlace place, int device = -1) { + place_ = place; + device_ = device; + } protected: explicit ZeroCopyTensor(void* scope) : scope_{scope} {} @@ -179,6 +189,8 @@ class ZeroCopyTensor { // The corresponding tensor pointer inside Paddle workspace is cached for // performance. mutable void* tensor_{nullptr}; + PaddlePlace place_; + int device_; }; /** A simple Inference API for Paddle. @@ -200,6 +212,14 @@ class PaddlePredictor { std::vector* output_data, int batch_size = -1) = 0; + /** \brief Get input names of the model + */ + virtual std::vector GetInputNames() { return {}; } + + /** \brief Get output names of the model + */ + virtual std::vector GetOutputNames() { return {}; } + /** \brief Get a mutable tensor directly. * * NOTE Only works in AnalysisPredictor. diff --git a/paddle/fluid/inference/engine.h b/paddle/fluid/inference/engine.h index ce2b8161715a3f..1a13ba510384c0 100644 --- a/paddle/fluid/inference/engine.h +++ b/paddle/fluid/inference/engine.h @@ -49,11 +49,6 @@ class EngineBase { // Execute the engine, that will run the inference network. virtual void Execute(int batch_size) = 0; - // Return the IO buffer that allocated in engine. One can read/write directly - // on the buffer. If the buffer's buffer is nullptr, one can also allocate - // memory and maintain it outside the engine. - virtual Buffer& buffer(const std::string& name) = 0; - virtual ~EngineBase() {} }; // class EngineBase diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 7900f56c9ce17f..39a99a21ea7020 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -18,21 +18,6 @@ namespace paddle { namespace inference { namespace tensorrt { -bool to_skip_merging_optimize(TensorRTEngine* engine, - const std::vector& filters, - const std::vector& strides, - const std::vector& paddings, - std::string input_name) { - if (engine->itensor_quote_num[input_name] > 0) { - return true; - } - if (filters[0] == 1 && filters[1] == 1 && strides[0] == 1 && - strides[1] == 1 && paddings[0] == 0 && paddings[1] == 0) - engine->itensor_quote_num[input_name] += 1; - - return false; -} - template void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode, @@ -59,7 +44,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, weight_tensor->Resize(Y_t->dims()); TensorCopySync((*Y_t), cpu_place, weight_tensor.get()); - auto* weight_data = weight_tensor->mutable_data(platform::CPUPlace()); + auto* weight_data = weight_tensor->mutable_data(cpu_place); PADDLE_ENFORCE_EQ(weight_tensor->dims().size(), 4UL); const int n_output = weight_tensor->dims()[0]; @@ -100,9 +85,7 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, layer->getOutput(0)->setName(output_name.c_str()); engine->SetITensor(output_name, layer->getOutput(0)); - if (test_mode || - to_skip_merging_optimize(engine, {filter_h, filter_w}, strides, paddings, - op_desc.Input("Input").front())) { + if (test_mode) { engine->DeclareOutput(output_name); } } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 79362f96770102..0c5a1a6ef16f05 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -153,7 +153,6 @@ class ElementwiseTensorOpConverter : public OpConverter { if (CheckDims(dims_x, dims_y)) { // The two input tensor should have the same dims VLOG(3) << "Convert a fluid elementwise op to TensorRT IElementWiseLayer"; - nvinfer1::IElementWiseLayer* layer = TRT_ENGINE_ADD_LAYER( engine_, ElementWise, *const_cast(X), *const_cast(Y), op_pair->second); @@ -166,7 +165,7 @@ class ElementwiseTensorOpConverter : public OpConverter { "ElementWisePluginLayer"; plugin::ElementWisePlugin* plugin = - new plugin::ElementWisePlugin(op_pair->second, dims_x, dims_y, axis); + new plugin::ElementWisePlugin(op_type_, dims_x, dims_y, axis); plugin->AddInput(X); plugin->AddInput(Y); nvinfer1::IPluginLayer* layer = engine_->AddPlugin( diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index eef4fab4e86f05..42dcd68e40e04e 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -85,10 +85,10 @@ class FcOpConverter : public OpConverter { Y_t->dims()[0] * Y_t->dims()[1] * sizeof(float)); TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, static_cast(weight_data), - Y_t->memory_size() / sizeof(float)}; + static_cast(Y_t->numel())}; TensorRTEngine::Weight tmp_weight(nvinfer1::DataType::kFLOAT, static_cast(tmp->data()), - Y_t->memory_size() / sizeof(float)); + static_cast(Y_t->numel())); weight.dims.assign({Y_t->dims()[0], Y_t->dims()[1]}); tmp_weight.dims = weight.dims; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index cc9a370e7db570..55515569ead6e4 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -17,9 +17,11 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -27,6 +29,37 @@ namespace paddle { namespace inference { namespace tensorrt { +using FluidDT = framework::proto::VarType_Type; +using TRT_DT = nvinfer1::DataType; + +namespace { // NOLINT + +TRT_DT FluidDataType2TRT(FluidDT type) { + switch (type) { + case FluidDT::VarType_Type_FP32: + return TRT_DT::kFLOAT; + case FluidDT::VarType_Type_INT32: + return TRT_DT::kINT32; + default: + return TRT_DT::kINT32; + } + PADDLE_THROW("unkown type"); + return TRT_DT::kINT32; +} + +nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape) { + PADDLE_ENFORCE_GT(shape.size(), 1UL, + "TensorRT' tensor input requires at least 2 dimensions"); + PADDLE_ENFORCE_LE(shape.size(), 4UL, + "TensorRT' tensor input requires at most 4 dimensions"); + PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); + if (shape.size() == 4UL) + return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); + return nvinfer1::DimsCHW(shape[1], 1, 1); +} + +} // namespace // NOLINT + /* * Convert Op from Fluid to TensorRT Engine. */ @@ -111,6 +144,34 @@ class OpConverter { } } + // The scope here should be inited with the parameter vars. + void ConvertBlockToTRTEngine( + framework::BlockDesc* block_desc, const framework::Scope& scope, + const std::vector& inputs, + const std::unordered_set& parameters, + const std::vector& outputs, TensorRTEngine* engine) { + engine->InitNetwork(); + for (auto& input : inputs) { + if (parameters.count(input)) continue; + auto* var = block_desc->FindVar(input); + PADDLE_ENFORCE(var, "no variable called %s", input); + PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, + "TensorRT engine only takes LoDTensor as input"); + auto var_shape = var->GetShape(); + + engine->DeclareInput( + input, FluidDataType2TRT( + var->Proto()->type().lod_tensor().tensor().data_type()), + Vec2TRT_Dims(var_shape)); + } + framework::proto::BlockDesc* block_proto = block_desc->Proto(); + ConvertBlock(*block_proto, parameters, scope, engine); + for (auto& output : outputs) { + engine->DeclareOutput(output); + } + engine->FreezeNetwork(); + } + void SetEngine(TensorRTEngine* engine) { engine_ = engine; } virtual ~OpConverter() {} diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index dbdff85ddebc85..2ae804106e5f7b 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -43,23 +43,20 @@ class PReluOpConverter : public OpConverter { PADDLE_ENFORCE_NOT_NULL(alpha_var); auto* alpha_tensor = alpha_var->GetMutable(); - platform::CUDAPlace place; - std::unique_ptr alpha_tensor_device( + platform::CPUPlace cpu_place; + std::unique_ptr alpha_tensor_temp( new framework::LoDTensor()); - alpha_tensor_device->Resize(alpha_tensor->dims()); - TensorCopySync(*alpha_tensor, place, alpha_tensor_device.get()); - float* alpha_data = alpha_tensor_device->mutable_data(place); + alpha_tensor_temp->Resize(alpha_tensor->dims()); + TensorCopySync(*alpha_tensor, cpu_place, alpha_tensor_temp.get()); + float* alpha_data = alpha_tensor_temp->mutable_data(cpu_place); - // Transform alpha to TensorRTEngine::Weight - TensorRTEngine::Weight alpha_rt(nvinfer1::DataType::kFLOAT, - static_cast(alpha_data), - alpha_tensor_device->numel()); - plugin::PReluPlugin* plugin = new plugin::PReluPlugin(alpha_rt, mode); + plugin::PReluPlugin* plugin = + new plugin::PReluPlugin(alpha_data, alpha_tensor_temp->numel(), mode); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); // keep alpha tensor to avoid release it's memory engine_->weight_map[op_desc.Input("Alpha")[0]] = - std::move(alpha_tensor_device); + std::move(alpha_tensor_temp); std::string layer_name = "prelu (Output: "; auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/ut_helper.h b/paddle/fluid/inference/tensorrt/convert/ut_helper.h index e83961f3d7bda0..2571abbf69892d 100644 --- a/paddle/fluid/inference/tensorrt/convert/ut_helper.h +++ b/paddle/fluid/inference/tensorrt/convert/ut_helper.h @@ -19,7 +19,9 @@ limitations under the License. */ #pragma once +#include #include +#include #include #include "paddle/fluid/framework/lod_tensor.h" @@ -79,7 +81,8 @@ class TRTConvertValidation { if_add_batch_(if_add_batch), max_batch_size_(max_batch_size) { PADDLE_ENFORCE_EQ(cudaStreamCreate(&stream_), 0); - engine_.reset(new TensorRTEngine(max_batch_size, workspace_size, stream_)); + engine_.reset( + new TensorRTEngine(max_batch_size, workspace_size, false, nullptr, 0)); engine_->InitNetwork(); } @@ -114,13 +117,12 @@ class TRTConvertValidation { } void DeclVar(const std::string& name, const std::vector dim_vec) { - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); + platform::CUDADeviceContext ctx(place_); auto* x = scope_.Var(name); auto* x_tensor = x->GetMutable(); x_tensor->Resize(framework::make_ddim(dim_vec)); - RandomizeTensor(x_tensor, place, ctx); + RandomizeTensor(x_tensor, place_, ctx); } // Declare a variable in a fluid Scope. void DeclVar(const std::string& name, const nvinfer1::Dims& dims, @@ -146,19 +148,6 @@ class TRTConvertValidation { // Declare outputs. op_desc_.reset(new framework::OpDesc(desc, nullptr)); - - // Set Inputs. - for (const auto& input : op_desc_->InputArgumentNames()) { - if (parameters_.count(input)) continue; - auto* var = scope_.FindVar(input); - PADDLE_ENFORCE(var); - auto tensor = var->GetMutable(); - - engine_->SetInputFromGPU( - input, static_cast(tensor->data()), - sizeof(float) * - analysis::AccuDims(tensor->dims(), tensor->dims().size())); - } } // We use the set 'neglected_output' here, because some Ops like batch norm, @@ -168,43 +157,71 @@ class TRTConvertValidation { std::unordered_set neglected_output = {}) { // Execute Fluid Op PADDLE_ENFORCE_LE(batch_size, max_batch_size_); - platform::CUDAPlace place; - platform::CUDADeviceContext ctx(place); - op_->Run(scope_, place); - // Execute TRT. - engine_->Execute(batch_size); - cudaStreamSynchronize(engine_->stream()); + platform::CUDADeviceContext ctx(place_); + op_->Run(scope_, place_); - ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); - const size_t output_space_size = 3000; + std::vector input_output_names; + + // Note: we need filter the parameter + for (const auto& input : op_desc_->InputArgumentNames()) { + if (parameters_.count(input)) continue; + input_output_names.push_back(input); + } + + // Collect the fluid outputs. + std::vector> fluid_outs; for (const auto& output : op_desc_->OutputArgumentNames()) { if (neglected_output.count(output)) continue; + input_output_names.push_back(output); std::vector fluid_out; - std::vector trt_out(output_space_size); - engine_->GetOutputInCPU(output, &trt_out[0], output_space_size); - cudaStreamSynchronize(engine_->stream()); - auto* var = scope_.FindVar(output); - auto tensor = var->GetMutable(); + auto* tensor = var->GetMutable(); framework::TensorToVector(*tensor, ctx, &fluid_out); + fluid_outs.push_back(fluid_out); + } + + // Bind input and output for TRT. + const int num_bindings = input_output_names.size(); + std::vector buffers(num_bindings); + + for (const std::string& name : input_output_names) { + auto* var = scope_.FindVar(name); + auto* tensor = var->GetMutable(); + const int bind_index = engine_->engine()->getBindingIndex(name.c_str()); + buffers[bind_index] = + static_cast(tensor->mutable_data(place_)); + } + + // Execute TRT. + engine_->Execute(batch_size, &buffers, stream_); - size_t fluid_out_size = fluid_out.size(); + ASSERT_FALSE(op_desc_->OutputArgumentNames().empty()); + int index = 0; + for (const auto& output : op_desc_->OutputArgumentNames()) { + if (neglected_output.count(output)) continue; + std::vector trt_out; + auto* var = scope_.FindVar(output); + auto* tensor = var->GetMutable(); + framework::TensorToVector(*tensor, ctx, &trt_out); + + size_t fluid_out_size = fluid_outs[index].size(); if (if_add_batch_ == true) { fluid_out_size = batch_size * (framework::product(tensor->dims()) / max_batch_size_); } - // Compare two output - ASSERT_FALSE(fluid_out.empty()); + for (size_t i = 0; i < fluid_out_size; i++) { // Loose the threshold for CI in different machine model. - EXPECT_LT(std::abs(fluid_out[i] - trt_out[i]), 2e-5); + EXPECT_LT(std::abs(fluid_outs[index][i] - trt_out[i]), 2e-5); } + index += 1; } } framework::Scope& scope() { return scope_; } private: + platform::CUDAPlace place_; std::unique_ptr engine_; cudaStream_t stream_; std::unique_ptr op_; diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 10f48462cfaf80..fddf5f11c285da 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -32,36 +32,18 @@ void TensorRTEngine::Build(const DescType &paddle_model) { PADDLE_ENFORCE(false, "not implemented"); } -void TensorRTEngine::Execute(int batch_size) { +void TensorRTEngine::Execute(int batch_size, std::vector *buffers, + cudaStream_t stream) { freshDeviceId(); batch_size_ = batch_size; - std::vector buffers; - for (auto &buf : buffers_) { - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated"); - PADDLE_ENFORCE_GT(buf.max_size, 0); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - buffers.push_back(buf.buffer); - } - infer_context_->enqueue(batch_size, buffers.data(), stream_, nullptr); - cudaStreamSynchronize(stream_); + infer_context_->enqueue(batch_size, buffers->data(), stream, nullptr); + cudaStreamSynchronize(stream); SetRuntimeBatch(batch_size); } -TensorRTEngine::~TensorRTEngine() { - cudaStreamSynchronize(stream_); - // clean buffer - for (auto &buf : buffers_) { - if (buf.device == DeviceType::GPU && buf.buffer != nullptr) { - PADDLE_ENFORCE_EQ(0, cudaFree(buf.buffer)); - buf.buffer = nullptr; - buf.max_size = 0; - } - } -} - void TensorRTEngine::FreezeNetwork() { - VLOG(3) << "TRT to freeze network"; freshDeviceId(); + VLOG(3) << "TRT to freeze network"; PADDLE_ENFORCE(infer_builder_ != nullptr, "Call InitNetwork first to initialize network."); PADDLE_ENFORCE(infer_network_ != nullptr, @@ -81,30 +63,6 @@ void TensorRTEngine::FreezeNetwork() { PADDLE_ENFORCE(infer_engine_ != nullptr, "build cuda engine failed!"); infer_context_.reset(infer_engine_->createExecutionContext()); - - // allocate GPU buffers. - buffers_.resize(buffer_sizes_.size()); - for (auto &item : buffer_sizes_) { - // The output buffers are not set in the network building phrase, need to - // infer from the TesorRT network. - if (item.second == 0) { - auto slot_offset = infer_engine_->getBindingIndex(item.first.c_str()); - auto dims = infer_engine_->getBindingDimensions(slot_offset); - item.second = kDataTypeSize[static_cast( - infer_engine_->getBindingDataType(slot_offset))] * - analysis::AccuDims(dims.d, dims.nbDims) * max_batch_; - PADDLE_ENFORCE_GT(item.second, 0); - } - - auto &buf = buffer(item.first); - buf.max_size = item.second * max_batch_; - CHECK(buf.buffer == nullptr); // buffer should be allocated only once. - - PADDLE_ENFORCE_EQ(0, cudaMalloc(&buf.buffer, item.second * max_batch_)); - buf.size = 0; - PADDLE_ENFORCE_LE(buf.max_size, 1 << 30); // 10G - buf.device = DeviceType::GPU; - } } nvinfer1::ITensor *TensorRTEngine::DeclareInput(const std::string &name, @@ -158,83 +116,6 @@ void TensorRTEngine::DeclareOutput(const std::string &name) { buffer_sizes_[name] = 0; } -void *TensorRTEngine::GetOutputInGPU(const std::string &name) { - return buffer(name).buffer; -} - -void TensorRTEngine::GetOutputInGPU(const std::string &name, void *dst, - size_t max_size) { - // determine data size - auto *output = TensorRTEngine::GetITensor(name); - nvinfer1::Dims dims = output->getDimensions(); - auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); - size_t dst_size = dim_size * runtime_batch_ * - kDataTypeSize[static_cast(output->getType())]; - - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_LE(dst_size, it->second); - PADDLE_ENFORCE_GE(max_size, dst_size); - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToDevice, stream_), - 0); -} - -void TensorRTEngine::GetOutputInCPU(const std::string &name, void *dst, - size_t max_size) { - // determine data size - - auto *output = TensorRTEngine::GetITensor(name); - nvinfer1::Dims dims = output->getDimensions(); - auto dim_size = analysis::AccuDims(dims.d, dims.nbDims); - size_t dst_size = dim_size * runtime_batch_ * - kDataTypeSize[static_cast(output->getType())]; - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end()); - PADDLE_ENFORCE_GT(it->second, 0); - PADDLE_ENFORCE_LE(dst_size, it->second); - PADDLE_ENFORCE_GE(max_size, dst_size); - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer, "buffer should be allocated before"); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(dst, buf.buffer, dst_size, - cudaMemcpyDeviceToHost, stream_)); -} - -Buffer &TensorRTEngine::buffer(const std::string &name) { - PADDLE_ENFORCE(infer_engine_ != nullptr, "call FreezeNetwork first."); - auto it = buffer_sizes_.find(name); - PADDLE_ENFORCE(it != buffer_sizes_.end(), "tried to access buffer named %s", - name); - auto slot_offset = infer_engine_->getBindingIndex(name.c_str()); - return buffers_[slot_offset]; -} - -void TensorRTEngine::SetInputFromCPU(const std::string &name, const void *data, - size_t size) { - auto &buf = buffer(name); - PADDLE_ENFORCE_NOT_NULL(buf.buffer); - PADDLE_ENFORCE_NOT_NULL(data); - PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - buf.size = size; - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyHostToDevice, stream_)); -} - -void TensorRTEngine::SetInputFromGPU(const std::string &name, const void *data, - size_t size) { - auto &buf = buffer(name); - buf.size = size; - PADDLE_ENFORCE_NOT_NULL(buf.buffer); - PADDLE_ENFORCE_LE(size, buf.max_size, "buffer is too small"); - PADDLE_ENFORCE(buf.device == DeviceType::GPU); - PADDLE_ENFORCE_EQ(0, cudaMemcpyAsync(buf.buffer, data, size, - cudaMemcpyDeviceToDevice, stream_)); -} - void TensorRTEngine::SetITensor(const std::string &name, nvinfer1::ITensor *tensor) { PADDLE_ENFORCE(tensor != nullptr); @@ -254,13 +135,6 @@ void TensorRTEngine::SetRuntimeBatch(size_t batch_size) { int TensorRTEngine::GetRuntimeBatch() { return runtime_batch_; } -void TensorRTEngine::freshDeviceId() { - int count; - cudaGetDeviceCount(&count); - PADDLE_ENFORCE_LT(device_, count); - cudaSetDevice(device_); -} - nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( nvinfer1::ITensor *const *inputs, int num_inputs, plugin::PluginTensorRT *plugin) { @@ -268,6 +142,13 @@ nvinfer1::IPluginLayer *TensorRTEngine::AddPlugin( return infer_network_.get()->addPluginExt(inputs, num_inputs, *plugin); } +void TensorRTEngine::freshDeviceId() { + int count; + cudaGetDeviceCount(&count); + PADDLE_ENFORCE_LT(device_id_, count); + cudaSetDevice(device_id_); +} + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index cdfe09b5a7fd2d..657dfd9355f9e3 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/inference/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -37,7 +38,9 @@ class TRTInt8Calibrator; * There are two alternative ways to use it, one is to build from a paddle * protobuf model, another way is to manully construct the network. */ -class TensorRTEngine : public EngineBase { +class TensorRTEngine { + using DescType = ::paddle::framework::proto::BlockDesc; + public: // Weight is model parameter. class Weight { @@ -56,28 +59,28 @@ class TensorRTEngine : public EngineBase { nvinfer1::Weights w_; }; - TensorRTEngine(int max_batch, int max_workspace, cudaStream_t stream, - int device = 0, bool enable_int8 = false, - TRTInt8Calibrator* calibrator = nullptr, + TensorRTEngine(int max_batch, int max_workspace, bool enable_int8 = false, + TRTInt8Calibrator* calibrator = nullptr, int device_id = 0, nvinfer1::ILogger& logger = NaiveLogger::Global()) : max_batch_(max_batch), max_workspace_(max_workspace), - stream_(stream), - device_(device), enable_int8_(enable_int8), calibrator_(calibrator), + device_id_(device_id), logger_(logger) {} - virtual ~TensorRTEngine(); + ~TensorRTEngine() {} // TODO(Superjomn) implement it later when graph segmentation is supported. - void Build(const DescType& paddle_model) override; + void Build(const DescType& paddle_model); - void Execute(int batch_size) override; + void Execute(int batch_size, std::vector* buffers, + cudaStream_t stream); // Initialize the inference network, so that TensorRT layers can add to this // network. void InitNetwork() { + freshDeviceId(); infer_builder_.reset(createInferBuilder(&logger_)); infer_network_.reset(infer_builder_->createNetwork()); } @@ -98,37 +101,34 @@ class TensorRTEngine : public EngineBase { // Check if the ITensor has been declared bool HasDeclared(const std::string& name); - // GPU memory address for an ITensor with specific name. One can operate on - // these memory directly for acceleration, for example, output the converted - // data directly to the buffer to save data copy overhead. - // NOTE this should be used after calling `FreezeNetwork`. - Buffer& buffer(const std::string& name) override; - - cudaStream_t stream() { return stream_; } - - // Fill an input from CPU memory with name and size. - void SetInputFromCPU(const std::string& name, const void* data, size_t size); - // TODO(Superjomn) is this method necessary given that buffer(xxx) can be - // accessed directly. Fill an input from GPU memory with name and size. - void SetInputFromGPU(const std::string& name, const void* data, size_t size); - // Get an output called name, the output of tensorrt is in GPU, so this method - // Return the output's GPU memory address without copy. - void* GetOutputInGPU(const std::string& name); - // Copy data into dst inside the GPU device. - void GetOutputInGPU(const std::string& name, void* dst, size_t max_size); - // LOW EFFICENCY! Get output to CPU, this will trigger a memory copy from GPU - // to CPU. - void GetOutputInCPU(const std::string& name, void* dst, size_t max_size); - // Fill an ITensor into map itensor_map_. void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); // Get an ITensor called name. nvinfer1::ITensor* GetITensor(const std::string& name); nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::INetworkDefinition* network() { return infer_network_.get(); } + + nvinfer1::IHostMemory* Serialize() { + PADDLE_ENFORCE(infer_engine_ != nullptr, + "You should build engine first and then serialize"); + ihost_memory_.reset(infer_engine_->serialize()); + return ihost_memory_.get(); + } + + void Deserialize(const std::string& engine_serialized_data) { + freshDeviceId(); + infer_ptr runtime(createInferRuntime(&logger_)); + infer_engine_.reset(runtime->deserializeCudaEngine( + engine_serialized_data.c_str(), engine_serialized_data.size(), + &inference::Singleton::Global())); + PADDLE_ENFORCE(infer_engine_ != nullptr, + "build cuda engine failed when deserialize engine info.!"); + infer_context_.reset(infer_engine_->createExecutionContext()); + } + void SetRuntimeBatch(size_t batch_size); int GetRuntimeBatch(); - int GetDevice() { return device_; } + int GetDeviceId() { return device_id_; } nvinfer1::IPluginLayer* AddPlugin(nvinfer1::ITensor* const* inputs, int num_inputs, plugin::PluginTensorRT*); @@ -140,17 +140,12 @@ class TensorRTEngine : public EngineBase { std::unordered_map> weight_map; - // TODO(NHZLX) - // In the normal case, the paddle-trt exists bug when runing the googlenet. - // When there are more than two convolutions of 1 * 1 with the same input, the - // paddle-tensorrt will do the merging optimization, which fuse those conv - // into one conv, and then trigger bug. So, We should use strategy to avoid - // this - // optimization for the time being. This bug will be fixed in the future. - std::unordered_map - itensor_quote_num; - private: + // Each ICudaEngine object is bound to a specific GPU when it is instantiated, + // ensure that the thread is associated with the correct device by calling + // freshDeviceId(). + void freshDeviceId(); + // the max batch size int max_batch_; // the runtime batch size @@ -158,18 +153,14 @@ class TensorRTEngine : public EngineBase { // the max memory size the engine uses int max_workspace_; - cudaStream_t stream_; - // The specific GPU id that the TensorRTEngine bounded to. - int device_; - bool enable_int8_; TRTInt8Calibrator* calibrator_; // batch size of the current data, will be updated each Executation. int batch_size_{-1}; + int device_id_; nvinfer1::ILogger& logger_; - std::vector buffers_; // max data size for the buffers. std::unordered_map buffer_sizes_; std::unordered_map @@ -192,15 +183,11 @@ class TensorRTEngine : public EngineBase { infer_ptr infer_network_; infer_ptr infer_engine_; infer_ptr infer_context_; - // Each ICudaEngine object is bound to a specific GPU when it is instantiated, - // ensure that the thread is associated with the correct device by calling - // freshDeviceId(). - void freshDeviceId(); + infer_ptr ihost_memory_; }; // class TensorRTEngine // Add an layer__ into engine__ with args ARGS. // For example: -// TRT_ENGINE_ADD_LAYER(xxx, FullyConnected, input, dim, weights, bias) // // Reference // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#charRNN_define_network diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index fc7ca7714e9325..010942a0678fe9 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -17,6 +17,9 @@ #include #include #include +#include +#include +#include #include "paddle/fluid/platform/dynload/tensorrt.h" #include "paddle/fluid/platform/enforce.h" @@ -74,6 +77,32 @@ class NaiveLogger : public nvinfer1::ILogger { ~NaiveLogger() override {} }; +class NaiveProfiler : public nvinfer1::IProfiler { + public: + typedef std::pair Record; + std::vector mProfile; + + virtual void reportLayerTime(const char* layerName, float ms) { + auto record = + std::find_if(mProfile.begin(), mProfile.end(), + [&](const Record& r) { return r.first == layerName; }); + if (record == mProfile.end()) + mProfile.push_back(std::make_pair(layerName, ms)); + else + record->second += ms; + } + + void printLayerTimes() { + float totalTime = 0; + for (size_t i = 0; i < mProfile.size(); i++) { + printf("%-40.40s %4.3fms\n", mProfile[i].first.c_str(), + mProfile[i].second); + totalTime += mProfile[i].second; + } + printf("Time over all layers: %4.3f\n", totalTime); + } +}; + } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 95443e813327c1..709aa103d1b668 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,4 +1,5 @@ nv_library(tensorrt_plugin - SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu + SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu + prelu_op_plugin.cu trt_plugin_factory.cc avg_pool_op_plugin.cu DEPS enforce tensorrt_engine prelu) diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu index 5d747af8c55d71..f27a838162c89b 100644 --- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/pooling.h" namespace paddle { @@ -20,6 +21,12 @@ namespace inference { namespace tensorrt { namespace plugin { +AvgPoolPlugin* CreateAvgPoolPluginDeserialize(const void* buffer, + size_t length) { + return new AvgPoolPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("avg_pool_plugin", CreateAvgPoolPluginDeserialize); + nvinfer1::Dims AvgPoolPlugin::getOutputDimensions( int index, const nvinfer1::Dims* inputDims, int nbInputs) { assert(nbInputs == 1); diff --git a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h index b5e4ece0fba446..a7c0aa5794e6bb 100644 --- a/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/avg_pool_op_plugin.h @@ -33,24 +33,27 @@ class AvgPoolPlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(ceil_mode_) + SerializedSize(ksize_) + - SerializedSize(strides_) + SerializedSize(paddings_) + - SerializedSize(input_shape_) + getBaseSerializationSize(); + return SerializedSize(getPluginType()) + SerializedSize(ceil_mode_) + + SerializedSize(ksize_) + SerializedSize(strides_) + + SerializedSize(paddings_) + SerializedSize(input_shape_) + + SerializedSize(output_shape_) + getBaseSerializationSize(); } // TRT will call this func when we need to serialize the configuration of // tensorrt. - // It should not be called by users. void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, ceil_mode_); SerializeValue(&buffer, ksize_); SerializeValue(&buffer, strides_); SerializeValue(&buffer, paddings_); SerializeValue(&buffer, input_shape_); + SerializeValue(&buffer, output_shape_); } public: + AvgPoolPlugin() {} AvgPoolPlugin(bool ceil_mode, std::vector ksize, std::vector strides, std::vector paddings, std::vector input_shape) @@ -89,6 +92,7 @@ class AvgPoolPlugin : public PluginTensorRT { DeserializeValue(&serialData, &serialLength, &strides_); DeserializeValue(&serialData, &serialLength, &paddings_); DeserializeValue(&serialData, &serialLength, &input_shape_); + DeserializeValue(&serialData, &serialLength, &output_shape_); } AvgPoolPlugin *clone() const override { @@ -96,7 +100,7 @@ class AvgPoolPlugin : public PluginTensorRT { input_shape_); } - const char *getPluginType() const override { return "avg_pool"; } + const char *getPluginType() const override { return "avg_pool_plugin"; } int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, int nbInputDims) override; diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 9cd9026b732808..9aed3ddab1448f 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -14,12 +14,19 @@ limitations under the License. */ #include #include "paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { +ElementWisePlugin* CreateElementWisePluginDeserialize(const void* buffer, + size_t length) { + return new ElementWisePlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("elementwise_plugin", CreateElementWisePluginDeserialize); + namespace details { template @@ -119,10 +126,10 @@ int ElementWisePlugin::enqueue(int batch_size, const void* const* inputs, const float* y = reinterpret_cast(inputs[1]); float* out = reinterpret_cast(outputs[0]); - if (type_ == nvinfer1::ElementWiseOperation::kSUM) { + if (type_ == "add") { details::ElementWise(details::Add(), x, y, out, batch_size, prev_size_, midd_size_, post_size_, stream); - } else if (type_ == nvinfer1::ElementWiseOperation::kPROD) { + } else if (type_ == "mul") { details::ElementWise(details::Mul(), x, y, out, batch_size, prev_size_, midd_size_, post_size_, stream); } else { diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h index 9c461f7a5c44eb..3b040f14c531c5 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -24,9 +25,8 @@ namespace plugin { class ElementWisePlugin : public PluginTensorRT { public: - ElementWisePlugin(nvinfer1::ElementWiseOperation type, - nvinfer1::Dims const &dims_x, nvinfer1::Dims const &dims_y, - int axis) + ElementWisePlugin(std::string type, nvinfer1::Dims const &dims_x, + nvinfer1::Dims const &dims_y, int axis) : type_(type), dims_x_(dims_x), dims_y_(dims_y), @@ -37,6 +37,9 @@ class ElementWisePlugin : public PluginTensorRT { ElementWisePlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); + const char *elementwise_type; + DeserializeValue(&serial_data, &serial_length, &elementwise_type); + type_ = std::string(elementwise_type); DeserializeValue(&serial_data, &serial_length, &axis_); DeserializeValue(&serial_data, &serial_length, &dims_x_); DeserializeValue(&serial_data, &serial_length, &dims_y_); @@ -47,7 +50,7 @@ class ElementWisePlugin : public PluginTensorRT { return nullptr; } - const char *getPluginType() const override { return "elementwise"; } + const char *getPluginType() const override { return "elementwise_plugin"; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *input_dims, @@ -61,18 +64,21 @@ class ElementWisePlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(axis_) + SerializedSize(dims_x_) + - SerializedSize(dims_y_) + getBaseSerializationSize(); + return SerializedSize(getPluginType()) + SerializedSize(axis_) + + SerializedSize(dims_x_) + SerializedSize(dims_y_) + + getBaseSerializationSize(); } void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); + SerializeValue(&buffer, type_.c_str()); SerializeValue(&buffer, axis_); SerializeValue(&buffer, dims_x_); SerializeValue(&buffer, dims_y_); } - nvinfer1::ElementWiseOperation type_; + std::string type_; nvinfer1::Dims dims_x_; nvinfer1::Dims dims_y_; int axis_; diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index 3075e87ea6d719..b8a044fe99b918 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -17,6 +17,7 @@ #include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" #include "paddle/fluid/operators/math/prelu.h" namespace paddle { @@ -24,6 +25,17 @@ namespace inference { namespace tensorrt { namespace plugin { +PReluPlugin *CreatePreluPluginDeserialize(const void *buffer, size_t length) { + return new PReluPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("prelu_plugin", CreatePreluPluginDeserialize); + +int PReluPlugin::initialize() { + cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); + cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), + cudaMemcpyHostToDevice); +} + nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) { @@ -39,7 +51,8 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs, // input dims is CHW. const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); - const float *alpha = reinterpret_cast(alpha_.get().values); + // const float *alpha = reinterpret_cast(alpha_.get().values); + const float *alpha = p_gpu_weight_; float *output = reinterpret_cast(outputs)[0]; std::vector input_shape; diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h index 0db56a310b072e..a96649503f1c76 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h @@ -14,7 +14,12 @@ #pragma once +#include #include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" + #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -24,39 +29,51 @@ namespace tensorrt { namespace plugin { class PReluPlugin : public PluginTensorRT { - TensorRTEngine::Weight alpha_; + std::vector weight_; + float *p_gpu_weight_; std::string mode_; protected: size_t getSerializationSize() override { - // return getBaseSerializationSize(alpha_) + SerializedSize(mode_); - return 0; + return getBaseSerializationSize() + SerializedSize(mode_.c_str()) + + SerializedSize(weight_) + SerializedSize(getPluginType()); } // TRT will call this func when we need to serialize the configuration of // tensorrt. // It should not be called by users. void serialize(void *buffer) override { - // serializeBase(buffer); - // SerializeValue(&buffer, alpha_); - // SerializeValue(&buffer, mode_); + SerializeValue(&buffer, getPluginType()); + serializeBase(buffer); + SerializeValue(&buffer, weight_); + SerializeValue(&buffer, mode_.c_str()); } public: - PReluPlugin(TensorRTEngine::Weight const &alpha, std::string const &mode) - : alpha_(alpha), mode_(mode) {} + PReluPlugin(const float *weight, const int weight_num, + std::string const &mode) + : mode_(mode) { + weight_.resize(weight_num); + std::copy(weight, weight + weight_num, weight_.data()); + } // It was used for tensorrt deserialization. // It should not be called by users. PReluPlugin(void const *serialData, size_t serialLength) { - // deserializeBase(serialData, serialLength); - // DeserializeValue(&serialData, &serialLength, &alpha_); - // DeserializeValue(&serialData, &serialLength, &mode_); + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &weight_); + const char *prelu_mode; + DeserializeValue(&serialData, &serialLength, &prelu_mode); + mode_ = std::string(prelu_mode); } + ~PReluPlugin() { cudaFree(p_gpu_weight_); } + int initialize() override; - PReluPlugin *clone() const override { return new PReluPlugin(alpha_, mode_); } + PReluPlugin *clone() const override { + return new PReluPlugin(weight_.data(), weight_.size(), mode_); + } - const char *getPluginType() const override { return "prelu"; } + const char *getPluginType() const override { return "prelu_plugin"; } int getNbOutputs() const override { return 1; } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *inputs, int nbInputDims) override; diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index de61ace59e299a..b5503c3b95ee24 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -15,12 +15,18 @@ #include #include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { +SplitPlugin* CreateSplitPluginDeserialize(const void* buffer, size_t length) { + return new SplitPlugin(buffer, length); +} +REGISTER_TRT_PLUGIN("split_plugin", CreateSplitPluginDeserialize); + // copied from operators::math::SplitFunctor template __global__ void SplitKernel(const T* input_data, const int in_row, diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index 6f028d3d72ae3c..cbb72590567a35 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -15,6 +15,7 @@ #pragma once #include +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,6 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: + SplitPlugin() {} SplitPlugin(int axis, std::vector const &output_lengths) : axis_(axis), same_shape_(true), output_length_(output_lengths) {} @@ -38,7 +40,7 @@ class SplitPlugin : public PluginTensorRT { return new SplitPlugin(axis_, output_length_); } - const char *getPluginType() const override { return "split"; } + const char *getPluginType() const override { return "split_plugin"; } int getNbOutputs() const override { return output_length_.size(); } nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims *input_dims, @@ -50,11 +52,12 @@ class SplitPlugin : public PluginTensorRT { protected: size_t getSerializationSize() override { - return SerializedSize(axis_) + SerializedSize(output_length_) + - getBaseSerializationSize(); + return SerializedSize(getPluginType()) + SerializedSize(axis_) + + SerializedSize(output_length_) + getBaseSerializationSize(); } void serialize(void *buffer) override { + SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, axis_); SerializeValue(&buffer, output_length_); diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 86084829e150f8..3b737bd726ad09 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -17,9 +17,10 @@ #include #include #include +#include #include -#include "paddle/fluid/inference/tensorrt/plugin/serialize.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -30,6 +31,13 @@ namespace inference { namespace tensorrt { namespace plugin { +class PluginTensorRT; + +typedef std::function + PluginDeserializeFunc; + +typedef std::function PluginConstructFunc; + class PluginTensorRT : public nvinfer1::IPluginExt { public: PluginTensorRT() {} diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc new file mode 100644 index 00000000000000..3c20b6d1e72527 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.cc @@ -0,0 +1,48 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +PluginTensorRT* PluginFactoryTensorRT::createPlugin(const char* layer_name, + const void* serial_data, + size_t serial_length) { + const char* plugin_type; + DeserializeValue(&serial_data, &serial_length, &plugin_type); + + PADDLE_ENFORCE(Has(plugin_type), + "trt plugin type %s does not exists, check it.", plugin_type); + auto plugin = plugin_registry_[plugin_type](serial_data, serial_length); + owned_plugins_.emplace_back(plugin); + + return plugin; +} + +bool PluginFactoryTensorRT::RegisterPlugin( + const std::string& op_name, PluginDeserializeFunc deserialize_func) { + if (Has(op_name)) return false; + auto ret = plugin_registry_.emplace(op_name, deserialize_func); + return ret.second; +} + +void PluginFactoryTensorRT::DestroyPlugins() { owned_plugins_.clear(); } + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h new file mode 100644 index 00000000000000..139c75595f9f44 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h @@ -0,0 +1,78 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" +#include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class PluginFactoryTensorRT : public nvinfer1::IPluginFactory, + public DeleteHelper { + public: + // Deserialization method + PluginTensorRT* createPlugin(const char* layer_name, const void* serial_data, + size_t serial_length) override; + + bool RegisterPlugin(const std::string& op_name, + PluginDeserializeFunc deserialize_func); + + bool Has(const std::string& op_name) { + return plugin_registry_.find(op_name) != plugin_registry_.end(); + } + + void DestroyPlugins(); + + protected: + std::unordered_map plugin_registry_; + + std::list> owned_plugins_; +}; + +class TrtPluginRegistrar { + public: + TrtPluginRegistrar(const std::string& name, + PluginDeserializeFunc deserialize_func) { + inference::Singleton::Global().RegisterPlugin( + name, deserialize_func); + } +}; + +#define REGISTER_TRT_PLUGIN(name, deserialize_func) \ + REGISTER_TRT_PLUGIN_UNIQ(__COUNTER__, name, deserialize_func) + +#define REGISTER_TRT_PLUGIN_UNIQ(ctr, name, deserialize_func) \ + static paddle::inference::tensorrt::plugin::TrtPluginRegistrar \ + trt_plugin_registrar##ctr __attribute__((unused)) = \ + paddle::inference::tensorrt::plugin::TrtPluginRegistrar( \ + name, deserialize_func) + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/serialize.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h similarity index 96% rename from paddle/fluid/inference/tensorrt/plugin/serialize.h rename to paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h index ce859f16fc8747..1cae4ccae4cc59 100644 --- a/paddle/fluid/inference/tensorrt/plugin/serialize.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h @@ -13,8 +13,8 @@ // limitations under the License. #pragma once - #include +#include #include #include #include "paddle/fluid/platform/enforce.h" @@ -24,6 +24,13 @@ namespace inference { namespace tensorrt { namespace plugin { +// Some trt base classes lack of the destructor. +// We use a assisted class to fix this. +struct DeleteHelper { + protected: + virtual ~DeleteHelper() {} +}; + template inline void SerializeValue(void** buffer, T const& value); diff --git a/paddle/fluid/inference/tensorrt/test_engine.cc b/paddle/fluid/inference/tensorrt/test_engine.cc index 9eed0f6ee9ce4d..a03dd45db0f804 100644 --- a/paddle/fluid/inference/tensorrt/test_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_engine.cc @@ -17,6 +17,8 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/platform/enforce.h" @@ -27,19 +29,34 @@ namespace tensorrt { class TensorRTEngineTest : public ::testing::Test { protected: void SetUp() override { - ASSERT_EQ(0, cudaStreamCreate(&stream_)); - engine_ = new TensorRTEngine(10, 1 << 10, stream_); + ctx_ = new platform::CUDADeviceContext(platform::CUDAPlace(0)); + + engine_ = new TensorRTEngine(10, 1 << 10); engine_->InitNetwork(); } void TearDown() override { - delete engine_; - cudaStreamDestroy(stream_); + if (engine_) { + delete engine_; + engine_ = nullptr; + } + } + + void PrepareInputOutput(const std::vector &input, + std::vector output_shape) { + TensorFromVector(input, *ctx_, &input_); + output_.Resize(framework::make_ddim(output_shape)); + } + + void GetOutput(std::vector *output) { + TensorToVector(output_, *ctx_, output); } protected: - TensorRTEngine* engine_; - cudaStream_t stream_; + framework::Tensor input_; + framework::Tensor output_; + TensorRTEngine *engine_; + platform::CUDADeviceContext *ctx_; }; TEST_F(TensorRTEngineTest, add_layer) { @@ -48,12 +65,14 @@ TEST_F(TensorRTEngineTest, add_layer) { float raw_weight[size] = {2.}; // Weight in CPU memory. float raw_bias[size] = {3.}; + std::vector buffers(2); // TRT binded inputs + LOG(INFO) << "create weights"; TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, size); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, size); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{1, 1, 1}); - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, + auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, size, weight.get(), bias.get()); PADDLE_ENFORCE(fc_layer != nullptr); @@ -63,18 +82,24 @@ TEST_F(TensorRTEngineTest, add_layer) { ASSERT_EQ(engine_->engine()->getNbBindings(), 2); // fill in real data - float x_v = 1234; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 1 * sizeof(float)); + std::vector x_v = {1234}; + std::vector y_cpu; + PrepareInputOutput(x_v, {1}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + LOG(INFO) << "to execute"; - engine_->Execute(1); + engine_->Execute(1, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float y_cpu; - engine_->GetOutputInCPU("y", &y_cpu, 1 * sizeof(float)); + GetOutput(&y_cpu); LOG(INFO) << "to checkout output"; - ASSERT_EQ(y_cpu, x_v * 2 + 3); + ASSERT_EQ(y_cpu[0], x_v[0] * 2 + 3); } TEST_F(TensorRTEngineTest, add_layer_multi_dim) { @@ -83,12 +108,13 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { // instead of row-major, which is [[1.0, 1.1], [3.3, 4.4]] float raw_weight[4] = {1.0, 1.1, 3.3, 4.4}; float raw_bias[2] = {1.3, 2.4}; + std::vector buffers(2); // TRT binded inputs TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 4); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 2); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::DimsCHW{1, 2, 1}); - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, + auto *fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *x, 2, weight.get(), bias.get()); PADDLE_ENFORCE(fc_layer != nullptr); @@ -96,19 +122,27 @@ TEST_F(TensorRTEngineTest, add_layer_multi_dim) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[2] = {1.0, 2.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 2 * sizeof(float)); - engine_->Execute(1); + // fill in real data + std::vector x_v = {1.0, 2.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(1, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float y_cpu[2] = {-1., -1.}; + GetOutput(&y_cpu); auto dims = engine_->GetITensor("y")->getDimensions(); ASSERT_EQ(dims.nbDims, 3); ASSERT_EQ(dims.d[0], 2); ASSERT_EQ(dims.d[1], 1); - engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + ASSERT_EQ(y_cpu[0], 4.5); ASSERT_EQ(y_cpu[1], 14.5); } @@ -117,12 +151,13 @@ TEST_F(TensorRTEngineTest, test_conv2d) { // Weight in CPU memory. float raw_weight[9] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; float raw_bias[1] = {0}; + std::vector buffers(2); // TRT binded inputs TensorRTEngine::Weight weight(nvinfer1::DataType::kFLOAT, raw_weight, 9); TensorRTEngine::Weight bias(nvinfer1::DataType::kFLOAT, raw_bias, 1); - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 3, 3}); - auto* conv_layer = + auto *conv_layer = TRT_ENGINE_ADD_LAYER(engine_, Convolution, *x, 1, nvinfer1::DimsHW{3, 3}, weight.get(), bias.get()); PADDLE_ENFORCE(conv_layer != nullptr); @@ -133,28 +168,36 @@ TEST_F(TensorRTEngineTest, test_conv2d) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[18] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, - 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 18 * sizeof(float)); - engine_->Execute(2); + // fill in real data + std::vector x_v = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {18}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float* y_cpu = new float[18]; - engine_->GetOutputInCPU("y", &y_cpu[0], 18 * sizeof(float)); + GetOutput(&y_cpu); + ASSERT_EQ(y_cpu[0], 4.0); ASSERT_EQ(y_cpu[1], 6.0); } TEST_F(TensorRTEngineTest, test_pool2d) { // Weight in CPU memory. - auto* x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, + auto *x = engine_->DeclareInput("x", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3{1, 2, 2}); + std::vector buffers(2); // TRT binded inputs nvinfer1::PoolingType pool_t = nvinfer1::PoolingType::kAVERAGE; - auto* pool_layer = - TRT_ENGINE_ADD_LAYER(engine_, Pooling, *const_cast(x), - pool_t, nvinfer1::DimsHW{2, 2}); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *x, pool_t, + nvinfer1::DimsHW{2, 2}); PADDLE_ENFORCE(pool_layer != nullptr); pool_layer->setStride(nvinfer1::DimsHW{1, 1}); @@ -164,14 +207,21 @@ TEST_F(TensorRTEngineTest, test_pool2d) { engine_->FreezeNetwork(); ASSERT_EQ(engine_->engine()->getNbBindings(), 2); - float x_v[8] = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; - engine_->SetInputFromCPU("x", reinterpret_cast(&x_v), - 8 * sizeof(float)); - engine_->Execute(2); + // fill in real data + std::vector x_v = {1.0, 2.0, 5.0, 0.0, 2.0, 3.0, 5.0, 10.0}; + std::vector y_cpu; + PrepareInputOutput(x_v, {2}); + + auto *x_v_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_v_gpu_data); + buffers[1] = reinterpret_cast(y_gpu_data); + + engine_->Execute(2, &buffers, ctx_->stream()); LOG(INFO) << "to get output"; - float* y_cpu = new float[2]; - engine_->GetOutputInCPU("y", &y_cpu[0], 2 * sizeof(float)); + GetOutput(&y_cpu); ASSERT_EQ(y_cpu[0], 2.0); ASSERT_EQ(y_cpu[1], 5.0); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 17a433c9d98768..cb668a4174134b 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -54,7 +54,8 @@ void SetConfig(AnalysisConfig* config, std::string model_dir, if (use_gpu) { config->EnableUseGpu(100, 0); if (use_tensorrt) { - config->EnableTensorRtEngine(1 << 10, batch_size); + config->EnableTensorRtEngine(1 << 10, batch_size, 3, + AnalysisConfig::Precision::kFloat32, false); config->pass_builder()->DeletePass("conv_bn_fuse_pass"); config->pass_builder()->DeletePass("fc_fuse_pass"); config->pass_builder()->TurnOnDebug(); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc index 031335009b692f..a8c86de9f9a1ae 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.cc @@ -30,6 +30,9 @@ class TensorRTEngineOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Ys", "A list of outputs").AsDuplicable(); AddAttr("subgraph", "the subgraph."); AddAttr("calibration_data", "the calibration data for int8"); + AddAttr( + "engine_serialized_data", + "the serialized data contains the all info of the ICUDAEngine"); AddAttr( "engine_key", "The engine_key here is used to distinguish different TRT Engines"); diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 2ff35c7c6ac640..c3667331248973 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -16,8 +16,10 @@ #ifdef PADDLE_WITH_CUDA +#include #include #include +#include #include #include "paddle/fluid/framework/executor.h" @@ -31,37 +33,6 @@ namespace paddle { namespace operators { -using FluidDT = framework::proto::VarType_Type; -using TRT_DT = nvinfer1::DataType; - -namespace { // NOLINT - -TRT_DT FluidDataType2TRT(FluidDT type) { - switch (type) { - case FluidDT::VarType_Type_FP32: - return TRT_DT::kFLOAT; - case FluidDT::VarType_Type_INT32: - return TRT_DT::kINT32; - default: - return TRT_DT::kINT32; - } - PADDLE_THROW("unkown type"); - return TRT_DT::kINT32; -} - -nvinfer1::Dims Vec2TRT_Dims(const std::vector &shape) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, - "TensorRT' tensor input requires at least 2 dimensions"); - PADDLE_ENFORCE_LE(shape.size(), 4UL, - "TensorRT' tensor input requires at most 4 dimensions"); - PADDLE_ENFORCE(shape.size() == 4UL || shape.size() == 2UL); - if (shape.size() == 4UL) - return nvinfer1::DimsCHW(shape[1], shape[2], shape[3]); - return nvinfer1::DimsCHW(shape[1], 1, 1); -} - -} // namespace // NOLINT - using inference::Singleton; using inference::tensorrt::TensorRTEngine; using inference::tensorrt::TRTInt8Calibrator; @@ -79,6 +50,7 @@ class TensorRTEngineOp : public framework::OperatorBase { bool enable_int8_; std::string calibration_data_; std::string engine_key_; + std::string engine_serialized_data_; bool calibration_mode_; public: @@ -93,6 +65,7 @@ class TensorRTEngineOp : public framework::OperatorBase { enable_int8_ = Attr("enable_int8"); calibration_data_ = Attr("calibration_data"); engine_key_ = Attr("engine_key"); + engine_serialized_data_ = Attr("engine_serialized_data"); auto params = Attr>("parameters"); for (const auto ¶m : params) { @@ -125,7 +98,8 @@ class TensorRTEngineOp : public framework::OperatorBase { RunCalibration(scope, dev_place); return; } - RunTrt(scope, dev_place); + auto *trt_engine = GetEngine(scope, dev_place); + RunTrt(scope, dev_place, trt_engine); } void RunCalibration(const framework::Scope &scope, @@ -136,10 +110,6 @@ class TensorRTEngineOp : public framework::OperatorBase { LOG_FIRST_N(INFO, 1) << "The TRT engine: " << engine_key_ << " is running calibration trt int8... "; int runtime_batch = 1; - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(dev_place); - auto stream = - reinterpret_cast(dev_ctx).stream(); if (!Singleton::Global().Has(engine_key_)) { TRTCalibratorEngine *calib_res = Singleton::Global().Create(engine_key_); @@ -156,11 +126,11 @@ class TensorRTEngineOp : public framework::OperatorBase { calib_buffers, runtime_batch, engine_key_, dev_place)); calib_res->thr_.reset(new std::thread([&]() { calib_res->engine_.reset(new TensorRTEngine( - max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device, enable_int8_, - calib_res->calib_.get())); + max_batch_size_, workspace_size_, enable_int8_, + calib_res->calib_.get(), + boost::get(dev_place).device)); VLOG(3) << "start the calib trt engine thread"; - Prepare(scope, dev_place, calib_res->engine_.get()); + PrepareTRTEngine(scope, calib_res->engine_.get()); })); } @@ -180,28 +150,29 @@ class TensorRTEngineOp : public framework::OperatorBase { RunNativeImpl(scope, dev_place); } - void RunTrt(const framework::Scope &scope, - const platform::Place &dev_place) const { + void RunTrt(const framework::Scope &scope, const platform::Place &dev_place, + TensorRTEngine *engine) const { int runtime_batch = 1; platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(dev_place); auto stream = reinterpret_cast(dev_ctx).stream(); - if (trt_engine_.get() == nullptr) { - trt_engine_.reset( - new TensorRTEngine(max_batch_size_, workspace_size_, stream, - boost::get(dev_place).device, - enable_int8_, calibrator_.get())); - Prepare(scope, dev_place, trt_engine_.get()); - } - auto *engine = trt_engine_.get(); PADDLE_ENFORCE(!input_names_.empty(), "should pass more than one inputs"); std::vector output_maps = Attr>("output_name_mapping"); - // Convert input tensor from fluid to engine. + int num_inputs = 0; + + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + num_inputs += 1; + } + const int num_bindings = num_inputs + Outputs("Ys").size(); + std::vector buffers(num_bindings); + + // Bind input tensor to TRT. for (const auto &x : Inputs("Xs")) { if (param_names_.count(x)) continue; // convert input and copy to TRT engine's buffer @@ -209,28 +180,20 @@ class TensorRTEngineOp : public framework::OperatorBase { inference::analysis::GetFromScope(scope, x); auto t_shape = framework::vectorize(t.dims()); runtime_batch = t_shape[0]; - if (platform::is_cpu_place(t.place())) { - engine->SetInputFromCPU(x, static_cast(t.data()), - t.memory_size()); - } else { - engine->SetInputFromGPU(x, static_cast(t.data()), - t.memory_size()); - } - } - cudaStreamSynchronize(stream); - PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); - // Execute the engine. - engine->Execute(runtime_batch); + const int bind_index = engine->engine()->getBindingIndex(x.c_str()); + PADDLE_ENFORCE(bind_index < num_bindings, + "The bind index should be less than num_bindings"); + buffers[bind_index] = static_cast(t.data()); + } - // Convert output tensor from engine to fluid + // Bind output tensor to TRT. int output_index = 0; VLOG(4) << "TensorRT Engine Op Outputs:"; for (const auto &y : Outputs("Ys")) { - VLOG(4) << y; - // convert output and copy to fluid. - nvinfer1::ITensor *trt_t = engine->GetITensor(output_maps[output_index]); - auto dims = trt_t->getDimensions(); + const int bind_index = + engine->engine()->getBindingIndex(output_maps[output_index].c_str()); + auto dims = engine->engine()->getBindingDimensions(bind_index); // Use the output ITensor's dims to reshape the Fluid Tensor. // The ITensor doesn't contain the batch size dim. std::vector ddim; @@ -238,71 +201,55 @@ class TensorRTEngineOp : public framework::OperatorBase { for (int i = 0; i < dims.nbDims; i++) { ddim.push_back(dims.d[i]); } - auto *fluid_v = scope.FindVar(y); PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y); auto *fluid_t = fluid_v->GetMutable(); - fluid_t->Resize(framework::make_ddim(ddim)); - // TODO(Superjomn) change this float to dtype size. - auto size = - inference::analysis::AccuDims(dims.d, dims.nbDims) * runtime_batch; - engine->GetOutputInGPU( - output_maps[output_index], - fluid_t->mutable_data(platform::CUDAPlace( - boost::get(dev_place).device)), - size * sizeof(float)); + PADDLE_ENFORCE(bind_index < num_bindings, + "The bind index should be less than num_bindings"); + buffers[bind_index] = static_cast(fluid_t->mutable_data( + boost::get(dev_place))); + output_index += 1; } + PADDLE_ENFORCE_LE(runtime_batch, max_batch_size_); + // Execute the engine. + engine->Execute(runtime_batch, &buffers, stream); cudaStreamSynchronize(stream); } - void Prepare(const framework::Scope &scope, const platform::Place &dev_place, - TensorRTEngine *engine) const { + TensorRTEngine *GetEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + if (!trt_engine_) { + trt_engine_.reset(new inference::tensorrt::TensorRTEngine( + max_batch_size_, workspace_size_, enable_int8_, calibrator_.get(), + boost::get(dev_place).device)); + if (!engine_serialized_data_.empty()) { + trt_engine_->Deserialize(engine_serialized_data_); + } else { + PrepareTRTEngine(scope, trt_engine_.get()); + } + } + return trt_engine_.get(); + } + + void PrepareTRTEngine(const framework::Scope &scope, + TensorRTEngine *engine) const { LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP " "kernel etc). This process may cost a lot of time."; - framework::proto::BlockDesc block_desc; - block_desc.ParseFromString(Attr("subgraph")); + framework::proto::BlockDesc block_proto; + block_proto.ParseFromString(Attr("subgraph")); + framework::BlockDesc block_desc(nullptr, &block_proto); - std::vector output_maps = + std::vector inputs = Inputs("Xs"); + std::vector outputs = Attr>("output_name_mapping"); - engine->InitNetwork(); - - framework::BlockDesc block(nullptr /*programdesc*/, &block_desc); - VLOG(4) << "parsed var size " << block.AllVars().size(); - // Add inputs - VLOG(4) << "declare inputs"; - for (auto &input : Inputs("Xs")) { - if (param_names_.count(input)) continue; - VLOG(4) << "declare input " << input; - - auto &t = - inference::analysis::GetFromScope(scope, input); - auto t_shape = framework::vectorize(t.dims()); - - auto *var = block.FindVar(input); - // TensorRT engine need to create parameters. The parameter's description - // should be set in - PADDLE_ENFORCE(var, "no variable called %s", input); - PADDLE_ENFORCE_EQ(var->GetType(), FluidDT::VarType_Type_LOD_TENSOR, - "TensorRT engine only takes LoDTensor as input"); - - engine->DeclareInput( - input, FluidDataType2TRT( - var->Proto()->type().lod_tensor().tensor().data_type()), - Vec2TRT_Dims(t_shape)); - } inference::Singleton::Global() - .ConvertBlock(block_desc, param_names_, scope, engine); - - // Add outputs - for (auto &output : output_maps) { - engine->DeclareOutput(output); - } - engine->FreezeNetwork(); + .ConvertBlockToTRTEngine(&block_desc, scope, inputs, param_names_, + outputs, engine); } }; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 5a3d9d2c1a3e81..e7ad2f4fe0c654 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -107,6 +107,7 @@ TEST(TensorRTEngineOp, manual) { engine_op_desc.SetAttr("output_name_mapping", std::vector({"z0"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); LOG(INFO) << "create engine op"; auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); @@ -202,6 +203,7 @@ void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { engine_op_desc.SetAttr("output_name_mapping", std::vector({"z3"})); engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 7db2bb451b4991..03c1b0bd092181 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -221,7 +221,8 @@ void BindAnalysisConfig(py::module *m) { .def("enable_tensorrt_engine", &AnalysisConfig::EnableTensorRtEngine, py::arg("workspace_size") = 1 << 20, py::arg("max_batch_size") = 1, py::arg("min_subgraph_size") = 3, - py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, + py::arg("use_static") = true) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("switch_ir_debug", &AnalysisConfig::SwitchIrDebug, py::arg("x") = true)