Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2af763b
add tuned trt dynamic shape mode.
jiweibo Aug 11, 2021
96ee635
update io test
jiweibo Aug 11, 2021
122f108
add proto file
jiweibo Aug 11, 2021
99bd981
add python api.
jiweibo Aug 12, 2021
c3e8ab2
add ut
jiweibo Aug 13, 2021
450cf95
fix initial error
jiweibo Aug 13, 2021
fdf4650
update ShapeInfo to ShapeRangeInfo
jiweibo Aug 16, 2021
59531f8
improve ci coverage.
jiweibo Aug 16, 2021
6efc679
fix failed ut
jiweibo Aug 17, 2021
e575abc
support serialize when rebuild engine
jiweibo Aug 18, 2021
a5bdc92
fix engine_op_test
jiweibo Aug 19, 2021
abf72d9
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Aug 23, 2021
300804b
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Aug 23, 2021
8fade9b
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Aug 30, 2021
8abb25a
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Aug 31, 2021
f4614a9
add ut.
jiweibo Sep 1, 2021
d4576f7
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Sep 1, 2021
0f58860
fix ut error.
jiweibo Sep 1, 2021
27fd96c
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Sep 4, 2021
23b2a85
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Sep 6, 2021
309e332
add ut.
jiweibo Sep 6, 2021
dcfc7cb
add trt engine ut.
jiweibo Sep 7, 2021
315aab9
fix compile error
jiweibo Sep 7, 2021
7850fa2
skip ci error
jiweibo Sep 8, 2021
b7254e1
update tests.
jiweibo Sep 9, 2021
e615f38
update test
jiweibo Sep 9, 2021
c3df13f
fix ut error.
jiweibo Sep 9, 2021
00a1833
update
jiweibo Sep 10, 2021
853f024
Merge branch 'develop' into tuned_dynamic_shape
jiweibo Sep 10, 2021
8085ff1
update
jiweibo Sep 10, 2021
d816963
Merge branch 'tuned_dynamic_shape' of https://github.com/jiweibo/Padd…
jiweibo Sep 10, 2021
4a871e4
fix compile error
jiweibo Sep 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions paddle/fluid/inference/analysis/argument.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,12 @@ struct Argument {
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
TensorRtShapeRangeInfoPath, std::string);
DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,
bool);
DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
TensorRtAllowBuildAtRuntime, bool);

DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
Expand Down
18 changes: 13 additions & 5 deletions paddle/fluid/inference/analysis/ir_pass_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,24 +146,32 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine", new bool(use_static_engine));
pass->Set("model_from_memory", new bool(argument->model_from_memory()));

// tuned trt dynamic_shape
pass->Set("trt_shape_range_info_path",
new std::string(argument->tensorrt_shape_range_info_path()));
pass->Set("trt_tuned_dynamic_shape",
new bool(argument->tensorrt_tuned_dynamic_shape()));
pass->Set("trt_allow_build_at_runtime",
new bool(argument->tensorrt_allow_build_at_runtime()));
pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
argument->max_input_shape()));
pass->Set("min_input_shape", new std::map<std::string, std::vector<int>>(
argument->min_input_shape()));
pass->Set("optim_input_shape",
new std::map<std::string, std::vector<int>>(
argument->optim_input_shape()));
bool with_dynamic_shape = argument->max_input_shape().size() > 0 &&
argument->min_input_shape().size() > 0 &&
argument->optim_input_shape().size() > 0;
bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
argument->min_input_shape().size() > 0 &&
argument->optim_input_shape().size() > 0) ||
argument->tensorrt_tuned_dynamic_shape();
pass->Set("with_dynamic_shape", new bool(with_dynamic_shape));
pass->Set("trt_disabled_ops", new std::vector<std::string>(
argument->tensorrt_disabled_ops()));
pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
// Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
// not
// run fp16.
// not run fp16.
pass->Set("disable_trt_plugin_fp16",
new bool(argument->disable_trt_plugin_fp16()));
} else if (pass_name == "dlnne_subgraph_pass") {
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cc_library(subgraph_util SRCS subgraph_util.cc DEPS subgraph_detector)

if (WITH_GPU AND TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller infer_io_utils)

set(analysis_deps ${analysis_deps}
subgraph_util tensorrt_subgraph_pass
Expand Down
40 changes: 29 additions & 11 deletions paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/inference/utils/io_utils.h"

namespace paddle {
namespace inference {
Expand Down Expand Up @@ -197,6 +198,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
auto opt_input_shape =
Get<std::map<std::string, std::vector<int>>>("optim_input_shape");

auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
int max_batch_size = Get<int>("max_batch_size");
if (trt_tuned_dynamic_shape) {
VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path;
inference::DeserializeShapeRangeInfo(shape_range_info_path,
&min_input_shape, &max_input_shape,
&opt_input_shape);
}

// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
// Why we do this?
Expand Down Expand Up @@ -242,12 +254,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp(

op_desc->SetBlockAttr("sub_block", new_block);
op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
op_desc->SetAttr("max_batch_size", Get<int>("max_batch_size"));
op_desc->SetAttr("max_batch_size", max_batch_size);
op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
op_desc->SetAttr("output_name_mapping", output_mapping);
op_desc->SetAttr("origin_output_dims", renamed_output_dims);
op_desc->SetAttr("parameters", params);
op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
op_desc->SetAttr("shape_range_info_path", shape_range_info_path);

// we record all inputs' shapes in attr to check if they are consistent
// with the real inputs' shapes retrieved from scope when trt runs.
Expand All @@ -259,19 +273,24 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
}

auto use_static_engine = Get<bool>("use_static_engine");
op_desc->SetAttr("use_static_engine", use_static_engine);
if (use_static_engine)
op_desc->SetAttr("model_opt_cache_dir",
Get<std::string>("model_opt_cache_dir"));

// TODO(NHZlX)
// There are models with the same structure but the different parameters,
// when running in the 'use_serialize' mode, there is a bug.
// serialization is affected by max_batch_size, but calibration is not.
// So we use separate engine keys in serialization and calibration.
auto engine_key = GenerateEngineKey(
input_names_with_id, output_names_with_id, std::to_string(0),
std::to_string(Get<int>("max_batch_size")),
std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)), false);
auto calibration_engine_key = GenerateEngineKey(
input_names_with_id, output_names_with_id, std::to_string(0),
std::to_string(Get<int>("max_batch_size")),
std::to_string(static_cast<int>(precision_mode)), true);
auto calibration_engine_key =
GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0), std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)), true);
auto predictor_id = Get<int>("predictor_id");

// Get "" when there is no cached calibration table data.
Expand Down Expand Up @@ -345,11 +364,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
bool disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
tensorrt::TensorRTEngine *trt_engine =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key + std::to_string(predictor_id),
Get<int>("max_batch_size"), Get<int>("workspace_size"),
precision_mode, calibrator.get(), Get<int>("gpu_device_id"),
min_input_shape, max_input_shape, opt_input_shape,
disable_trt_plugin_fp16);
.Create(engine_key + std::to_string(predictor_id), max_batch_size,
Get<int>("workspace_size"), precision_mode, calibrator.get(),
Get<int>("gpu_device_id"), min_input_shape, max_input_shape,
opt_input_shape, disable_trt_plugin_fp16);
trt_engine->SetUseOSS(Get<bool>("use_oss"));
trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
trt_engine->SetDLACore(Get<int>("trt_dla_core"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,17 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// We get all the vars from local_scope instead of the ProgramDesc.
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
bool reserve_cpu_weights = false;
if (argument->tensorrt_allow_build_at_runtime_valid() &&
argument->tensorrt_allow_build_at_runtime()) {
reserve_cpu_weights = true;
}
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
scope->EraseVars({var_name});
if (!reserve_cpu_weights) {
scope->EraseVars({var_name});
}
continue;
}
auto *var = scope->FindLocalVar(var_name);
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/inference/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
endif()

cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps}
zero_copy_tensor ir_pass_manager op_compatible_info)
zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)

cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)

Expand Down
49 changes: 47 additions & 2 deletions paddle/fluid/inference/api/analysis_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_use_static_engine_);
CP_MEMBER(trt_use_calib_mode_);
CP_MEMBER(trt_use_oss_);
CP_MEMBER(trt_tuned_dynamic_shape_);
CP_MEMBER(trt_allow_build_at_runtime_);
CP_MEMBER(collect_shape_range_info_);
CP_MEMBER(shape_range_info_path_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
Expand Down Expand Up @@ -653,8 +657,8 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}

void AnalysisConfig::EnableMemoryOptim() {
enable_memory_optim_ = true;
// Enable or disable the memory-optimization pass.
// x: true turns the optimization on, false turns it off; Update() re-derives
// the dependent pass configuration after the flag changes.
void AnalysisConfig::EnableMemoryOptim(bool x) {
  enable_memory_optim_ = x;
  Update();
}

Expand Down Expand Up @@ -783,6 +787,9 @@ std::string AnalysisConfig::Summary() {
// dynamic_shape
os.InsertRow({"tensorrt_enable_dynamic_shape",
min_input_shape_.empty() ? "false" : "true"});
os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_
? shape_range_info_path_
: "false"});

os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"});
os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"});
Expand Down Expand Up @@ -812,8 +819,46 @@ std::string AnalysisConfig::Summary() {
os.InsertRow({"memory_optim", enable_memory_optim_ ? "true" : "false"});
os.InsertRow({"enable_profile", with_profile_ ? "true" : "false"});
os.InsertRow({"enable_log", with_glog_info_ ? "true" : "false"});
os.InsertRow({"collect_shape_range_info",
collect_shape_range_info_ ? shape_range_info_path_ : "false"});

return os.PrintTable();
}

// Turn on shape-range collection mode: during inference the predictor records
// the runtime shapes of all intermediate tensors and, on destruction, writes
// the derived min/max/opt shapes to shape_range_info_path.
// shape_range_info_path: output file path; must be non-empty.
void AnalysisConfig::CollectShapeRangeInfo(
    const std::string &shape_range_info_path) {
  LOG(INFO) << "In CollectShapeRangeInfo mode, we will disable optimizations "
               "and collect the shape information of "
            << "all intermediate tensors in the compute graph and calculate "
               "the min_shape, max_shape and opt_shape.";
  // Validate the argument BEFORE mutating any state, so a failed call leaves
  // the config unchanged instead of half-enabled.
  PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false,
                    platform::errors::InvalidArgument(
                        "The shape_range_info_path should not be empty, please "
                        "re-check the argument."));
  collect_shape_range_info_ = true;
  shape_range_info_path_ = shape_range_info_path;
}

const std::string &AnalysisConfig::shape_range_info_path() {
return shape_range_info_path_;
}

bool AnalysisConfig::shape_range_info_collected() {
return collect_shape_range_info_;
}

// Switch TensorRT into tuned-dynamic-shape mode: min/max/opt shape ranges are
// loaded from a previously collected shape_range_info_path file instead of
// being specified manually.
// allow_build_at_runtime: if true, the engine may be rebuilt at runtime when
// an input shape falls outside the tuned ranges.
void AnalysisConfig::EnableTunedTensorRtDynamicShape(
    const std::string &shape_range_info_path, bool allow_build_at_runtime) {
  trt_tuned_dynamic_shape_ = true;
  trt_allow_build_at_runtime_ = allow_build_at_runtime;
  shape_range_info_path_ = shape_range_info_path;
}

bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
return trt_tuned_dynamic_shape_;
}

bool AnalysisConfig::trt_allow_build_at_runtime() {
return trt_allow_build_at_runtime_;
}
} // namespace paddle
90 changes: 90 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,17 @@
// limitations under the License.

#include "paddle/fluid/inference/api/analysis_predictor.h"

#include <glog/logging.h>

#include <algorithm>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/extension/include/ext_op_meta_info.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
Expand All @@ -34,6 +37,7 @@
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
Expand Down Expand Up @@ -570,6 +574,11 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetMaxInputShape(config_.max_input_shape_);
argument_.SetOptimInputShape(config_.optim_input_shape_);
argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path());
argument_.SetTensorRtTunedDynamicShape(
config_.tuned_tensorrt_dynamic_shape());
argument_.SetTensorRtAllowBuildAtRuntime(
config_.trt_allow_build_at_runtime());
}

if (config_.dlnne_enabled()) {
Expand Down Expand Up @@ -915,6 +924,11 @@ bool AnalysisPredictor::ZeroCopyRun() {
#endif

executor_->Run();

if (config_.shape_range_info_collected()) {
CollectShapeRangeInfo();
}

// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
tensor_array_batch_cleaner_.ResetTensorArray();
Expand All @@ -934,6 +948,78 @@ bool AnalysisPredictor::ZeroCopyRun() {
return true;
}

// Record the current runtime shape of every LoDTensor in the sub-scope,
// appending one shape sample per variable into shape_info_. The accumulated
// samples are later reduced to min/max/opt ranges (StatisticShapeRangeInfo).
void AnalysisPredictor::CollectShapeRangeInfo() {
  // if use gpu, sync first.
  // Shapes are read on the host, so the device stream must be drained to make
  // sure all kernels that may resize outputs have finished.
  if (config_.use_gpu()) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
    auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
        pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP
    hipStreamSynchronize(dev_ctx->stream());
#else
    cudaStreamSynchronize(dev_ctx->stream());
#endif
#endif
  }

  std::vector<std::string> var_names = sub_scope_->LocalVarNames();
  for (const auto &name : var_names) {
    auto *var = sub_scope_->GetVar(name);
    // Only dense LoDTensors carry a shape worth recording; skip everything
    // else (selected rows, readers, etc.).
    if (!var->IsType<framework::LoDTensor>()) {
      continue;
    }
    framework::DDim dim = var->Get<framework::LoDTensor>().dims();
    // Flatten the DDim into a plain int32 vector for serialization.
    std::vector<int32_t> shape(dim.size());
    for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
    shape_info_[name].emplace_back(shape);
  }
}

// Reduce the shape samples accumulated in shape_info_ to, per variable and
// per dimension: the minimum value, the maximum value, and the most frequent
// ("optimal") value, then serialize the three maps to the configured path.
void AnalysisPredictor::StatisticShapeRangeInfo() {
  std::map<std::string, std::vector<int32_t>> min_shapes;
  std::map<std::string, std::vector<int32_t>> max_shapes;
  std::map<std::string, std::vector<int32_t>> opt_shapes;
  // Iterate by const reference: the mapped value is a vector of shape
  // samples, which the original range-for copied on every iteration.
  for (const auto &it : shape_info_) {
    const auto &name = it.first;
    const auto &shapes = it.second;

    // Seed min/max/opt with the first recorded sample.
    std::vector<int32_t> min_shape(shapes[0].begin(), shapes[0].end());
    std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
    std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());

    // Most frequent dimension value; O(k) max_element instead of copying the
    // histogram into a vector and sorting it.
    auto ShapeMaxFreq = [](const std::map<int32_t, int32_t> &m) -> int32_t {
      auto best = std::max_element(
          m.begin(), m.end(),
          [](const std::pair<const int32_t, int32_t> &a,
             const std::pair<const int32_t, int32_t> &b) {
            return a.second < b.second;
          });
      return best->first;
    };

    for (size_t d = 0; d < shapes[0].size(); ++d) {
      std::map<int32_t, int32_t> counter;
      for (size_t i = 0; i < shapes.size(); ++i) {
        counter[shapes[i][d]] += 1;
        min_shape[d] = std::min(min_shape[d], shapes[i][d]);
        max_shape[d] = std::max(max_shape[d], shapes[i][d]);
      }
      opt_shape[d] = ShapeMaxFreq(counter);
    }

    min_shapes[name] = min_shape;
    max_shapes[name] = max_shape;
    opt_shapes[name] = opt_shape;
  }

  inference::SerializeShapeRangeInfo(config_.shape_range_info_path(),
                                     min_shapes, max_shapes, opt_shapes);
}

bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program
std::string filename;
Expand Down Expand Up @@ -1140,6 +1226,10 @@ AnalysisPredictor::~AnalysisPredictor() {
}
#endif

if (config_.shape_range_info_collected()) {
StatisticShapeRangeInfo();
}

memory::Release(place_);
}

Expand Down
Loading