Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
2af763b
add tuned trt dynamic shape mode.
jiweibo Aug 11, 2021
96ee635
update io test
jiweibo Aug 11, 2021
122f108
add proto file
jiweibo Aug 11, 2021
99bd981
add python api.
jiweibo Aug 12, 2021
c3e8ab2
add ut
jiweibo Aug 13, 2021
450cf95
fix initial error
jiweibo Aug 13, 2021
fdf4650
update ShapeInfo to ShapeRangeInfo
jiweibo Aug 16, 2021
59531f8
improve ci coverage.
jiweibo Aug 16, 2021
6efc679
fix failed ut
jiweibo Aug 17, 2021
e575abc
support serialize when rebuild engine
jiweibo Aug 18, 2021
a5bdc92
fix engine_op_test
jiweibo Aug 19, 2021
abf72d9
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Aug 23, 2021
300804b
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Aug 23, 2021
8fade9b
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Aug 30, 2021
8abb25a
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Aug 31, 2021
f4614a9
add ut.
jiweibo Sep 1, 2021
d4576f7
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Sep 1, 2021
0f58860
fix ut error.
jiweibo Sep 1, 2021
27fd96c
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Sep 4, 2021
23b2a85
Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into…
jiweibo Sep 6, 2021
309e332
add ut.
jiweibo Sep 6, 2021
dcfc7cb
add trt engine ut.
jiweibo Sep 7, 2021
315aab9
fix compile error
jiweibo Sep 7, 2021
7850fa2
skip ci error
jiweibo Sep 8, 2021
b7254e1
update tests.
jiweibo Sep 9, 2021
e615f38
update test
jiweibo Sep 9, 2021
c3df13f
fix ut error.
jiweibo Sep 9, 2021
00a1833
update
jiweibo Sep 10, 2021
853f024
Merge branch 'develop' into tuned_dynamic_shape
jiweibo Sep 10, 2021
8085ff1
update
jiweibo Sep 10, 2021
d816963
Merge branch 'tuned_dynamic_shape' of https://github.com/jiweibo/Padd…
jiweibo Sep 10, 2021
4a871e4
fix compile error
jiweibo Sep 10, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions paddle/fluid/inference/analysis/argument.h
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,12 @@ struct Argument {
bool);
DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool);
DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool);
DECL_ARGUMENT_FIELD(tensorrt_shape_range_info_path,
TensorRtShapeRangeInfoPath, std::string);
DECL_ARGUMENT_FIELD(tensorrt_tuned_dynamic_shape, TensorRtTunedDynamicShape,
bool);
DECL_ARGUMENT_FIELD(tensorrt_allow_build_at_runtime,
TensorRtAllowBuildAtRuntime, bool);

DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool);
DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int);
Expand Down
18 changes: 13 additions & 5 deletions paddle/fluid/inference/analysis/ir_pass_manager.cc
Original file line number Diff line number Diff line change
Expand Up @@ -146,24 +146,32 @@ void IRPassManager::CreatePasses(Argument *argument,
pass->Set("gpu_device_id", new int(argument->gpu_device_id()));
pass->Set("use_static_engine", new bool(use_static_engine));
pass->Set("model_from_memory", new bool(argument->model_from_memory()));

// tuned trt dynamic_shape
pass->Set("trt_shape_range_info_path",
new std::string(argument->tensorrt_shape_range_info_path()));
pass->Set("trt_tuned_dynamic_shape",
new bool(argument->tensorrt_tuned_dynamic_shape()));
pass->Set("trt_allow_build_at_runtime",
new bool(argument->tensorrt_allow_build_at_runtime()));
pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
argument->max_input_shape()));
pass->Set("min_input_shape", new std::map<std::string, std::vector<int>>(
argument->min_input_shape()));
pass->Set("optim_input_shape",
new std::map<std::string, std::vector<int>>(
argument->optim_input_shape()));
bool with_dynamic_shape = argument->max_input_shape().size() > 0 &&
argument->min_input_shape().size() > 0 &&
argument->optim_input_shape().size() > 0;
bool with_dynamic_shape = (argument->max_input_shape().size() > 0 &&
argument->min_input_shape().size() > 0 &&
argument->optim_input_shape().size() > 0) ||
argument->tensorrt_tuned_dynamic_shape();
pass->Set("with_dynamic_shape", new bool(with_dynamic_shape));
pass->Set("trt_disabled_ops", new std::vector<std::string>(
argument->tensorrt_disabled_ops()));
pass->Set("trt_use_dla", new bool(argument->tensorrt_use_dla()));
pass->Set("trt_dla_core", new int(argument->tensorrt_dla_core()));
// Setting the disable_trt_plugin_fp16 to true means that TRT plugin will
// not
// run fp16.
// not run fp16.
pass->Set("disable_trt_plugin_fp16",
new bool(argument->disable_trt_plugin_fp16()));
} else if (pass_name == "dlnne_subgraph_pass") {
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
cc_library(subgraph_util SRCS subgraph_util.cc DEPS subgraph_detector)

if (WITH_GPU AND TENSORRT_FOUND)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller)
cc_library(tensorrt_subgraph_pass SRCS tensorrt_subgraph_pass.cc DEPS subgraph_util tensorrt_op_teller infer_io_utils)

set(analysis_deps ${analysis_deps}
subgraph_util tensorrt_subgraph_pass
Expand Down
40 changes: 29 additions & 11 deletions paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/tensorrt/helper.h"
#include "paddle/fluid/inference/tensorrt/op_teller.h"
#include "paddle/fluid/inference/utils/io_utils.h"

namespace paddle {
namespace inference {
Expand Down Expand Up @@ -197,6 +198,17 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
auto opt_input_shape =
Get<std::map<std::string, std::vector<int>>>("optim_input_shape");

auto allow_build_at_runtime = Get<bool>("trt_allow_build_at_runtime");
auto shape_range_info_path = Get<std::string>("trt_shape_range_info_path");
auto trt_tuned_dynamic_shape = Get<bool>("trt_tuned_dynamic_shape");
int max_batch_size = Get<int>("max_batch_size");
if (trt_tuned_dynamic_shape) {
VLOG(1) << "trt dynamic_shape deserialize from " << shape_range_info_path;
inference::DeserializeShapeRangeInfo(shape_range_info_path,
&min_input_shape, &max_input_shape,
&opt_input_shape);
}

// The following procedure is used to rename all the intermediate
// variables and the output variables of the subgraph.
// Why we do this?
Expand Down Expand Up @@ -242,12 +254,14 @@ void TensorRtSubgraphPass::CreateTensorRTOp(

op_desc->SetBlockAttr("sub_block", new_block);
op_desc->SetAttr("subgraph", block_desc.Proto()->SerializeAsString());
op_desc->SetAttr("max_batch_size", Get<int>("max_batch_size"));
op_desc->SetAttr("max_batch_size", max_batch_size);
op_desc->SetAttr("workspace_size", Get<int>("workspace_size"));
op_desc->SetAttr("gpu_id", Get<int>("gpu_device_id"));
op_desc->SetAttr("output_name_mapping", output_mapping);
op_desc->SetAttr("origin_output_dims", renamed_output_dims);
op_desc->SetAttr("parameters", params);
op_desc->SetAttr("allow_build_at_runtime", allow_build_at_runtime);
op_desc->SetAttr("shape_range_info_path", shape_range_info_path);

// we record all inputs' shapes in attr to check if they are consistent
// with the real inputs' shapes retrieved from scope when trt runs.
Expand All @@ -259,19 +273,24 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
}

auto use_static_engine = Get<bool>("use_static_engine");
op_desc->SetAttr("use_static_engine", use_static_engine);
if (use_static_engine)
op_desc->SetAttr("model_opt_cache_dir",
Get<std::string>("model_opt_cache_dir"));

// TODO(NHZlX)
// There are models with the same structure but the different parameters,
// when running in the 'use_serialize' mode, there is a bug.
// serialization is affected by max_batch_size, but calibration is not.
// So we use separate engine keys in serialization and calibration.
auto engine_key = GenerateEngineKey(
input_names_with_id, output_names_with_id, std::to_string(0),
std::to_string(Get<int>("max_batch_size")),
std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)), false);
auto calibration_engine_key = GenerateEngineKey(
input_names_with_id, output_names_with_id, std::to_string(0),
std::to_string(Get<int>("max_batch_size")),
std::to_string(static_cast<int>(precision_mode)), true);
auto calibration_engine_key =
GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0), std::to_string(max_batch_size),
std::to_string(static_cast<int>(precision_mode)), true);
auto predictor_id = Get<int>("predictor_id");

// Get "" when there is no cached calibration table data.
Expand Down Expand Up @@ -345,11 +364,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
bool disable_trt_plugin_fp16 = Get<bool>("disable_trt_plugin_fp16");
tensorrt::TensorRTEngine *trt_engine =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key + std::to_string(predictor_id),
Get<int>("max_batch_size"), Get<int>("workspace_size"),
precision_mode, calibrator.get(), Get<int>("gpu_device_id"),
min_input_shape, max_input_shape, opt_input_shape,
disable_trt_plugin_fp16);
.Create(engine_key + std::to_string(predictor_id), max_batch_size,
Get<int>("workspace_size"), precision_mode, calibrator.get(),
Get<int>("gpu_device_id"), min_input_shape, max_input_shape,
opt_input_shape, disable_trt_plugin_fp16);
trt_engine->SetUseOSS(Get<bool>("use_oss"));
trt_engine->SetUseDLA(Get<bool>("trt_use_dla"));
trt_engine->SetDLACore(Get<int>("trt_dla_core"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,17 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// We get all the vars from local_scope instead of the ProgramDesc.
// Because there exists the case that new parameter variables are not added to
// the program in the analysis pass.
bool reserve_cpu_weights = false;
if (argument->tensorrt_allow_build_at_runtime_valid() &&
argument->tensorrt_allow_build_at_runtime()) {
reserve_cpu_weights = true;
}
for (auto &var_name : all_vars) {
if (std::count(repetitive_params.begin(), repetitive_params.end(),
var_name)) {
scope->EraseVars({var_name});
if (!reserve_cpu_weights) {
scope->EraseVars({var_name});
}
continue;
}
auto *var = scope->FindLocalVar(var_name);
Expand Down
2 changes: 1 addition & 1 deletion paddle/fluid/inference/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ if(WITH_GPU AND TENSORRT_FOUND)
endif()

cc_library(analysis_predictor SRCS analysis_predictor.cc ${mkldnn_quantizer_src} DEPS ${inference_deps}
zero_copy_tensor ir_pass_manager op_compatible_info)
zero_copy_tensor ir_pass_manager op_compatible_info infer_io_utils)

cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api)

Expand Down
49 changes: 47 additions & 2 deletions paddle/fluid/inference/api/analysis_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(trt_use_static_engine_);
CP_MEMBER(trt_use_calib_mode_);
CP_MEMBER(trt_use_oss_);
CP_MEMBER(trt_tuned_dynamic_shape_);
CP_MEMBER(trt_allow_build_at_runtime_);
CP_MEMBER(collect_shape_range_info_);
CP_MEMBER(shape_range_info_path_);
// Dlnne related
CP_MEMBER(use_dlnne_);
CP_MEMBER(dlnne_min_subgraph_size_);
Expand Down Expand Up @@ -653,8 +657,8 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
#endif
}

void AnalysisConfig::EnableMemoryOptim() {
enable_memory_optim_ = true;
// Enable or disable the memory-optimization pass.
// x: true turns the optimization on, false turns it off; Update() re-derives
// the dependent pass configuration after the flag changes.
void AnalysisConfig::EnableMemoryOptim(bool x) {
  enable_memory_optim_ = x;
  Update();
}

Expand Down Expand Up @@ -783,6 +787,9 @@ std::string AnalysisConfig::Summary() {
// dynamic_shape
os.InsertRow({"tensorrt_enable_dynamic_shape",
min_input_shape_.empty() ? "false" : "true"});
os.InsertRow({"tensorrt_tuned_dynamic_shape", trt_tuned_dynamic_shape_
? shape_range_info_path_
: "false"});

os.InsertRow({"tensorrt_use_oss", trt_use_oss_ ? "true" : "false"});
os.InsertRow({"tensorrt_use_dla", trt_use_dla_ ? "true" : "false"});
Expand Down Expand Up @@ -812,8 +819,46 @@ std::string AnalysisConfig::Summary() {
os.InsertRow({"memory_optim", enable_memory_optim_ ? "true" : "false"});
os.InsertRow({"enable_profile", with_profile_ ? "true" : "false"});
os.InsertRow({"enable_log", with_glog_info_ ? "true" : "false"});
os.InsertRow({"collect_shape_range_info",
collect_shape_range_info_ ? shape_range_info_path_ : "false"});

return os.PrintTable();
}

// Turn on shape-range collection mode: during inference the predictor records
// the runtime shapes of all intermediate tensors and, on destruction, writes
// the derived min/max/opt shapes to shape_range_info_path.
// shape_range_info_path: output file path; must be non-empty.
void AnalysisConfig::CollectShapeRangeInfo(
    const std::string &shape_range_info_path) {
  LOG(INFO) << "In CollectShapeRangeInfo mode, we will disable optimizations "
               "and collect the shape information of "
            << "all intermediate tensors in the compute graph and calculate "
               "the min_shape, max_shape and opt_shape.";
  // Validate the argument BEFORE mutating any state, so a failed call leaves
  // the config unchanged instead of half-enabled.
  PADDLE_ENFORCE_EQ(shape_range_info_path.empty(), false,
                    platform::errors::InvalidArgument(
                        "The shape_range_info_path should not be empty, please "
                        "re-check the argument."));
  collect_shape_range_info_ = true;
  shape_range_info_path_ = shape_range_info_path;
}

const std::string &AnalysisConfig::shape_range_info_path() {
return shape_range_info_path_;
}

bool AnalysisConfig::shape_range_info_collected() {
return collect_shape_range_info_;
}

// Switch TensorRT into tuned-dynamic-shape mode: min/max/opt shape ranges are
// loaded from a previously collected shape_range_info_path file instead of
// being specified manually.
// allow_build_at_runtime: if true, the engine may be rebuilt at runtime when
// an input shape falls outside the tuned ranges.
void AnalysisConfig::EnableTunedTensorRtDynamicShape(
    const std::string &shape_range_info_path, bool allow_build_at_runtime) {
  trt_tuned_dynamic_shape_ = true;
  trt_allow_build_at_runtime_ = allow_build_at_runtime;
  shape_range_info_path_ = shape_range_info_path;
}

bool AnalysisConfig::tuned_tensorrt_dynamic_shape() {
return trt_tuned_dynamic_shape_;
}

bool AnalysisConfig::trt_allow_build_at_runtime() {
return trt_allow_build_at_runtime_;
}
} // namespace paddle
90 changes: 90 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,17 @@
// limitations under the License.

#include "paddle/fluid/inference/api/analysis_predictor.h"

#include <glog/logging.h>

#include <algorithm>
#include <fstream>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "paddle/fluid/extension/include/ext_op_meta_info.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
Expand All @@ -34,6 +37,7 @@
#include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h"
#include "paddle/fluid/inference/api/helper.h"
#include "paddle/fluid/inference/api/paddle_inference_pass.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/inference/utils/singleton.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/cpu_helper.h"
Expand Down Expand Up @@ -570,6 +574,11 @@ void AnalysisPredictor::PrepareArgument() {
argument_.SetMaxInputShape(config_.max_input_shape_);
argument_.SetOptimInputShape(config_.optim_input_shape_);
argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_);
argument_.SetTensorRtShapeRangeInfoPath(config_.shape_range_info_path());
argument_.SetTensorRtTunedDynamicShape(
config_.tuned_tensorrt_dynamic_shape());
argument_.SetTensorRtAllowBuildAtRuntime(
config_.trt_allow_build_at_runtime());
}

if (config_.dlnne_enabled()) {
Expand Down Expand Up @@ -915,6 +924,11 @@ bool AnalysisPredictor::ZeroCopyRun() {
#endif

executor_->Run();

if (config_.shape_range_info_collected()) {
CollectShapeRangeInfo();
}

// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
tensor_array_batch_cleaner_.ResetTensorArray();
Expand All @@ -934,6 +948,78 @@ bool AnalysisPredictor::ZeroCopyRun() {
return true;
}

// Record the current runtime shape of every LoDTensor in the sub-scope,
// appending one shape sample per variable into shape_info_. The accumulated
// samples are later reduced to min/max/opt ranges (StatisticShapeRangeInfo).
void AnalysisPredictor::CollectShapeRangeInfo() {
  // if use gpu, sync first.
  // Shapes are read on the host, so the device stream must be drained to make
  // sure all kernels that may resize outputs have finished.
  if (config_.use_gpu()) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    paddle::platform::DeviceContextPool &pool =
        paddle::platform::DeviceContextPool::Instance();
    auto gpu_place = BOOST_GET_CONST(paddle::platform::CUDAPlace, place_);
    auto *dev_ctx = static_cast<const paddle::platform::CUDADeviceContext *>(
        pool.Get(gpu_place));
#ifdef PADDLE_WITH_HIP
    hipStreamSynchronize(dev_ctx->stream());
#else
    cudaStreamSynchronize(dev_ctx->stream());
#endif
#endif
  }

  std::vector<std::string> var_names = sub_scope_->LocalVarNames();
  for (const auto &name : var_names) {
    auto *var = sub_scope_->GetVar(name);
    // Only dense LoDTensors carry a shape worth recording; skip everything
    // else (selected rows, readers, etc.).
    if (!var->IsType<framework::LoDTensor>()) {
      continue;
    }
    framework::DDim dim = var->Get<framework::LoDTensor>().dims();
    // Flatten the DDim into a plain int32 vector for serialization.
    std::vector<int32_t> shape(dim.size());
    for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i];
    shape_info_[name].emplace_back(shape);
  }
}

// Reduce the shape samples accumulated in shape_info_ to, per variable and
// per dimension: the minimum value, the maximum value, and the most frequent
// ("optimal") value, then serialize the three maps to the configured path.
void AnalysisPredictor::StatisticShapeRangeInfo() {
  std::map<std::string, std::vector<int32_t>> min_shapes;
  std::map<std::string, std::vector<int32_t>> max_shapes;
  std::map<std::string, std::vector<int32_t>> opt_shapes;
  // Iterate by const reference: the mapped value is a vector of shape
  // samples, which the original range-for copied on every iteration.
  for (const auto &it : shape_info_) {
    const auto &name = it.first;
    const auto &shapes = it.second;

    // Seed min/max/opt with the first recorded sample.
    std::vector<int32_t> min_shape(shapes[0].begin(), shapes[0].end());
    std::vector<int32_t> max_shape(shapes[0].begin(), shapes[0].end());
    std::vector<int32_t> opt_shape(shapes[0].begin(), shapes[0].end());

    // Most frequent dimension value; O(k) max_element instead of copying the
    // histogram into a vector and sorting it.
    auto ShapeMaxFreq = [](const std::map<int32_t, int32_t> &m) -> int32_t {
      auto best = std::max_element(
          m.begin(), m.end(),
          [](const std::pair<const int32_t, int32_t> &a,
             const std::pair<const int32_t, int32_t> &b) {
            return a.second < b.second;
          });
      return best->first;
    };

    for (size_t d = 0; d < shapes[0].size(); ++d) {
      std::map<int32_t, int32_t> counter;
      for (size_t i = 0; i < shapes.size(); ++i) {
        counter[shapes[i][d]] += 1;
        min_shape[d] = std::min(min_shape[d], shapes[i][d]);
        max_shape[d] = std::max(max_shape[d], shapes[i][d]);
      }
      opt_shape[d] = ShapeMaxFreq(counter);
    }

    min_shapes[name] = min_shape;
    max_shapes[name] = max_shape;
    opt_shapes[name] = opt_shape;
  }

  inference::SerializeShapeRangeInfo(config_.shape_range_info_path(),
                                     min_shapes, max_shapes, opt_shapes);
}

bool AnalysisPredictor::LoadProgramDesc() {
// Initialize the inference program
std::string filename;
Expand Down Expand Up @@ -1140,6 +1226,10 @@ AnalysisPredictor::~AnalysisPredictor() {
}
#endif

if (config_.shape_range_info_collected()) {
StatisticShapeRangeInfo();
}

memory::Release(place_);
}

Expand Down
Loading