24 changes: 16 additions & 8 deletions paddle/fluid/inference/anakin/engine.cc
@@ -32,18 +32,26 @@ namespace paddle {
namespace inference {
namespace anakin {

template <typename TargetT, Precision PrecisionType, OpRunType RunType>
extern std::once_flag
AnakinEngine<TargetT, PrecisionType, RunType>::init_anakin_;

template <typename TargetT, Precision PrecisionType, OpRunType RunType>
AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
bool need_summary, int device, int max_batch_size,
std::map<std::string, std::vector<int>> max_input_shape,
std::vector<std::string> program_inputs, bool auto_config_layout)
: graph_(new AnakinGraphT<TargetT, PrecisionType>()),
net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
device_ = device;
max_batch_size_ = max_batch_size;
max_input_shape_ = max_input_shape;
program_inputs_ = program_inputs;
auto_config_layout_ = auto_config_layout;
: device_(device),
max_batch_size_(max_batch_size),
max_input_shape_(max_input_shape),
program_inputs_(program_inputs),
auto_config_layout_(auto_config_layout) {
std::call_once(init_anakin_, [this]() {
::anakin::TargetWrapper<TargetT>::set_device(device_);
::anakin::Env<TargetT>::env_init();
});
graph_.reset(new AnakinGraphT<TargetT, PrecisionType>());
net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary));
}

template <typename TargetT, Precision PrecisionType, OpRunType RunType>
@@ -102,7 +110,7 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::BindInput(
anakin_input = net_->get_in(input.first);
}
anakin_input->reshape(fluid_input_shape);
::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), device_,
fluid_input_shape);
anakin_input->copy_from(tmp_anakin_tensor);
}
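The constructor now routes device selection and Anakin environment setup through a static std::once_flag, so the setup runs a single time no matter how many engines a process creates. A minimal standalone sketch of that pattern (class and function names here are illustrative, not Paddle or Anakin APIs):

#include <iostream>
#include <mutex>

class Engine {
 public:
  explicit Engine(int device) : device_(device) {
    // Executes the lambda exactly once across all Engine instances.
    std::call_once(init_flag_, [this]() {
      std::cout << "one-time env init on device " << device_ << "\n";
    });
  }

 private:
  int device_;
  static std::once_flag init_flag_;
};

// Like init_anakin_ above, the flag needs a single out-of-class definition.
std::once_flag Engine::init_flag_;

int main() {
  Engine a(0), b(0);  // the init message is printed only once
  return 0;
}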
5 changes: 3 additions & 2 deletions paddle/fluid/inference/anakin/engine.h
@@ -114,12 +114,13 @@ class AnakinEngine {

private:
bool initialized_{false};
int device_;
int max_batch_size_;
std::map<std::string, std::vector<int>> max_input_shape_;
int device_;
std::vector<std::string> program_inputs_;
std::unique_ptr<GraphT> graph_;
std::unique_ptr<NetT> net_;
std::vector<std::string> program_inputs_;
static std::once_flag init_anakin_;
std::unordered_map<std::string, float> tensor_scales_;
// Always be false in gpu mode but true in most cpu cases.
bool auto_config_layout_;
10 changes: 10 additions & 0 deletions paddle/fluid/inference/analysis/argument.h
@@ -63,6 +63,16 @@ struct Argument {
using anakin_max_shape_t = std::map<std::string, std::vector<int>>;

bool Has(const std::string& key) const { return valid_fields_.count(key); }
void PartiallyRelease() {
if (Has("model_program_path")) {
if (Has("model_from_memory") && model_from_memory()) {
model_program_path().clear();
model_program_path().shrink_to_fit();
model_params_path().clear();
model_params_path().shrink_to_fit();
}
}
}

#define DECL_ARGUMENT_FIELD(field__, Field, type__) \
public: \
1 change: 1 addition & 0 deletions paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -87,6 +87,7 @@ void IRPassManager::CreatePasses(Argument *argument,
bool enable_int8 = argument->tensorrt_precision_mode() ==
AnalysisConfig::Precision::kInt8;

pass->Set("predictor_id", new int(argument->predictor_id()));
bool use_calib_mode = argument->tensorrt_use_calib_mode();
pass->Set("enable_int8", new bool(enable_int8));
pass->Set("use_calib_mode", new bool(use_calib_mode));
33 changes: 19 additions & 14 deletions paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -199,8 +199,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "parameters", params);

auto use_static_engine = Get<bool>("use_static_engine");
// TODO(NHZlX)
// There are models with the same structure but different parameters;
// when running in 'use_serialize' mode, there is a bug.
auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id,
std::to_string(0));
auto predictor_id = Get<int>("predictor_id");

// Get "" when there is no cached calibration table data.
bool load_from_memory = Get<bool>("model_from_memory");
@@ -214,6 +218,7 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
SetAttr(op_desc->Proto(), "use_calib_mode", use_calib_mode);
SetAttr(op_desc->Proto(), "engine_key", engine_key);
SetAttr(op_desc->Proto(), "predictor_id", predictor_id);
std::string trt_engine_serialized_data = "";
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
@@ -233,15 +238,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp(

std::copy(params.begin(), params.end(),
std::back_inserter(*repetitive_params));
bool need_serialize = (use_static_engine && !load_from_memory);

tensorrt::TensorRTEngine *trt_engine =
inference::Singleton<inference::tensorrt::TRTEngineManager>::Global()
.Create(engine_key + std::to_string(predictor_id),
Get<int>("max_batch_size"), Get<int>("workspace_size"),
enable_int8, calibrator.get(), Get<int>("gpu_device_id"));

bool need_serialize = (use_static_engine && !load_from_memory);
if (need_serialize) {
trt_engine_serialized_data = GetTrtEngineSerializedData(
Get<std::string>("model_opt_cache_dir"), engine_key);
// We can load engine info that was serialized to disk earlier.
if (!trt_engine_serialized_data.empty()) {
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
trt_engine->Deserialize(trt_engine_serialized_data);
LOG(INFO) << "Load TRT Optimized Info from "
<< GetTrtEngineSerializedPath(
Get<std::string>("model_opt_cache_dir"), engine_key);
@@ -254,31 +264,26 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
// 2. already load serialized trt engine info.
LOG(INFO) << "Prepare TRT engine (Optimize model structure, Select OP "
"kernel etc). This process may cost a lot of time.";
std::unique_ptr<tensorrt::TensorRTEngine> trt_engine(
new tensorrt::TensorRTEngine(
Get<int>("max_batch_size"), Get<int>("workspace_size"), enable_int8,
calibrator.get(), Get<int>("gpu_device_id")));

auto *scope = param_scope();
framework::BlockDesc block_desc_temp(nullptr, block_desc.Proto());
std::unordered_set<std::string> param_set(params.begin(), params.end());
inference::Singleton<inference::tensorrt::OpConverter>::Global()
.ConvertBlockToTRTEngine(
&block_desc_temp, *scope,
std::vector<std::string>(input_names.begin(), input_names.end()),
param_set, output_mapping, trt_engine.get());
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
param_set, output_mapping, trt_engine);

if (need_serialize) {
nvinfer1::IHostMemory *serialized_engine_data = trt_engine->Serialize();
trt_engine_serialized_data =
std::string((const char *)serialized_engine_data->data(),
serialized_engine_data->size());
SaveTrtEngineSerializedDataToFile(
GetTrtEngineSerializedPath(Get<std::string>("model_opt_cache_dir"),
engine_key),
trt_engine_serialized_data);
}
SetAttr(op_desc->Proto(), "engine_serialized_data",
trt_engine_serialized_data);
}

} // namespace analysis
@@ -69,7 +69,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
// Copy the parameter data to a tmp tensor.
TensorCopySync(*t, cpu_place, &temp_tensor);
// Reallocation the space on GPU
t->mutable_data<float>(place);
t->clear();

// Copy parameter data to newly allocated GPU space.
TensorCopySync(temp_tensor, place, t);
15 changes: 13 additions & 2 deletions paddle/fluid/inference/api/analysis_config.cc
@@ -87,10 +87,12 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {

// Model related.
CP_MEMBER(model_dir_);
CP_MEMBER(prog_file_);
CP_MEMBER(params_file_);
CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and
// params_file_ fields.

prog_file_ = std::move(other.prog_file_);
params_file_ = std::move(other.params_file_);

// Gpu related.
CP_MEMBER(use_gpu_);
CP_MEMBER(device_id_);
@@ -369,6 +371,7 @@ float AnalysisConfig::fraction_of_gpu_memory_for_pool() const {
// Get the GPU memory details and calculate the fraction of memory for the
// GPU memory pool.
size_t gpu_used, gpu_available;
platform::SetDeviceId(device_id_);
platform::GpuMemoryUsage(&gpu_used, &gpu_available);
double total_gpu_memory = (gpu_used + gpu_available) / 1024. / 1024.;
float fraction_of_gpu_memory =
@@ -439,4 +442,12 @@ void AnalysisConfig::EnableAnakinEngine(
anakin_auto_config_layout_ = auto_config_layout;
Update();
}

void AnalysisConfig::PartiallyRelease() {
prog_file_.clear();
prog_file_.shrink_to_fit();
params_file_.clear();
params_file_.shrink_to_fit();
}

} // namespace paddle
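Both PartiallyRelease helpers pair clear() with shrink_to_fit(): clear() only resets the size, while the shrink request is what actually gives the program/params buffers back to the allocator (shrink_to_fit is technically non-binding, but mainstream implementations honor it). A small standalone illustration with a made-up buffer size:

#include <iostream>
#include <string>

int main() {
  std::string buf(64 * 1024 * 1024, 'x');  // stand-in for a serialized program
  buf.clear();
  std::cout << "after clear():         capacity = " << buf.capacity() << '\n';
  buf.shrink_to_fit();
  std::cout << "after shrink_to_fit(): capacity = " << buf.capacity() << '\n';
  return 0;
}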
8 changes: 8 additions & 0 deletions paddle/fluid/inference/api/analysis_predictor.cc
@@ -444,13 +444,19 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
ARGUMENT_CHECK_FIELD((&argument_), ir_analyzed_program);
inference_program_.reset(
new framework::ProgramDesc(argument_.ir_analyzed_program()));
// The config and argument hold a lot of memory; once the predictor
// is fully set up, we release that storage.
argument_.PartiallyRelease();
config_.PartiallyRelease();
LOG(INFO) << "== optimize end ==";
}

template <>
std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig";
PADDLE_ENFORCE(config.is_valid(),
"Note: Each config can only be used for one predictor.");
if (config.use_gpu()) {
// 1. GPU memory
PADDLE_ENFORCE_GE(config.memory_pool_init_size_mb(), 0.f);
@@ -480,6 +486,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
}

std::unique_ptr<PaddlePredictor> predictor(new AnalysisPredictor(config));
// Each config can only be used for one predictor.
config.SetInValid();
auto predictor_p = dynamic_cast<AnalysisPredictor *>(predictor.get());

if (!predictor_p->Init(nullptr)) {
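With SetInValid() called inside CreatePaddlePredictor and the new is_valid() enforcement, a config object can back only one predictor; callers that build several predictors need a fresh config for each, which is exactly why the tests below construct cfg1 alongside cfg. A hedged usage sketch (include path and model location are illustrative):

#include "paddle/include/paddle_inference_api.h"  // illustrative include path

int main() {
  paddle::AnalysisConfig cfg;
  cfg.SetModel("./model_dir");  // hypothetical model location

  auto pred1 = paddle::CreatePaddlePredictor(cfg);
  // cfg is now marked invalid; reusing it would trip the is_valid() check.

  paddle::AnalysisConfig cfg2;
  cfg2.SetModel("./model_dir");
  auto pred2 = paddle::CreatePaddlePredictor(cfg2);  // one config per predictor
  return 0;
}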
12 changes: 10 additions & 2 deletions paddle/fluid/inference/api/paddle_analysis_config.h
@@ -232,13 +232,16 @@ struct AnalysisConfig {
bool force_update_static_cache = false);
/** Tell whether the memory optimization is activated. */
bool enable_memory_optim() const;
void SetInValid() const { is_valid_ = false; }
bool is_valid() const { return is_valid_; }

friend class ::paddle::AnalysisPredictor;

/** NOTE just for developer, not an official API, easily to be broken.
* Get a pass builder for customize the passes in IR analysis phase.
*/
PassStrategy* pass_builder() const;
void PartiallyRelease();

protected:
// Update the config.
@@ -249,8 +252,8 @@
protected:
// Model pathes.
std::string model_dir_;
std::string prog_file_;
std::string params_file_;
mutable std::string prog_file_;
mutable std::string params_file_;

// GPU related.
bool use_gpu_{false};
@@ -312,6 +315,11 @@

bool use_mkldnn_quantizer_{false};
std::shared_ptr<MkldnnQuantizerConfig> mkldnn_quantizer_config_;
// If the config is already used on a predictor, it becomes invalid.
mutable bool is_valid_{true};
// Any config can only be used with one predictor.
// Variables held by config can take up a lot of memory in some cases.
// So we release the memory when the predictor is set up.
};

} // namespace paddle
1 change: 1 addition & 0 deletions paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -109,6 +109,7 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) {
"conv_affine_channel_fuse_pass", //
"conv_eltwiseadd_affine_channel_fuse_pass", //
"conv_bn_fuse_pass", //
"conv_eltwiseadd_bn_fuse_pass", //
#if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be
// guaranteed at least v7
"conv_elementwise_add_act_fuse_pass", //
1 change: 1 addition & 0 deletions paddle/fluid/inference/tensorrt/convert/op_converter.h
@@ -170,6 +170,7 @@ class OpConverter {
engine->DeclareOutput(output);
}
engine->FreezeNetwork();
engine->ClearWeights();
}

void RreplenishLayerAndOutput(
39 changes: 39 additions & 0 deletions paddle/fluid/inference/tensorrt/engine.h
@@ -149,6 +149,12 @@ class TensorRTEngine {
std::unordered_map<std::string /*name*/, std::unique_ptr<framework::Tensor>>
weight_map;

void ClearWeights() {
for (auto& weight_pair : weight_map) {
weight_pair.second.reset(nullptr);
}
}

private:
// Each ICudaEngine object is bound to a specific GPU when it is instantiated,
// ensure that the thread is associated with the correct device by calling
@@ -213,6 +219,39 @@
#define TRT_ENGINE_ADD_LAYER(engine__, layer__, ARGS...) \
engine__->network()->add##layer__(ARGS);

class TRTEngineManager {
public:
bool Empty() const { return engines_.size() == 0; }
bool Has(const std::string& name) const {
if (engines_.count(name) == 0) return false;
return engines_.at(name).get() != nullptr;
}

TensorRTEngine* Get(const std::string& name) const {
return engines_.at(name).get();
}

TensorRTEngine* Create(std::string name, int max_batch, int max_workspace,
bool enable_int8 = false,
TRTInt8Calibrator* calibrator = nullptr,
int device_id = 0,
nvinfer1::ILogger& logger = NaiveLogger::Global()) {
auto* p = new TensorRTEngine(max_batch, max_workspace, enable_int8,
calibrator, device_id, logger);
engines_[name].reset(p);
return p;
}

void DeleteAll() {
for (auto& item : engines_) {
item.second.reset(nullptr);
}
}

private:
std::unordered_map<std::string, std::unique_ptr<TensorRTEngine>> engines_;
};

} // namespace tensorrt
} // namespace inference
} // namespace paddle
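The subgraph pass above already shows the intended call pattern for this manager: engines are registered under engine_key plus the predictor id, created once, and then looked up by key. A minimal sketch of a helper following that pattern (function and parameter names are illustrative, and the include paths are assumed; the singleton wrapper is the one used in tensorrt_subgraph_pass.cc):

#include <string>

#include "paddle/fluid/inference/tensorrt/engine.h"
#include "paddle/fluid/inference/utils/singleton.h"

namespace ti = paddle::inference;

ti::tensorrt::TensorRTEngine* GetOrCreateEngine(const std::string& engine_key,
                                                int predictor_id, int max_batch,
                                                int workspace_size,
                                                bool enable_int8, int device_id) {
  auto& manager = ti::Singleton<ti::tensorrt::TRTEngineManager>::Global();
  const std::string key = engine_key + std::to_string(predictor_id);
  if (manager.Has(key)) {
    return manager.Get(key);  // engine already built for this predictor
  }
  // The calibrator is omitted (nullptr) in this sketch.
  return manager.Create(key, max_batch, workspace_size, enable_int8,
                        /*calibrator=*/nullptr, device_id);
}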
2 changes: 1 addition & 1 deletion paddle/fluid/inference/tensorrt/op_teller.cc
@@ -31,7 +31,7 @@ struct SimpleOpTypeSetTeller : public Teller {
std::unordered_set<std::string> teller_set{
{"mul", "conv2d", "pool2d", "relu", "softmax", "sigmoid",
"depthwise_conv2d", "batch_norm", "concat", "tanh", "pad",
"elementwise_add", "elementwise_mul", "dropout", "split", "prelu",
"elementwise_add", "elementwise_mul", "dropout", "prelu",
"conv2d_transpose", "leaky_relu", "fc"}};
};

@@ -177,11 +177,15 @@ TEST(Analyzer_Pyramid_DNN, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);

AnalysisConfig cfg1;
SetConfig(&cfg1);

std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back("cos_sim_2.tmp_0");
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
input_slots_all, outputs_name);
}

4 changes: 4 additions & 0 deletions paddle/fluid/inference/tests/api/analyzer_rnn1_tester.cc
@@ -293,11 +293,15 @@ TEST(Analyzer_rnn1, compare_zero_copy) {
AnalysisConfig cfg;
SetConfig(&cfg);

AnalysisConfig cfg1;
SetConfig(&cfg1);

std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
std::vector<std::string> outputs_name;
outputs_name.emplace_back("final_output.tmp_1");
CompareAnalysisAndZeroCopy(reinterpret_cast<PaddlePredictor::Config *>(&cfg),
reinterpret_cast<PaddlePredictor::Config *>(&cfg1),
input_slots_all, outputs_name);
}
