
Commit 32bc546
Merge branch 'develop' into docs
2 parents: 9eeeab8 + 0fc181d


42 files changed: +538 / -170 lines

paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass,
 REGISTER_PASS_CAPABILITY(conv_transpose_bias_mkldnn_fuse_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
-            .EQ("conv2d_transpose", 0)
+            .LE("conv2d_transpose", 1)
             .EQ("elementwise_add", 0));
 
 REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
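
Note on the change above: switching from .EQ("conv2d_transpose", 0) to .LE("conv2d_transpose", 1) widens the accepted operator version range from exactly 0 to anything up to and including 1. As a rough illustration of how such a capability declaration reads (a sketch built only from the calls visible in this diff; the pass name is hypothetical):

REGISTER_PASS_CAPABILITY(my_example_fuse_pass)  // hypothetical pass name
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .LE("conv2d_transpose", 1)   // compatible with op versions 0 and 1
            .EQ("elementwise_add", 0));  // requires exactly op version 0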

paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc

Lines changed: 63 additions & 21 deletions
@@ -195,32 +195,73 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
   auto* weight_tensor =
       scope->Var(quantized_op_weight_node->Name())->GetMutable<LoDTensor>();
   auto w_dims = weight_tensor->dims();
+  float* quantized_weight_data =
+      weight_tensor->mutable_data<float>(platform::CPUPlace());
   // If quantized op is fc, weight scale size = 1;
   // If quantized op is conv2d, weight scale size = weight dims[0]
   // If quantized op is conv2d_transpose, weight scale size = weight dims[1]
-  bool valid_scale_size =
-      (weight_scale.size() == 1 ||
-       weight_scale.size() == static_cast<size_t>(w_dims[0]) ||
-       weight_scale.size() == static_cast<size_t>(w_dims[1]));
-  PADDLE_ENFORCE_EQ(
-      valid_scale_size, true,
-      platform::errors::InvalidArgument(
-          "TRT int8 quant: invalid scale size(%d).", weight_scale.size()));
-  float* quantized_weight_data =
-      weight_tensor->mutable_data<float>(platform::CPUPlace());
-  for (int j = 0; j < weight_tensor->numel(); j++) {
-    if (weight_scale.size() == 1) {
-      quantized_weight_data[j] *= weight_scale[0];
-    } else {
-      if (quantized_op_type == "conv2d_transpose") {
-        int inner_size = w_dims[2] * w_dims[3];
-        quantized_weight_data[j] *=
-            weight_scale[(j / inner_size) % w_dims[1]];
-      } else {
-        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
-        quantized_weight_data[j] *= weight_scale[j / inner_size];
+  if (quantized_op_type == "mul" || quantized_op_type == "fc") {
+    if (dequant_type == "fake_dequantize_max_abs") {
+      PADDLE_ENFORCE_EQ(
+          weight_scale.size(), 1,
+          platform::errors::InvalidArgument(
+              "mul op weight dequantized by [fake_dequantize_max_abs] "
+              "requires weight scale size = 1, but got %d.",
+              weight_scale.size()));
+      for (int j = 0; j < weight_tensor->numel(); j++) {
+        quantized_weight_data[j] *= weight_scale[0];
       }
     }
+    if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
+      PADDLE_ENFORCE_EQ(
+          weight_scale.size(), static_cast<size_t>(w_dims[1]),
+          platform::errors::InvalidArgument(
+              "mul op weight dequantized by "
+              "[fake_channel_wise_dequantize_max_abs] requires weight scale "
+              "size = 2nd dim of mul's weight, which is %d, but got %d.",
+              static_cast<size_t>(w_dims[1]), weight_scale.size()));
+      for (int j = 0; j < weight_tensor->numel(); j++) {
+        quantized_weight_data[j] *= weight_scale[j % w_dims[1]];
+      }
+    }
+  } else if (quantized_op_type == "conv2d" ||
+             quantized_op_type == "depthwise_conv2d") {
+    PADDLE_ENFORCE_EQ(
+        dequant_type, "fake_channel_wise_dequantize_max_abs",
+        platform::errors::InvalidArgument("conv2d op must be dequantized by "
+                                          "[fake_channel_wise_dequantize_max_"
+                                          "abs], but got %s",
+                                          dequant_type));
+    PADDLE_ENFORCE_EQ(
+        weight_scale.size(), static_cast<size_t>(w_dims[0]),
+        platform::errors::InvalidArgument(
+            "conv2d op requires weight scale size = channel size of the "
+            "weight, which is %d, but got %d.",
+            static_cast<size_t>(w_dims[0]), weight_scale.size()));
+    for (int j = 0; j < weight_tensor->numel(); j++) {
+      int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
+      quantized_weight_data[j] *= weight_scale[j / inner_size];
+    }
+  } else if (quantized_op_type == "conv2d_transpose") {
+    PADDLE_ENFORCE_EQ(
+        dequant_type, "fake_channel_wise_dequantize_max_abs",
+        platform::errors::InvalidArgument(
+            "conv2d_transpose must be dequantized by "
+            "[fake_channel_wise_dequantize_max_abs], but got %s",
+            dequant_type));
+    PADDLE_ENFORCE_EQ(
+        weight_scale.size(), static_cast<size_t>(w_dims[1]),
+        platform::errors::InvalidArgument(
+            "conv2d_transpose op requires weight scale size = channel size "
+            "of the weight, which is %d, but got %d.",
+            static_cast<size_t>(w_dims[1]), weight_scale.size()));
+    for (int j = 0; j < weight_tensor->numel(); j++) {
+      int inner_size = w_dims[2] * w_dims[3];
+      quantized_weight_data[j] *= weight_scale[(j / inner_size) % w_dims[1]];
+    }
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Unsupported quantized op type: %s", quantized_op_type));
   }
 
   // create new op_desc
@@ -285,6 +326,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(quant_conv2d_dequant_fuse_pass,
               paddle::framework::ir::QuantDequantFusePass);
+REGISTER_PASS_CAPABILITY(quant_conv2d_dequant_fuse_pass);
 
 REGISTER_PASS_CAPABILITY(tensorrt_subgraph_pass)
     .AddCombination(
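
The comments in the new code spell out the expected weight-scale layout per op type: a single scale (or one per output column) for mul/fc, one scale per output channel (w_dims[0]) for conv2d/depthwise_conv2d, and one per output channel (w_dims[1]) for conv2d_transpose. A minimal standalone sketch of the conv2d and conv2d_transpose indexing, using plain arrays in place of Paddle tensors (the helper names here are made up for illustration):

#include <vector>

// Mirrors the per-channel indexing used in FuseDequant above; dims is the
// 4-D weight shape [d0, d1, d2, d3].
void DequantConv2d(std::vector<float>* w, const std::vector<float>& scales,
                   const int dims[4]) {
  int inner_size = dims[1] * dims[2] * dims[3];  // elements per output channel
  for (size_t j = 0; j < w->size(); ++j)
    (*w)[j] *= scales[j / inner_size];           // scale index = output channel (d0)
}

void DequantConv2dTranspose(std::vector<float>* w,
                            const std::vector<float>& scales,
                            const int dims[4]) {
  int inner_size = dims[2] * dims[3];            // spatial elements per filter
  for (size_t j = 0; j < w->size(); ++j)
    (*w)[j] *= scales[(j / inner_size) % dims[1]];  // channel dimension is d1
}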

paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc

Lines changed: 1 addition & 1 deletion
@@ -394,5 +394,5 @@ REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
             .EQ("square", 0)
             .EQ("elementwise_mul", 0)
             .EQ("elementwise_sub", 0)
-            .EQ("fill_constant", 0)
+            .EQ("fill_constant", 1)
             .EQ("fusion_squared_mat_sub", 0));

paddle/fluid/framework/op_version_registry.h

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ class PassVersionCheckerRegistrar {
   bool IsPassCompatible(const std::string& fuse_pass_name) const {
     auto iter = pass_version_checkers_map_.find(fuse_pass_name);
     if (iter == pass_version_checkers_map_.end()) {
-      return true;
+      return false;
     }
     return iter->second.IsPassCompatible();
   }
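
With this change, a fuse pass that never registered a capability via REGISTER_PASS_CAPABILITY is reported as incompatible instead of silently passing the check. A caller-side sketch (assuming the same using-declarations as the test file below; the pass name is made up):

// Returns false after this change, because no capability was registered for it.
bool compatible = PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
    "some_unregistered_pass");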

paddle/fluid/framework/op_version_registry_test.cc

Lines changed: 4 additions & 0 deletions
@@ -57,6 +57,10 @@ TEST(test_operator_version, test_operator_version) {
 
 TEST(test_pass_op_version_checker, test_pass_op_version_checker) {
   const std::string fake_op_name{"op_name__"};
+  ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "no_registered_capability_pass"));
+
+  REGISTER_PASS_CAPABILITY(no_bind_pass);
   ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
       "no_bind_pass"));
 

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 13 additions & 2 deletions
@@ -175,7 +175,10 @@ bool AnalysisPredictor::PrepareScope(
     status_is_cloned_ = true;
   } else {
     paddle::framework::InitDevices(false);
-    scope_.reset(new paddle::framework::Scope());
+    scope_.reset(new paddle::framework::Scope(), [&](framework::Scope *scope) {
+      delete scope;
+      memory::Release(place_);
+    });
     status_is_cloned_ = false;
   }
   sub_scope_ = &scope_->NewScope();
@@ -591,7 +594,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     gflags.push_back("--allocator_strategy=thread_local");
     process_level_allocator_enabled = false;
   } else {
-    gflags.push_back("--allocator_strategy=naive_best_fit");
     process_level_allocator_enabled = true;
   }
 
@@ -890,6 +892,11 @@ bool AnalysisPredictor::LoadParameters() {
   return true;
 }
 
+uint64_t AnalysisPredictor::TryShrinkMemory() {
+  ClearIntermediateTensor();
+  return paddle::memory::Release(place_);
+}
+
 void AnalysisPredictor::ClearIntermediateTensor() {
   PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
                           platform::errors::PreconditionNotMet(
@@ -985,6 +992,8 @@ AnalysisPredictor::~AnalysisPredictor() {
     mkldnn_quantizer_ = nullptr;
   }
 #endif
+
+  memory::Release(place_);
 }
 
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -1142,6 +1151,8 @@ void Predictor::ClearIntermediateTensor() {
   predictor_->ClearIntermediateTensor();
 }
 
+uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }
+
 int GetNumBytesOfDataType(DataType dtype) {
   switch (dtype) {
     case DataType::FLOAT32:
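
The scope_.reset call in the first hunk attaches a custom deleter so that destroying the scope also returns unoccupied chunks of the memory pool for place_ to the system, and TryShrinkMemory exposes the same memory::Release path on demand. A generic, self-contained sketch of that shared_ptr custom-deleter pattern, with hypothetical types standing in for the Paddle ones:

#include <cstdio>
#include <memory>

struct Workspace { /* owns tensors, buffers, etc. */ };

// Hypothetical stand-in for paddle::memory::Release(place_).
void ReleaseCachedDeviceMemory() { std::puts("unoccupied pool chunks released"); }

int main() {
  std::shared_ptr<Workspace> ws(new Workspace(), [](Workspace* p) {
    delete p;                     // destroy the owned objects first
    ReleaseCachedDeviceMemory();  // then shrink the allocator's pool
  });
  ws.reset();  // triggers the custom deleter
}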

paddle/fluid/inference/api/analysis_predictor.h

Lines changed: 11 additions & 0 deletions
@@ -193,6 +193,17 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void ClearIntermediateTensor();
 
+  ///
+  /// \brief Release all tmp tensor to compress the size of the memory pool.
+  /// The memory pool is considered to be composed of a list of chunks, if
+  /// the chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory() override;
+
   ///
   /// \brief Get the argument used by predictor
   ///

paddle/fluid/inference/api/analysis_predictor_tester.cc

Lines changed: 44 additions & 2 deletions
@@ -135,6 +135,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
   auto* out_data = out->data<float>(&place, &size);
   LOG(INFO) << "output size: " << size / sizeof(float);
   LOG(INFO) << "output_data: " << out_data;
+  predictor->TryShrinkMemory();
 }
 
 TEST(AnalysisPredictor, Clone) {
@@ -253,8 +254,7 @@ class MkldnnQuantizerTest : public testing::Test {
  public:
   MkldnnQuantizerTest() {
     AnalysisConfig config(FLAGS_dirname);
-
-    predictor.reset(new AnalysisPredictor(config));
+    predictor = std::move(CreatePaddlePredictor(config));
     auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());
 
     auto qconfig = new MkldnnQuantizerConfig();
@@ -507,3 +507,45 @@ TEST(AnalysisPredictor, bf16_pass_strategy) {
 }
 
 }  // namespace paddle
+
+namespace paddle_infer {
+
+TEST(Predictor, Run) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+
+  auto predictor = CreatePredictor(config);
+
+  auto w0 = predictor->GetInputHandle("firstw");
+  auto w1 = predictor->GetInputHandle("secondw");
+  auto w2 = predictor->GetInputHandle("thirdw");
+  auto w3 = predictor->GetInputHandle("forthw");
+
+  w0->Reshape({4, 1});
+  w1->Reshape({4, 1});
+  w2->Reshape({4, 1});
+  w3->Reshape({4, 1});
+
+  auto* w0_data = w0->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w1_data = w1->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w2_data = w2->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w3_data = w3->mutable_data<int64_t>(PlaceType::kCPU);
+
+  for (int i = 0; i < 4; i++) {
+    w0_data[i] = i;
+    w1_data[i] = i;
+    w2_data[i] = i;
+    w3_data[i] = i;
+  }
+
+  predictor->Run();
+
+  auto out = predictor->GetOutputHandle("fc_1.tmp_2");
+  PlaceType place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle_infer

paddle/fluid/inference/api/api_tester.cc

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ TEST(paddle_inference_api, demo) {
   auto predictor = CreatePaddlePredictor(config);
   std::vector<PaddleTensor> outputs;
   predictor->Run({}, &outputs);
+  predictor->TryShrinkMemory();
 }
 
 TEST(paddle_inference_api, get_version) {

paddle/fluid/inference/api/paddle_api.h

Lines changed: 11 additions & 0 deletions
@@ -319,6 +319,17 @@ class PD_INFER_DECL PaddlePredictor {
   ///
   virtual void ClearIntermediateTensor() {}
 
+  ///
+  /// \brief Release all tmp tensor to compress the size of the memory pool.
+  /// The memory pool is considered to be composed of a list of chunks, if
+  /// the chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  virtual uint64_t TryShrinkMemory() { return 0; }
+
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
   /// and the parameters between them are shared.
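
Since the base-class default above returns 0 when a backend does not override TryShrinkMemory, callers can invoke it unconditionally after a run, as the new tests do. A condensed usage sketch with the paddle_infer API exercised in analysis_predictor_tester.cc (the model path and omitted input handling are placeholders):

paddle_infer::Config config;
config.SetModel("/path/to/model");  // placeholder path
auto predictor = paddle_infer::CreatePredictor(config);
// ... fill inputs via GetInputHandle(...), then:
predictor->Run();
uint64_t released = predictor->TryShrinkMemory();  // bytes returned to the system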
