
Commit 32bc546
Merge branch 'develop' into docs
2 parents: 9eeeab8 + 0fc181d


42 files changed: +538 / -170 lines

paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc

Lines changed: 1 addition & 1 deletion
@@ -158,7 +158,7 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass,
 REGISTER_PASS_CAPABILITY(conv_transpose_bias_mkldnn_fuse_pass)
     .AddCombination(
         paddle::framework::compatible::OpVersionComparatorCombination()
-            .EQ("conv2d_transpose", 0)
+            .LE("conv2d_transpose", 1)
             .EQ("elementwise_add", 0));
 
 REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass,
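
Note on the change above: switching from .EQ("conv2d_transpose", 0) to .LE("conv2d_transpose", 1) widens the accepted operator version range from exactly 0 to anything up to and including 1. As a rough illustration of how such a capability declaration reads (a sketch built only from the calls visible in this diff; the pass name is hypothetical):

REGISTER_PASS_CAPABILITY(my_example_fuse_pass)  // hypothetical pass name
    .AddCombination(
        paddle::framework::compatible::OpVersionComparatorCombination()
            .LE("conv2d_transpose", 1)   // compatible with op versions 0 and 1
            .EQ("elementwise_add", 0));  // requires exactly op version 0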

paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc

Lines changed: 63 additions & 21 deletions
@@ -195,32 +195,73 @@ void FuseDequant(ir::Graph* graph, Scope* scope,
   auto* weight_tensor =
       scope->Var(quantized_op_weight_node->Name())->GetMutable<LoDTensor>();
   auto w_dims = weight_tensor->dims();
+  float* quantized_weight_data =
+      weight_tensor->mutable_data<float>(platform::CPUPlace());
   // If quantized op is fc, weight scale size = 1;
   // If quantized op is conv2d, weight scale size = weight dims[0]
   // If quantized op is conv2d_transpose, weight scale size = weight dims[1]
-  bool valid_scale_size =
-      (weight_scale.size() == 1 ||
-       weight_scale.size() == static_cast<size_t>(w_dims[0]) ||
-       weight_scale.size() == static_cast<size_t>(w_dims[1]));
-  PADDLE_ENFORCE_EQ(
-      valid_scale_size, true,
-      platform::errors::InvalidArgument(
-          "TRT int8 quant: invalid scale size(%d).", weight_scale.size()));
-  float* quantized_weight_data =
-      weight_tensor->mutable_data<float>(platform::CPUPlace());
-  for (int j = 0; j < weight_tensor->numel(); j++) {
-    if (weight_scale.size() == 1) {
-      quantized_weight_data[j] *= weight_scale[0];
-    } else {
-      if (quantized_op_type == "conv2d_transpose") {
-        int inner_size = w_dims[2] * w_dims[3];
-        quantized_weight_data[j] *=
-            weight_scale[(j / inner_size) % w_dims[1]];
-      } else {
-        int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
-        quantized_weight_data[j] *= weight_scale[j / inner_size];
+  if (quantized_op_type == "mul" || quantized_op_type == "fc") {
+    if (dequant_type == "fake_dequantize_max_abs") {
+      PADDLE_ENFORCE_EQ(
+          weight_scale.size(), 1,
+          platform::errors::InvalidArgument(
+              "mul op weight dequantized by [fake_dequantize_max_abs] "
+              "requires weight scale size = 1, but got %d.",
+              weight_scale.size()));
+      for (int j = 0; j < weight_tensor->numel(); j++) {
+        quantized_weight_data[j] *= weight_scale[0];
       }
     }
+    if (dequant_type == "fake_channel_wise_dequantize_max_abs") {
+      PADDLE_ENFORCE_EQ(
+          weight_scale.size(), static_cast<size_t>(w_dims[1]),
+          platform::errors::InvalidArgument(
+              "mul op weight dequantized by "
+              "[fake_channel_wise_dequantize_max_abs] requires weight scale "
+              "size = 2nd dim of mul's weight, which is %d, but got %d.",
+              static_cast<size_t>(w_dims[1]), weight_scale.size()));
+      for (int j = 0; j < weight_tensor->numel(); j++) {
+        quantized_weight_data[j] *= weight_scale[j % w_dims[1]];
+      }
+    }
+  } else if (quantized_op_type == "conv2d" ||
+             quantized_op_type == "depthwise_conv2d") {
+    PADDLE_ENFORCE_EQ(
+        dequant_type, "fake_channel_wise_dequantize_max_abs",
+        platform::errors::InvalidArgument("conv2d op must be dequantized by "
+                                          "[fake_channel_wise_dequantize_max_"
+                                          "abs], but got %s",
+                                          dequant_type));
+    PADDLE_ENFORCE_EQ(
+        weight_scale.size(), static_cast<size_t>(w_dims[0]),
+        platform::errors::InvalidArgument(
+            "conv2d op requires weight scale size = channel size of the "
+            "weight, which is %d, but got %d.",
+            static_cast<size_t>(w_dims[0]), weight_scale.size()));
+    for (int j = 0; j < weight_tensor->numel(); j++) {
+      int inner_size = w_dims[1] * w_dims[2] * w_dims[3];
+      quantized_weight_data[j] *= weight_scale[j / inner_size];
+    }
+  } else if (quantized_op_type == "conv2d_transpose") {
+    PADDLE_ENFORCE_EQ(
+        dequant_type, "fake_channel_wise_dequantize_max_abs",
+        platform::errors::InvalidArgument(
+            "conv2d_transpose must be dequantized by "
+            "[fake_channel_wise_dequantize_max_abs], but got %s",
+            dequant_type));
+    PADDLE_ENFORCE_EQ(
+        weight_scale.size(), static_cast<size_t>(w_dims[1]),
+        platform::errors::InvalidArgument(
+            "conv2d_transpose op requires weight scale size = channel size "
+            "of the weight, which is %d, but got %d.",
+            static_cast<size_t>(w_dims[1]), weight_scale.size()));
+    for (int j = 0; j < weight_tensor->numel(); j++) {
+      int inner_size = w_dims[2] * w_dims[3];
+      quantized_weight_data[j] *= weight_scale[(j / inner_size) % w_dims[1]];
+    }
+  } else {
+    PADDLE_THROW(platform::errors::InvalidArgument(
+        "Unsupported quantized op type: %s", quantized_op_type));
   }
 
   // create new op_desc
@@ -285,6 +326,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const {
 
 REGISTER_PASS(quant_conv2d_dequant_fuse_pass,
               paddle::framework::ir::QuantDequantFusePass);
+REGISTER_PASS_CAPABILITY(quant_conv2d_dequant_fuse_pass);
 
 REGISTER_PASS_CAPABILITY(tensorrt_subgraph_pass)
     .AddCombination(
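
The comments in the new code spell out the expected weight-scale layout per op type: a single scale (or one per output column) for mul/fc, one scale per output channel (w_dims[0]) for conv2d/depthwise_conv2d, and one per output channel (w_dims[1]) for conv2d_transpose. A minimal standalone sketch of the conv2d and conv2d_transpose indexing, using plain arrays in place of Paddle tensors (the helper names here are made up for illustration):

#include <vector>

// Mirrors the per-channel indexing used in FuseDequant above; dims is the
// 4-D weight shape [d0, d1, d2, d3].
void DequantConv2d(std::vector<float>* w, const std::vector<float>& scales,
                   const int dims[4]) {
  int inner_size = dims[1] * dims[2] * dims[3];  // elements per output channel
  for (size_t j = 0; j < w->size(); ++j)
    (*w)[j] *= scales[j / inner_size];           // scale index = output channel (d0)
}

void DequantConv2dTranspose(std::vector<float>* w,
                            const std::vector<float>& scales,
                            const int dims[4]) {
  int inner_size = dims[2] * dims[3];            // spatial elements per filter
  for (size_t j = 0; j < w->size(); ++j)
    (*w)[j] *= scales[(j / inner_size) % dims[1]];  // channel dimension is d1
}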

paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc

Lines changed: 1 addition & 1 deletion
@@ -394,5 +394,5 @@ REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
             .EQ("square", 0)
             .EQ("elementwise_mul", 0)
             .EQ("elementwise_sub", 0)
-            .EQ("fill_constant", 0)
+            .EQ("fill_constant", 1)
             .EQ("fusion_squared_mat_sub", 0));

paddle/fluid/framework/op_version_registry.h

Lines changed: 1 addition & 1 deletion
@@ -308,7 +308,7 @@ class PassVersionCheckerRegistrar {
   bool IsPassCompatible(const std::string& fuse_pass_name) const {
     auto iter = pass_version_checkers_map_.find(fuse_pass_name);
     if (iter == pass_version_checkers_map_.end()) {
-      return true;
+      return false;
     }
     return iter->second.IsPassCompatible();
   }
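
With this change, a fuse pass that never registered a capability via REGISTER_PASS_CAPABILITY is reported as incompatible instead of silently passing the check. A caller-side sketch (assuming the same using-declarations as the test file below; the pass name is made up):

// Returns false after this change, because no capability was registered for it.
bool compatible = PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
    "some_unregistered_pass");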

paddle/fluid/framework/op_version_registry_test.cc

Lines changed: 4 additions & 0 deletions
@@ -57,6 +57,10 @@ TEST(test_operator_version, test_operator_version) {
 
 TEST(test_pass_op_version_checker, test_pass_op_version_checker) {
   const std::string fake_op_name{"op_name__"};
+  ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
+      "no_registered_capability_pass"));
+
+  REGISTER_PASS_CAPABILITY(no_bind_pass);
   ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible(
       "no_bind_pass"));
 

paddle/fluid/inference/api/analysis_predictor.cc

Lines changed: 13 additions & 2 deletions
@@ -175,7 +175,10 @@ bool AnalysisPredictor::PrepareScope(
     status_is_cloned_ = true;
   } else {
     paddle::framework::InitDevices(false);
-    scope_.reset(new paddle::framework::Scope());
+    scope_.reset(new paddle::framework::Scope(), [&](framework::Scope *scope) {
+      delete scope;
+      memory::Release(place_);
+    });
     status_is_cloned_ = false;
   }
   sub_scope_ = &scope_->NewScope();
@@ -591,7 +594,6 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
     gflags.push_back("--allocator_strategy=thread_local");
     process_level_allocator_enabled = false;
   } else {
-    gflags.push_back("--allocator_strategy=naive_best_fit");
     process_level_allocator_enabled = true;
   }
 
@@ -890,6 +892,11 @@ bool AnalysisPredictor::LoadParameters() {
   return true;
 }
 
+uint64_t AnalysisPredictor::TryShrinkMemory() {
+  ClearIntermediateTensor();
+  return paddle::memory::Release(place_);
+}
+
 void AnalysisPredictor::ClearIntermediateTensor() {
   PADDLE_ENFORCE_NOT_NULL(inference_program_.get(),
                           platform::errors::PreconditionNotMet(
@@ -985,6 +992,8 @@ AnalysisPredictor::~AnalysisPredictor() {
     mkldnn_quantizer_ = nullptr;
   }
 #endif
+
+  memory::Release(place_);
 }
 
 std::unique_ptr<PaddlePredictor> AnalysisPredictor::Clone() {
@@ -1142,6 +1151,8 @@ void Predictor::ClearIntermediateTensor() {
   predictor_->ClearIntermediateTensor();
 }
 
+uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); }
+
 int GetNumBytesOfDataType(DataType dtype) {
   switch (dtype) {
     case DataType::FLOAT32:
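
The scope_.reset call in the first hunk attaches a custom deleter so that destroying the scope also returns unoccupied chunks of the memory pool for place_ to the system, and TryShrinkMemory exposes the same memory::Release path on demand. A generic, self-contained sketch of that shared_ptr custom-deleter pattern, with hypothetical types standing in for the Paddle ones:

#include <cstdio>
#include <memory>

struct Workspace { /* owns tensors, buffers, etc. */ };

// Hypothetical stand-in for paddle::memory::Release(place_).
void ReleaseCachedDeviceMemory() { std::puts("unoccupied pool chunks released"); }

int main() {
  std::shared_ptr<Workspace> ws(new Workspace(), [](Workspace* p) {
    delete p;                     // destroy the owned objects first
    ReleaseCachedDeviceMemory();  // then shrink the allocator's pool
  });
  ws.reset();  // triggers the custom deleter
}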

paddle/fluid/inference/api/analysis_predictor.h

Lines changed: 11 additions & 0 deletions
@@ -193,6 +193,17 @@ class AnalysisPredictor : public PaddlePredictor {
   ///
   void ClearIntermediateTensor();
 
+  ///
+  /// \brief Release all tmp tensor to compress the size of the memory pool.
+  /// The memory pool is considered to be composed of a list of chunks, if
+  /// the chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  uint64_t TryShrinkMemory() override;
+
   ///
   /// \brief Get the argument used by predictor
   ///

paddle/fluid/inference/api/analysis_predictor_tester.cc

Lines changed: 44 additions & 2 deletions
@@ -135,6 +135,7 @@ TEST(AnalysisPredictor, ZeroCopy) {
   auto* out_data = out->data<float>(&place, &size);
   LOG(INFO) << "output size: " << size / sizeof(float);
   LOG(INFO) << "output_data: " << out_data;
+  predictor->TryShrinkMemory();
 }
 
 TEST(AnalysisPredictor, Clone) {
@@ -253,8 +254,7 @@ class MkldnnQuantizerTest : public testing::Test {
  public:
   MkldnnQuantizerTest() {
     AnalysisConfig config(FLAGS_dirname);
-
-    predictor.reset(new AnalysisPredictor(config));
+    predictor = std::move(CreatePaddlePredictor(config));
     auto* predictor_p = static_cast<AnalysisPredictor*>(predictor.get());
 
     auto qconfig = new MkldnnQuantizerConfig();
@@ -507,3 +507,45 @@ TEST(AnalysisPredictor, bf16_pass_strategy) {
 }
 
 }  // namespace paddle
+
+namespace paddle_infer {
+
+TEST(Predictor, Run) {
+  Config config;
+  config.SetModel(FLAGS_dirname);
+
+  auto predictor = CreatePredictor(config);
+
+  auto w0 = predictor->GetInputHandle("firstw");
+  auto w1 = predictor->GetInputHandle("secondw");
+  auto w2 = predictor->GetInputHandle("thirdw");
+  auto w3 = predictor->GetInputHandle("forthw");
+
+  w0->Reshape({4, 1});
+  w1->Reshape({4, 1});
+  w2->Reshape({4, 1});
+  w3->Reshape({4, 1});
+
+  auto* w0_data = w0->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w1_data = w1->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w2_data = w2->mutable_data<int64_t>(PlaceType::kCPU);
+  auto* w3_data = w3->mutable_data<int64_t>(PlaceType::kCPU);
+
+  for (int i = 0; i < 4; i++) {
+    w0_data[i] = i;
+    w1_data[i] = i;
+    w2_data[i] = i;
+    w3_data[i] = i;
+  }
+
+  predictor->Run();
+
+  auto out = predictor->GetOutputHandle("fc_1.tmp_2");
+  PlaceType place;
+  int size = 0;
+  out->data<float>(&place, &size);
+  LOG(INFO) << "output size: " << size / sizeof(float);
+  predictor->TryShrinkMemory();
+}
+
+}  // namespace paddle_infer

paddle/fluid/inference/api/api_tester.cc

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ TEST(paddle_inference_api, demo) {
   auto predictor = CreatePaddlePredictor(config);
   std::vector<PaddleTensor> outputs;
   predictor->Run({}, &outputs);
+  predictor->TryShrinkMemory();
 }
 
 TEST(paddle_inference_api, get_version) {

paddle/fluid/inference/api/paddle_api.h

Lines changed: 11 additions & 0 deletions
@@ -319,6 +319,17 @@ class PD_INFER_DECL PaddlePredictor {
   ///
   virtual void ClearIntermediateTensor() {}
 
+  ///
+  /// \brief Release all tmp tensor to compress the size of the memory pool.
+  /// The memory pool is considered to be composed of a list of chunks, if
+  /// the chunk is not occupied, it can be released.
+  ///
+  /// \return Number of bytes released. It may be smaller than the actual
+  /// released memory, because part of the memory is not managed by the
+  /// MemoryPool.
+  ///
+  virtual uint64_t TryShrinkMemory() { return 0; }
+
   /// \brief Clone an existing predictor
   /// When using clone, the same network will be created,
   /// and the parameters between them are shared.
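
Since the base-class default above returns 0 when a backend does not override TryShrinkMemory, callers can invoke it unconditionally after a run, as the new tests do. A condensed usage sketch with the paddle_infer API exercised in analysis_predictor_tester.cc (the model path and omitted input handling are placeholders):

paddle_infer::Config config;
config.SetModel("/path/to/model");  // placeholder path
auto predictor = paddle_infer::CreatePredictor(config);
// ... fill inputs via GetInputHandle(...), then:
predictor->Run();
uint64_t released = predictor->TryShrinkMemory();  // bytes returned to the system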
