PaddlePaddle · luotao1 · Jun 25, 2019 · Jun 26, 2019 · Jun 26, 2019 · Jun 27, 2019
diff --git a/README.md b/README.md
@@ -18,17 +18,17 @@ learning to many products at Baidu.
 Our vision is to enable deep learning for everyone via PaddlePaddle.
 Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle.
 
-### Latest PaddlePaddle Release: [Fluid 1.4.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.4)
+### Latest PaddlePaddle Release: [Fluid 1.5.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
 ### Install Latest Stable Release:
 ```
 # Linux CPU
 pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
+# Linux GPU cuda10cudnn7
+pip install paddlepaddle-gpu==1.5.0.post107
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.4.1.post87
-# Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.4.1.post85
+pip install paddlepaddle-gpu==1.5.0.post87
 
 # For installation on other platform, refer to http://paddlepaddle.org/
 ```

diff --git a/README_cn.md b/README_cn.md
@@ -16,17 +16,17 @@ PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效
 
 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases)
 
-### PaddlePaddle最新版本: [Fluid 1.4.1](https://github.com/PaddlePaddle/Paddle/tree/release/1.4)
+### PaddlePaddle最新版本: [Fluid 1.5.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.5)
 ### 安装最新稳定版本:
 ```
 # Linux CPU
 pip install paddlepaddle
 # Linux GPU cuda9cudnn7
 pip install paddlepaddle-gpu
+# Linux GPU cuda10cudnn7
+pip install paddlepaddle-gpu==1.5.0.post107
 # Linux GPU cuda8cudnn7
-pip install paddlepaddle-gpu==1.4.1.post87
-# Linux GPU cuda8cudnn5
-pip install paddlepaddle-gpu==1.4.1.post85
+pip install paddlepaddle-gpu==1.5.0.post87
 
 # 其他平台上的安装指引请参考 http://paddlepaddle.org/
 ```

diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake
@@ -31,7 +31,7 @@ IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE)
   SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE)
-  SET(LIBMCT_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${LIBMCT_VER}/${LIBMCT_NAME}.tar.gz" CACHE STRING "" FORCE) 
+  SET(LIBMCT_URL "https://pslib.bj.bcebos.com/libmct.tar.gz" CACHE STRING "" FORCE) 
 ENDIF()
 MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}")
 SET(LIBMCT_SOURCE_DIR    "${THIRD_PARTY_PATH}/libmct")

diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake
@@ -31,7 +31,7 @@ IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(PSLIB_VER "0.1.1" CACHE STRING "" FORCE)
   SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE)
-  SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/ps/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE)
+  SET(PSLIB_URL "https://pslib.bj.bcebos.com/pslib.tar.gz" CACHE STRING "" FORCE)
 ENDIF()
 MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}")
 SET(PSLIB_SOURCE_DIR    "${THIRD_PARTY_PATH}/pslib")

diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake
@@ -31,7 +31,7 @@ IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE)
   SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE)
-  SET(PSLIB_BRPC_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_BRPC_VER}/${PSLIB_BRPC_NAME}.tar.gz" CACHE STRING "" FORCE)
+  SET(PSLIB_BRPC_URL "https://pslib.bj.bcebos.com/pslib_brpc.tar.gz" CACHE STRING "" FORCE)
 ENDIF()
 MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}")
 SET(PSLIB_BRPC_SOURCE_DIR    "${THIRD_PARTY_PATH}/pslib_brpc")

diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h
@@ -119,9 +119,9 @@ class DeviceWorker {
   }
 
  protected:
-  Scope* root_scope_;
+  Scope* root_scope_ = nullptr;
   paddle::platform::Place place_;
-  DataFeed* device_reader_;
+  DataFeed* device_reader_ = nullptr;
   int64_t batch_num_;
   FetchConfig fetch_config_;
   bool use_cvm_;

diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -77,7 +77,9 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
                                      VarQuantScale* scales, bool are_unsigned,
                                      std::string scale_attr_name) const {
   auto inputs = op->inputs;
+  auto output = op->outputs[0];
   PADDLE_ENFORCE_GE(inputs.size(), 1);
+  PADDLE_ENFORCE_EQ(op->outputs.size(), 1);
 
   // create a quantize op desc prototype
   OpDesc q_desc;
@@ -86,13 +88,9 @@ void CPUQuantizePass::QuantizeInputs(Graph* g, Node* op, std::string input_name,
   std::vector<Node*> quantize_out_nodes(inputs.size());
   std::vector<std::string> quantize_out_node_names(inputs.size());
 
-  double scale_min = std::numeric_limits<double>::max();
-  for (const auto& input : inputs) {
-    double scale = (*scales)[input->Name()].second.data<double>()[0];
-    if (scale < scale_min) scale_min = scale;
-  }
+  double scale_out = (*scales)[output->Name()].second.data<double>()[0];
   unsigned max = are_unsigned ? U8_MAX : S8_MAX;
-  float scale = scale_min * max;
+  float scale = scale_out * max;
 
   for (size_t i = 0; i < inputs.size(); i++) {
     // Create quantize output variable

diff --git a/paddle/fluid/framework/op_proto_maker.cc b/paddle/fluid/framework/op_proto_maker.cc
@@ -74,7 +74,6 @@ void OpProtoAndCheckerMaker::operator()(proto::OpProto* proto,
                static_cast<int>(OpRole::kBackward),
            static_cast<int>(OpRole::kOptimize) |
                static_cast<int>(OpRole::kLRSched),
-           static_cast<int>(OpRole::kCollective),
            static_cast<int>(OpRole::kNotSpecified)})
       .SetDefault(static_cast<int>(OpRole::kNotSpecified));
   AddAttr<std::vector<std::string>>(OpRoleVarAttrName(),

diff --git a/paddle/fluid/framework/op_proto_maker.h b/paddle/fluid/framework/op_proto_maker.h
@@ -34,9 +34,6 @@ enum class OpRole {
   kDist = 0x0008,
   // Tag all learning rate scheduler operators.
   kLRSched = 0x0010,
-  // Collective role is for all collective operators and other operators used
-  // for collective training
-  kCollective = 0x0020,
 
   kLoss = 0x0100,
   // The default value of op's role. This should be only used for unittests and

diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -185,14 +185,6 @@ bool AnalysisPredictor::PrepareExecutor() {
   return true;
 }
 
-void AnalysisPredictor::SetMkldnnThreadID(int tid) {
-#ifdef PADDLE_WITH_MKLDNN
-  platform::set_cur_thread_id(tid);
-#else
-  LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN";
-#endif
-}
-
 bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
                             std::vector<PaddleTensor> *output_data,
                             int batch_size) {

diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h
@@ -80,8 +80,6 @@ class AnalysisPredictor : public PaddlePredictor {
   framework::Scope *scope() { return scope_.get(); }
   framework::ProgramDesc &program() { return *inference_program_; }
 
-  void SetMkldnnThreadID(int tid);
-
   std::string GetSerializedProgram() const override;
 
   bool MkldnnQuantize();

diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -14,6 +14,7 @@
 
 #include "paddle/fluid/inference/api/mkldnn_quantizer.h"
 #include <algorithm>
+#include <limits>
 #include <map>
 #include <numeric>
 #include <unordered_map>
@@ -37,6 +38,7 @@ using framework::ir::Graph;
 using ConstEigenVectorArrayMap =
     Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
 using string::PrettyLogH1;
+static LoDTensor CreateScaleTensor(int64_t channels_num = 1);
 
 bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
   PrettyLogH1("--- Calculating scales for quantization");
@@ -52,7 +54,7 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
         for (auto const& conn : connections) {
           for (const auto& var_name : conn.second) {
             // skip if scale already computed
-            if (scales_.find(var_name) != scales_.end()) return;
+            if (scales_.find(var_name) != scales_.end()) continue;
 
             auto* var = predictor_.sub_scope_->FindVar(var_name);
             PADDLE_ENFORCE(var, "%s is not in the scope", var_name);
@@ -62,29 +64,49 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
 
             // force unsigned type if already know it
             bool is_unsigned = false;
-            if (is_output && op->Type() == "conv2d") {
-              // output of conv2d with relu must be unsigned
-              is_unsigned = (op->HasAttr("fuse_relu") &&
-                             boost::get<bool>(op->GetAttr("fuse_relu"))) ||
-                            (op->HasAttr("fuse_brelu") &&
-                             boost::get<bool>(op->GetAttr("fuse_brelu")));
-            } else if (is_output && op->Type() == "relu") {
-              is_unsigned = true;
-            } else if (is_output &&
-                       (op->Type() == "pool2d" || op->Type() == "transpose2" ||
-                        op->Type() == "reshape2" || op->Type() == "concat")) {
-              // output of ops with unsigned input must be unsigned
-              is_unsigned = true;
-              for (auto input_var_name : op->Input("X")) {
+            bool compute_scale = true;
+            if (is_output) {
+              if (op->Type() == "conv2d") {
+                // output of conv2d with relu must be unsigned
+                is_unsigned = (op->HasAttr("fuse_relu") &&
+                               boost::get<bool>(op->GetAttr("fuse_relu"))) ||
+                              (op->HasAttr("fuse_brelu") &&
+                               boost::get<bool>(op->GetAttr("fuse_brelu")));
+              } else if (op->Type() == "relu") {
+                is_unsigned = true;
+              } else if (op->Type() == "transpose2" ||
+                         op->Type() == "reshape2" || op->Type() == "pool2d") {
+                auto input_var_name = op->Input("X")[0];
                 PADDLE_ENFORCE(scales_.find(input_var_name) != scales_.end(),
                                "Input scales must be calculated before the "
                                "output scales to infer if output is unsigned.");
-                is_unsigned = is_unsigned && scales_[input_var_name].first;
+                if (scales_.find(input_var_name) != scales_.end()) {
+                  scales_[var_name] = scales_[input_var_name];
+                }
+                compute_scale = false;
+              } else if (op->Type() == "concat") {
+                // output of ops with unsigned input must be unsigned
+                is_unsigned = true;
+                double min_scale = std::numeric_limits<double>::max();
+                for (auto input_var_name : op->Input("X")) {
+                  PADDLE_ENFORCE(
+                      scales_.find(input_var_name) != scales_.end(),
+                      "Input scales must be calculated before the "
+                      "output scales to infer if output is unsigned.");
+                  is_unsigned = is_unsigned && scales_[input_var_name].first;
+                  min_scale = std::min(
+                      min_scale,
+                      scales_[input_var_name].second.data<double>()[0]);
+                }
+                auto scale_tensor = CreateScaleTensor();
+                scale_tensor.data<double>()[0] = min_scale;
+                scales_[var_name] = {is_unsigned, scale_tensor};
+                compute_scale = false;
               }
             }
-
-            CalculateSingleScale(op->Type(), conn.first, var_name, *var_tensor,
-                                 is_unsigned);
+            if (compute_scale)
+              CalculateSingleScale(op->Type(), conn.first, var_name,
+                                   *var_tensor, is_unsigned);
           }
         }
       };
@@ -127,6 +149,13 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
   }
 }
 
+// Builds a 1-D LoDTensor of `channels_num` doubles with CPU storage allocated.
+// The element values are not written here; callers fill them in afterwards.
+static LoDTensor CreateScaleTensor(int64_t channels_num) {
+  LoDTensor result;
+  result.Resize({channels_num});
+  result.mutable_data<double>(CPUPlace());
+  return result;
+}
+
 std::vector<int> AnalysisPredictor::MkldnnQuantizer::ExpandQuantizedBins(
     std::vector<int> quantized_bins, std::vector<int> reference_bins) const {
   std::vector<int> expanded_quantized_bins(reference_bins.size(), 0);
@@ -263,11 +292,8 @@ AnalysisPredictor::MkldnnQuantizer::GetKLScalingFactor(
     min_kl_index = starting_iter;
   }
 
-  LoDTensor scale_tensor;
-  scale_tensor.Resize({1});
-  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
-
-  scale_ptr[0] = 1.0 / ((min_kl_index + 0.5) * bin_width);
+  LoDTensor scale_tensor = CreateScaleTensor();
+  scale_tensor.data<double>()[0] = 1.0 / ((min_kl_index + 0.5) * bin_width);
 
   return std::make_pair(is_unsigned, scale_tensor);
 }
@@ -285,10 +311,8 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
         "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
         min_val);
 
-  LoDTensor scale_tensor;
-  scale_tensor.Resize({1});
-  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
-  scale_ptr[0] = 1.0 / max_abs;
+  LoDTensor scale_tensor = CreateScaleTensor();
+  scale_tensor.data<double>()[0] = 1.0 / max_abs;
 
   return std::make_pair(is_unsigned, scale_tensor);
 }
@@ -308,8 +332,7 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
         min_val);
 
   int channels = var_tensor.dims()[0];
-  LoDTensor scale_tensor;
-  scale_tensor.Resize({channels});
+  LoDTensor scale_tensor = CreateScaleTensor(channels);
   auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
 
   for (int i = 0; i < channels; ++i) {

diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt
@@ -155,6 +155,9 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
 endif()
 inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)
 
+# detect
+inference_analysis_api_test_with_refer_result(test_analyzer_detect ${OCR_INSTALL_DIR} analyzer_detect_tester.cc)
+
 ### Image classification tests with fake data
 set(IMG_CLASS_TEST_APP "test_analyzer_image_classification")
 set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc")

diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#include "paddle/fluid/framework/transfer_scope_cache.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 
 namespace paddle {
@@ -228,5 +229,53 @@ TEST(Analyzer_bert, compare_determine) {
   CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                        inputs);
 }
+
+#ifdef PADDLE_WITH_MKLDNN
+// Runs `threads_num` single-line inferences on one shared predictor, each on
+// a freshly spawned thread that is joined before the next one starts, and
+// records the address of framework::global_transfer_scope_cache() observed
+// on every thread.
+//
+// When `enable_mkldnn_cache_clear_mode` is true each worker first calls
+// platform::set_cur_thread_id(-1) and exactly one distinct cache address is
+// expected; otherwise `threads_num` distinct addresses are expected.
+void test_mkldnn_cache_clear_mode(bool enable_mkldnn_cache_clear_mode) {
+  AnalysisConfig config;
+  SetConfig(&config);
+  config.EnableMKLDNN();
+
+  std::vector<PaddleTensor> input, output;
+  auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
+
+  int threads_num = 10;
+  std::vector<std::thread> threads;
+  // Distinct cache addresses seen across all worker threads.
+  std::unordered_set<std::unordered_set<paddle::framework::Scope *> *>
+      global_transfer_scope_cache;
+
+  std::ifstream fin(FLAGS_infer_data);
+  std::string line;
+
+  for (int i = 0; i < threads_num; i++) {
+    threads.emplace_back([&, i]() {
+      // TODO(luotao, intel): Will remove platform::set_cur_thread_id after
+      // enhancing the interface of config.EnableMKLDNN();
+      if (enable_mkldnn_cache_clear_mode) platform::set_cur_thread_id(-1);
+      std::getline(fin, line);
+      ParseLine(line, &input);
+      predictor->Run(input, &output, FLAGS_batch_size);
+      global_transfer_scope_cache.insert(
+          &paddle::framework::global_transfer_scope_cache());
+      LOG(INFO) << &paddle::framework::global_transfer_scope_cache();
+    });
+    // Join right away: the lambda captures the shared locals by reference,
+    // so the workers run strictly one at a time.
+    threads[0].join();
+    threads.clear();
+    std::vector<PaddleTensor>().swap(input);
+  }
+  // PADDLE_ENFORCE treats its first argument as the condition, so the count
+  // has to be compared through PADDLE_ENFORCE_EQ to actually be checked.
+  if (enable_mkldnn_cache_clear_mode) {
+    PADDLE_ENFORCE_EQ(global_transfer_scope_cache.size(), 1);
+  } else {
+    PADDLE_ENFORCE_EQ(global_transfer_scope_cache.size(), threads_num);
+  }
+}
+
+// Runs the transfer-scope-cache check twice: first with the MKLDNN
+// cache-clear mode enabled, then with it disabled.
+TEST(Analyzer_bert, multi_instance_mkldnn_memory_leak) {
+  test_mkldnn_cache_clear_mode(true);
+  test_mkldnn_cache_clear_mode(false);
+}
+#endif
+
 }  // namespace inference
 }  // namespace paddle