Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions paddle/fluid/inference/api/analysis_config.cc
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
// MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
CP_MEMBER(mkldnn_thread_id_);
// Quantization related.
CP_MEMBER(use_mkldnn_quantizer_);
CP_MEMBER(mkldnn_quantizer_config_);
Expand Down Expand Up @@ -161,6 +162,15 @@ void AnalysisConfig::EnableMKLDNN() {
Update();
}

// Sets the MKLDNN thread id used to key per-thread primitive caches.
// When the library is built without MKLDNN support, the call is a no-op
// apart from an error log, and the id is reset to the default (0).
void AnalysisConfig::SetMKLDNNThreadId(int id) {
#ifdef PADDLE_WITH_MKLDNN
  mkldnn_thread_id_ = id;
#else
  // Silence the unused-parameter warning on non-MKLDNN builds.
  (void)id;
  LOG(ERROR) << "Please compile with MKLDNN first to set MKLDNN Thread Id";
  mkldnn_thread_id_ = 0;
#endif
}

void AnalysisConfig::EnableMkldnnQuantizer() {
#ifdef PADDLE_WITH_MKLDNN
if (!mkldnn_quantizer_config_)
Expand Down
39 changes: 37 additions & 2 deletions paddle/fluid/inference/api/analysis_predictor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,16 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
std::vector<PaddleTensor> *output_data,
int batch_size) {
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
#ifdef PADDLE_WITH_MKLDNN
// TODO(intel): will refactor this code later
// Make sure this does not conflict with the AnalysisConfig::SetMKLDNNThreadId() case
VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id="
<< paddle::platform::get_cur_thread_id()
<< ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
if (paddle::platform::get_cur_thread_id() == 0)
paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_);
#endif

VLOG(3) << "Predictor::predict";
inference::Timer timer;
timer.tic();
Expand Down Expand Up @@ -238,7 +248,15 @@ bool AnalysisPredictor::Run(const std::vector<PaddleTensor> &inputs,
// recover the cpu_math_library_num_threads to 1, in order to avoid thread
// conflict when integrating it into deployment service.
paddle::platform::SetNumThreads(1);

#ifdef PADDLE_WITH_MKLDNN
// TODO(intel): will refactor this code later
// reset thread id to avoid confusion when thread is reused from pool again
// mkldnn_thread_id_ = -1 is reserved for cache clearing mode only
if (paddle::platform::get_cur_thread_id() == -1) {
VLOG(2) << "Clear previous mkldnn thread id -1\n";
paddle::platform::set_cur_thread_id(0);
}
#endif
return true;
}

Expand Down Expand Up @@ -595,6 +613,15 @@ std::unique_ptr<ZeroCopyTensor> AnalysisPredictor::GetOutputTensor(

bool AnalysisPredictor::ZeroCopyRun() {
paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads());
#ifdef PADDLE_WITH_MKLDNN
// TODO(intel): will refactor this code later
// Make sure this does not conflict with the AnalysisConfig::SetMKLDNNThreadId() case
VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id="
<< paddle::platform::get_cur_thread_id()
<< ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n";
if (paddle::platform::get_cur_thread_id() == 0)
paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_);
#endif
executor_->Run();
// Fix TensorArray reuse not cleaned bug.
tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_);
Expand All @@ -603,7 +630,15 @@ bool AnalysisPredictor::ZeroCopyRun() {
// recover the cpu_math_library_num_threads to 1, in order to avoid thread
// conflict when integrating it into deployment service.
paddle::platform::SetNumThreads(1);

#ifdef PADDLE_WITH_MKLDNN
// TODO(intel): will refactor this code later
// reset thread id to avoid confusion when thread is reused from pool again
// mkldnn_thread_id_ = -1 is reserved for cache clearing mode only
if (paddle::platform::get_cur_thread_id() == -1) {
VLOG(2) << "Clear previous mkldnn thread id setting\n";
paddle::platform::set_cur_thread_id(0);
}
#endif
return true;
}

Expand Down
4 changes: 4 additions & 0 deletions paddle/fluid/inference/api/paddle_analysis_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,9 @@ struct AnalysisConfig {
/** A boolean state telling whether to use the MKLDNN.
*/
bool mkldnn_enabled() const { return use_mkldnn_; }
/** Set MKLDNN thread id.
*/
void SetMKLDNNThreadId(int id);

/** Set and get the number of cpu math library threads.
*/
Expand Down Expand Up @@ -287,6 +290,7 @@ struct AnalysisConfig {
bool use_ngraph_{false};
bool use_mkldnn_{false};
std::unordered_set<std::string> mkldnn_enabled_op_types_;
int mkldnn_thread_id_{0};

bool model_from_memory_{false};

Expand Down
3 changes: 3 additions & 0 deletions paddle/fluid/inference/tests/api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR})
endif()
inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc)

# detect
inference_analysis_api_test_with_refer_result(test_analyzer_detect ${OCR_INSTALL_DIR} analyzer_detect_tester.cc)

### Image classification tests with fake data
set(IMG_CLASS_TEST_APP "test_analyzer_image_classification")
set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc")
Expand Down
150 changes: 150 additions & 0 deletions paddle/fluid/inference/tests/api/analyzer_detect_tester.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <gtest/gtest.h>
#include <fstream>
#include <iostream>
#include "paddle/fluid/inference/tests/api/tester_helper.h"
DEFINE_string(infer_shape, "", "data shape file");

namespace paddle {
namespace inference {
namespace analysis {

// One input sample: a flattened float tensor plus its shape.
struct Record {
  std::vector<float> data;     // flattened input values, row-major
  std::vector<int32_t> shape;  // tensor dims; presumably 4-D NCHW — TODO confirm
};

// Parses one space-separated line of float data and one space-separated
// line of int32 shape values into a Record.
Record ProcessALine(const std::string &line, const std::string &shape_line) {
  VLOG(3) << "process a line";

  Record record;
  std::vector<std::string> data_strs;
  split(line, ' ', &data_strs);
  record.data.reserve(data_strs.size());
  for (auto &d : data_strs) {
    record.data.push_back(std::stof(d));
  }

  std::vector<std::string> shape_strs;
  split(shape_line, ' ', &shape_strs);
  record.shape.reserve(shape_strs.size());
  for (auto &s : shape_strs) {
    record.shape.push_back(std::stoi(s));
  }
  // Debug print of the last dim of an expected 4-D shape; guarded so a
  // malformed shape line cannot trigger an out-of-bounds read.
  if (record.shape.size() > 3) {
    VLOG(2) << "data shape size " << record.shape[3];
  }
  return record;
}

// Builds an AnalysisConfig for the detection model: CPU-only execution with
// the requested number of math-library threads, IR debugging on, and input
// names taken from feed order rather than specified explicitly.
void SetConfig(AnalysisConfig *cfg) {
  cfg->DisableGpu();
  cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params");
  cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads);
  cfg->SwitchIrDebug();
  cfg->SwitchSpecifyInputNames(false);
}

// Reads (data line, shape line) pairs from --infer_data / --infer_shape and
// converts each pair into a single-tensor input batch appended to *inputs.
// In --test_all_data mode up to 1000 samples are read; otherwise one.
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
  std::ifstream file(FLAGS_infer_data);
  std::ifstream infer_file(FLAGS_infer_shape);
  std::string line;
  std::string shape_line;

  const int iteration = FLAGS_test_all_data ? 1000 : 1;
  for (int k = 0; k < iteration; k++) {
    // Stop early when either file runs out of records instead of feeding
    // empty strings to std::stof/std::stoi (which would throw).
    if (!std::getline(file, line) || !std::getline(infer_file, shape_line)) {
      break;
    }
    auto record = ProcessALine(line, shape_line);

    PaddleTensor input;
    input.shape = record.shape;
    input.dtype = PaddleDType::FLOAT32;
    const size_t input_size = record.data.size() * sizeof(float);
    input.data.Resize(input_size);
    memcpy(input.data.data(), record.data.data(), input_size);

    inputs->emplace_back(std::vector<PaddleTensor>{input});
  }
}

// Easy for profiling independently.
// ocr, mobilenet and se_resnext50
void profile(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
}
// cfg.pass_builder()->TurnOnDebug();
std::vector<std::vector<PaddleTensor>> outputs;

std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
TestPrediction(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
input_slots_all, &outputs, FLAGS_num_threads);
}

// Plain CPU profiling run.
// NOTE(review): suite name "Analyzer_vis" looks copied from
// analyzer_vis_tester.cc; consider renaming to "Analyzer_detect".
TEST(Analyzer_vis, profile) { profile(); }

#ifdef PADDLE_WITH_MKLDNN
// Same profiling run with MKLDNN kernels enabled.
TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); }
#endif

// Check the fuse status: builds a predictor and queries its fuse statistics.
// The statistics are only collected/logged here, not asserted on.
TEST(Analyzer_vis, fuse_statis) {
  AnalysisConfig cfg;
  SetConfig(&cfg);
  int num_ops;  // filled by GetFuseStatis with the total operator count
  auto predictor = CreatePaddlePredictor<AnalysisConfig>(cfg);
  GetFuseStatis(predictor.get(), &num_ops);
}

// Compare result of NativeConfig and AnalysisConfig
void compare(bool use_mkldnn = false) {
AnalysisConfig cfg;
SetConfig(&cfg);
if (use_mkldnn) {
cfg.EnableMKLDNN();
cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
}

std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInput(&input_slots_all);
CompareNativeAndAnalysis(
reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
}

// Native-vs-analysis comparison on the default CPU path.
TEST(Analyzer_vis, compare) { compare(); }
#ifdef PADDLE_WITH_MKLDNN
// Same comparison with MKLDNN kernels enabled.
TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); }
#endif

// Compare Deterministic result: runs the analysis predictor repeatedly on
// the same inputs and verifies the outputs do not vary between runs.
TEST(Analyzer_vis, compare_determine) {
  AnalysisConfig cfg;
  SetConfig(&cfg);

  std::vector<std::vector<PaddleTensor>> input_slots_all;
  SetInput(&input_slots_all);
  CompareDeterministic(reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
                       input_slots_all);
}

} // namespace analysis
} // namespace inference
} // namespace paddle
7 changes: 7 additions & 0 deletions paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,13 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
platform::MKLDNNHandler::AppendKey(&key,
std::to_string(multi_input[0]->format()));
if (platform::get_cur_thread_id() != -1) {
auto tid = std::this_thread::get_id();
std::stringstream ss;
ss << tid;
platform::MKLDNNHandler::AppendKey(&key, "-t:");
platform::MKLDNNHandler::AppendKey(&key, ss.str());
}
return key;
}

Expand Down
8 changes: 5 additions & 3 deletions paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
user_weights_memory_p, pipeline, is_test);

std::shared_ptr<mkldnn::memory> dst_memory_p;
std::shared_ptr<mkldnn::memory> user_residual_memory_p;

if (fuse_residual_conn) {
auto residual_param = ctx.Input<Tensor>("ResidualData");
Expand All @@ -243,7 +244,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {

auto user_residual_md = platform::MKLDNNMemDesc(
residual_data_tz, residual_data_type, residual_param->format());
auto user_residual_memory_p = handler.AcquireResidualDataMemory(
user_residual_memory_p = handler.AcquireResidualDataMemory(
user_residual_md, to_void_cast<T>(residual_param_data));

dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory(
Expand All @@ -263,14 +264,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {

// create convolution op primitive
std::shared_ptr<mkldnn::convolution_forward> conv_p;
std::shared_ptr<mkldnn::memory> user_bias_memory_p, bias_memory_p;
if (bias) {
const T* bias_data = bias->data<T>();
auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<T>(), memory::format::x);
auto user_bias_memory_p =
user_bias_memory_p =
handler.AcquireBiasMemory(user_bias_md, to_void_cast<T>(bias_data));

auto bias_memory_p =
bias_memory_p =
handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline);
conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p,
bias_memory_p, dst_memory_p);
Expand Down
20 changes: 15 additions & 5 deletions paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,13 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx,
platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt));
platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt));
platform::MKLDNNHandler::AppendKey(&key, suffix);
if (platform::get_cur_thread_id() != -1) {
auto tid = std::this_thread::get_id();
std::stringstream ss;
ss << tid;
platform::MKLDNNHandler::AppendKey(&key, "-t:");
platform::MKLDNNHandler::AppendKey(&key, ss.str());
}
return key;
}

Expand Down Expand Up @@ -128,6 +135,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";

std::shared_ptr<mkldnn::memory> src_memory, dst_memory;
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd;
std::shared_ptr<mkldnn::memory> pool_src_memory_p, pool_dst_memory_p;
auto pool_p =
std::static_pointer_cast<pooling_forward>(dev_ctx.GetBlob(key_pool_p));
if (pool_p == nullptr) {
Expand All @@ -150,17 +160,17 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto propagation = src_md.data.data_type == mkldnn_f32
? mkldnn::prop_kind::forward_training
: mkldnn::prop_kind::forward_scoring;
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
pool_pd =
CreatePrimitiveDesc(src_md, dst_md, propagation, strides,
padding_left_top, padding_right_bottom, ksize,
pooling_type, mkldnn_engine, ceil_mode, is_test);

// save pool_pd into global device context to be referred in backward path
if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd);

auto src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
to_void_cast<T>(input_data));
auto dst_memory =
src_memory = std::make_shared<memory>(pool_pd->src_primitive_desc(),
to_void_cast<T>(input_data));
dst_memory =
std::make_shared<memory>(pool_pd->dst_primitive_desc(), output_data);

dev_ctx.SetBlob(key_pool_src_mem_p, src_memory);
Expand All @@ -186,7 +196,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
(memory::format)dst_memory->get_primitive_desc().desc().data.format;
} else {
// Primitives already exist
auto pool_src_memory_p =
pool_src_memory_p =
std::static_pointer_cast<memory>(dev_ctx.GetBlob(key_pool_src_mem_p));
PADDLE_ENFORCE(pool_src_memory_p != nullptr,
"Fail to find pooling src mem_p in device context");
Expand Down
Loading