diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 890c90697bcd52..b1221984f66b5b 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -114,6 +114,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); + CP_MEMBER(mkldnn_thread_id_); // Quantization related. CP_MEMBER(use_mkldnn_quantizer_); CP_MEMBER(mkldnn_quantizer_config_); @@ -161,6 +162,15 @@ void AnalysisConfig::EnableMKLDNN() { Update(); } +void AnalysisConfig::SetMKLDNNThreadId(int id) { +#ifdef PADDLE_WITH_MKLDNN + mkldnn_thread_id_ = id; +#else + LOG(ERROR) << "Please compile with MKLDNN first to set MKLDNN Thread Id"; + mkldnn_thread_id_ = 0; +#endif +} + void AnalysisConfig::EnableMkldnnQuantizer() { #ifdef PADDLE_WITH_MKLDNN if (!mkldnn_quantizer_config_) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 5d9d5a3178aaa3..e839b6c2c93cf9 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -197,6 +197,16 @@ bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); +#ifdef PADDLE_WITH_MKLDNN + // TODO(intel): will refactor this code later + // Make sure it not conflict with AnalysisPredictor::SetMkldnnthreadid case + VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id=" + << paddle::platform::get_cur_thread_id() + << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n"; + if (paddle::platform::get_cur_thread_id() == 0) + paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_); +#endif + VLOG(3) << "Predictor::predict"; inference::Timer timer; timer.tic(); @@ -238,7 +248,15 @@ bool AnalysisPredictor::Run(const std::vector &inputs, // recover the cpu_math_library_num_threads to 1, in order to avoid thread // conflict when integrating it into deployment service. paddle::platform::SetNumThreads(1); - +#ifdef PADDLE_WITH_MKLDNN + // TODO(intel): will refactor this code later + // reset thread id to avoid confusion when thread is reused from pool again + // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only + if (paddle::platform::get_cur_thread_id() == -1) { + VLOG(2) << "Clear previous mkldnn thread id -1\n"; + paddle::platform::set_cur_thread_id(0); + } +#endif return true; } @@ -595,6 +613,15 @@ std::unique_ptr AnalysisPredictor::GetOutputTensor( bool AnalysisPredictor::ZeroCopyRun() { paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); +#ifdef PADDLE_WITH_MKLDNN + // TODO(intel): will refactor this code later + // Make sure it not conflict with AnalysisPredictor::SetMkldnnthreadid case + VLOG(2) << "AnalysisPredictor::Run get_cur_thread_id=" + << paddle::platform::get_cur_thread_id() + << ", mkldnn_thread_id_=" << config_.mkldnn_thread_id_ << "\n"; + if (paddle::platform::get_cur_thread_id() == 0) + paddle::platform::set_cur_thread_id(config_.mkldnn_thread_id_); +#endif executor_->Run(); // Fix TensorArray reuse not cleaned bug. tensor_array_batch_cleaner_.CollectTensorArrays(sub_scope_); @@ -603,7 +630,15 @@ bool AnalysisPredictor::ZeroCopyRun() { // recover the cpu_math_library_num_threads to 1, in order to avoid thread // conflict when integrating it into deployment service. paddle::platform::SetNumThreads(1); - +#ifdef PADDLE_WITH_MKLDNN + // TODO(intel): will refactor this code later + // reset thread id to avoid confusion when thread is reused from pool again + // mkldnn_thread_id_ = -1 is reserved for cache clearing mode only + if (paddle::platform::get_cur_thread_id() == -1) { + VLOG(2) << "Clear previous mkldnn thread id setting\n"; + paddle::platform::set_cur_thread_id(0); + } +#endif return true; } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e3682d27054a12..43fd321fa27ae6 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -182,6 +182,9 @@ struct AnalysisConfig { /** A boolean state telling whether to use the MKLDNN. */ bool mkldnn_enabled() const { return use_mkldnn_; } + /** Set MKLDNN thread id. + */ + void SetMKLDNNThreadId(int id); /** Set and get the number of cpu math library threads. */ @@ -287,6 +290,7 @@ struct AnalysisConfig { bool use_ngraph_{false}; bool use_mkldnn_{false}; std::unordered_set mkldnn_enabled_op_types_; + int mkldnn_thread_id_{0}; bool model_from_memory_{false}; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 243f5cef00835e..ec33df962e46ee 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -157,6 +157,9 @@ if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) endif() inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) +# detect +inference_analysis_api_test_with_refer_result(test_analyzer_detect ${OCR_INSTALL_DIR} analyzer_detect_tester.cc) + ### Image classification tests with fake data set(IMG_CLASS_TEST_APP "test_analyzer_image_classification") set(IMG_CLASS_TEST_APP_SRC "analyzer_image_classification_tester.cc") diff --git a/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc new file mode 100644 index 00000000000000..d09f1ff81a2189 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_detect_tester.cc @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/inference/tests/api/tester_helper.h" +DEFINE_string(infer_shape, "", "data shape file"); + +namespace paddle { +namespace inference { +namespace analysis { + +struct Record { + std::vector data; + std::vector shape; +}; + +Record ProcessALine(const std::string &line, const std::string &shape_line) { + VLOG(3) << "process a line"; + std::vector columns; + + Record record; + std::vector data_strs; + split(line, ' ', &data_strs); + for (auto &d : data_strs) { + record.data.push_back(std::stof(d)); + } + + std::vector shape_strs; + split(shape_line, ' ', &shape_strs); + for (auto &s : shape_strs) { + record.shape.push_back(std::stoi(s)); + } + // VLOG(3) << "data size " << record.data.size(); + // VLOG(3) << "data shape size " << record.shape.size(); + VLOG(2) << "data shape size " << record.shape[3]; + return record; +} + +void SetConfig(AnalysisConfig *cfg) { + cfg->SetModel(FLAGS_infer_model + "/model", FLAGS_infer_model + "/params"); + cfg->DisableGpu(); + cfg->SwitchIrDebug(); + cfg->SwitchSpecifyInputNames(false); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); +} + +void SetInput(std::vector> *inputs) { + std::string line; + std::ifstream file(FLAGS_infer_data); + std::string shape_line; + std::ifstream infer_file(FLAGS_infer_shape); + + int iteration = FLAGS_test_all_data ? 1000 : 1; + for (int k = 0; k < iteration; k++) { + std::getline(file, line); + std::getline(infer_file, shape_line); + auto record = ProcessALine(line, shape_line); + + PaddleTensor input; + input.shape = record.shape; + input.dtype = PaddleDType::FLOAT32; + size_t input_size = record.data.size() * sizeof(float); + input.data.Resize(input_size); + memcpy(input.data.data(), record.data.data(), input_size); + std::vector input_slots; + input_slots.assign({input}); + (*inputs).emplace_back(input_slots); + } +} + +// Easy for profiling independently. +// ocr, mobilenet and se_resnext50 +void profile(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + // cfg.pass_builder()->TurnOnDebug(); + std::vector> outputs; + + std::vector> input_slots_all; + SetInput(&input_slots_all); + TestPrediction(reinterpret_cast(&cfg), + input_slots_all, &outputs, FLAGS_num_threads); +} + +TEST(Analyzer_vis, profile) { profile(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + +// Check the fuse status +TEST(Analyzer_vis, fuse_statis) { + AnalysisConfig cfg; + SetConfig(&cfg); + int num_ops; + auto predictor = CreatePaddlePredictor(cfg); + GetFuseStatis(predictor.get(), &num_ops); +} + +// Compare result of NativeConfig and AnalysisConfig +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + cfg.pass_builder()->AppendPass("fc_mkldnn_pass"); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +TEST(Analyzer_vis, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_vis, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + +// Compare Deterministic result +TEST(Analyzer_vis, compare_determine) { + AnalysisConfig cfg; + SetConfig(&cfg); + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareDeterministic(reinterpret_cast(&cfg), + input_slots_all); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index a855ba8475a1b0..ac9164a77f893c 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -81,6 +81,13 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx, platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); platform::MKLDNNHandler::AppendKey(&key, std::to_string(multi_input[0]->format())); + if (platform::get_cur_thread_id() != -1) { + auto tid = std::this_thread::get_id(); + std::stringstream ss; + ss << tid; + platform::MKLDNNHandler::AppendKey(&key, "-t:"); + platform::MKLDNNHandler::AppendKey(&key, ss.str()); + } return key; } diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 647e09a92911e3..e20dfb35682752 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -221,6 +221,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { user_weights_memory_p, pipeline, is_test); std::shared_ptr dst_memory_p; + std::shared_ptr user_residual_memory_p; if (fuse_residual_conn) { auto residual_param = ctx.Input("ResidualData"); @@ -243,7 +244,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto user_residual_md = platform::MKLDNNMemDesc( residual_data_tz, residual_data_type, residual_param->format()); - auto user_residual_memory_p = handler.AcquireResidualDataMemory( + user_residual_memory_p = handler.AcquireResidualDataMemory( user_residual_md, to_void_cast(residual_param_data)); dst_memory_p = handler.AcquireDstMemoryFromResidualDataMemory( @@ -263,14 +264,15 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { // create convolution op primitive std::shared_ptr conv_p; + std::shared_ptr user_bias_memory_p, bias_memory_p; if (bias) { const T* bias_data = bias->data(); auto user_bias_md = platform::MKLDNNMemDesc( {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); - auto user_bias_memory_p = + user_bias_memory_p = handler.AcquireBiasMemory(user_bias_md, to_void_cast(bias_data)); - auto bias_memory_p = + bias_memory_p = handler.AcquireBiasMemoryFromPrimitive(user_bias_memory_p, pipeline); conv_p = handler.AcquireConvolution(src_memory_p, weights_memory_p, bias_memory_p, dst_memory_p); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index c635fd11c37aec..5f797f3581ec56 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -48,6 +48,13 @@ std::string CreateKey(const paddle::framework::ExecutionContext& ctx, platform::MKLDNNHandler::AppendKey(&key, std::to_string(dt)); platform::MKLDNNHandler::AppendKey(&key, std::to_string(fmt)); platform::MKLDNNHandler::AppendKey(&key, suffix); + if (platform::get_cur_thread_id() != -1) { + auto tid = std::this_thread::get_id(); + std::stringstream ss; + ss << tid; + platform::MKLDNNHandler::AppendKey(&key, "-t:"); + platform::MKLDNNHandler::AppendKey(&key, ss.str()); + } return key; } @@ -128,6 +135,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { const std::string key_pool_workspace_memory = key + "@pool_workspace_memory"; + std::shared_ptr src_memory, dst_memory; + std::shared_ptr pool_pd; + std::shared_ptr pool_src_memory_p, pool_dst_memory_p; auto pool_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_p)); if (pool_p == nullptr) { @@ -150,7 +160,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto propagation = src_md.data.data_type == mkldnn_f32 ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - std::shared_ptr pool_pd = + pool_pd = CreatePrimitiveDesc(src_md, dst_md, propagation, strides, padding_left_top, padding_right_bottom, ksize, pooling_type, mkldnn_engine, ceil_mode, is_test); @@ -158,9 +168,9 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { // save pool_pd into global device context to be referred in backward path if (!is_test) dev_ctx.SetBlob(key_pool_pd, pool_pd); - auto src_memory = std::make_shared(pool_pd->src_primitive_desc(), - to_void_cast(input_data)); - auto dst_memory = + src_memory = std::make_shared(pool_pd->src_primitive_desc(), + to_void_cast(input_data)); + dst_memory = std::make_shared(pool_pd->dst_primitive_desc(), output_data); dev_ctx.SetBlob(key_pool_src_mem_p, src_memory); @@ -186,7 +196,7 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { (memory::format)dst_memory->get_primitive_desc().desc().data.format; } else { // Primitives already exist - auto pool_src_memory_p = + pool_src_memory_p = std::static_pointer_cast(dev_ctx.GetBlob(key_pool_src_mem_p)); PADDLE_ENFORCE(pool_src_memory_p != nullptr, "Fail to find pooling src mem_p in device context"); diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 4f048d44685a88..41cdc92e20d2a2 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -407,6 +407,7 @@ thread_local int cur_thread_id = 0; void set_cur_thread_id(int tid) { cur_thread_id = tid; } int get_cur_thread_id(void) { return cur_thread_id; } +#define MKLDNN_CAP 10000 void MKLDNNDeviceContext::SetBlob(const std::string& name, std::shared_ptr data) const { @@ -424,19 +425,30 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, // 1st time to set blob in current thread pBlob = std::shared_ptr(new KeyBlob()); (*pMap)[tid] = pBlob; + VLOG(2) << "SetBlob: tid=" << tid << ", add new tid\n"; } else { pBlob = map_it->second; } // Find Key in found (or newly created) KeyBlob - auto key_it = pBlob->find(name); + auto key_it = std::find_if( + pBlob->begin(), pBlob->end(), + [=](std::pair> const& obj) { + return obj.first == name; + }); if (key_it == pBlob->end()) { - (*pBlob)[name] = data; // create new blob + // tid = -1 means cache clearing mode, MKLDNN_CAP defines max blob capacity + if ((tid == -1) && (pBlob->size() > MKLDNN_CAP)) { + VLOG(2) << "SetBlob: tid=" << tid << ", remove head blob " + << pBlob->begin()->first << "\n"; + pBlob->erase(pBlob->begin()); + } + pBlob->push_back(std::make_pair(name, data)); } else { key_it->second = data; // set data to existing blob } - + VLOG(2) << "SetBlob: tid=" << tid << ", add blob=" << name << "\n"; // lock will be automatically released when out of scope return; } @@ -452,14 +464,25 @@ std::shared_ptr MKLDNNDeviceContext::GetBlob( // Find KeyBlob for current thread firstly auto map_it = pMap->find(tid); - if (map_it == pMap->end()) return nullptr; + if (map_it == pMap->end()) { + VLOG(2) << "GetBlob: tid=" << tid << ", miss tid\n"; + return nullptr; + } pBlob = map_it->second; // Find Blob via name - auto key_it = pBlob->find(name); + auto key_it = std::find_if( + pBlob->begin(), pBlob->end(), + [=](std::pair> const& obj) { + return obj.first == name; + }); - if (key_it == pBlob->end()) return nullptr; + if (key_it == pBlob->end()) { + VLOG(2) << "GetBlob tid=" << tid << ", miss blob=" << name << "\n"; + return nullptr; + } + VLOG(2) << "GetBlob tid=" << tid << ", get blob=" << name << "\n"; // lock will be automatically released when out of scope return key_it->second; } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 812181563e6e55..628273a110e3f1 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -378,7 +378,7 @@ struct DefaultDeviceContextType { #endif #ifdef PADDLE_WITH_MKLDNN -using KeyBlob = std::unordered_map>; +using KeyBlob = std::vector>>; using BlobMap = std::unordered_map>; void set_cur_thread_id(int); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index f1fb6b156aedcb..76302d2bc42460 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -38,6 +38,9 @@ class MKLDNNHandler { std::stringstream ss; ss << tid; key_ = key_common_ + "-t:" + ss.str(); + if (platform::get_cur_thread_id() == -1) { + key_ = key_common_; + } } std::shared_ptr AcquireSrcMemory(