PaddlePaddle
diff --git a/‎cmake/external/gloo.cmake‎
Lines changed: 30 additions & 44 deletions b/‎cmake/external/gloo.cmake‎
Lines changed: 30 additions & 44 deletions
diff --git a/‎paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc‎
Lines changed: 0 additions & 13 deletions b/‎paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc‎
Lines changed: 0 additions & 13 deletions
diff --git a/‎paddle/fluid/framework/tensor_util.cc‎
Lines changed: 12 additions & 0 deletions b/‎paddle/fluid/framework/tensor_util.cc‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎paddle/fluid/inference/tests/api/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎paddle/fluid/inference/tests/api/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎paddle/fluid/inference/tests/api/tester_helper.h‎
Lines changed: 17 additions & 15 deletions b/‎paddle/fluid/inference/tests/api/tester_helper.h‎
Lines changed: 17 additions & 15 deletions
@@ -14,55 +14,41 @@
 
 INCLUDE(ExternalProject)
 
-execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
-
 SET(GLOO_PROJECT       "extern_gloo")
-IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
-  MESSAGE(STATUS "use pre defined download url")
-  SET(GLOO_VER "master" CACHE STRING "" FORCE)
-  SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-
-  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
-    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
-  else()
-    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
-  endif()
-ENDIF()
-
-MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
-SET(GLOO_SOURCE_DIR    "${THIRD_PARTY_PATH}/gloo")
-SET(GLOO_DOWNLOAD_DIR  "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
-SET(GLOO_DST_DIR       "gloo")
-SET(GLOO_INSTALL_ROOT  "${THIRD_PARTY_PATH}/install")
-SET(GLOO_INSTALL_DIR   ${GLOO_INSTALL_ROOT}/${GLOO_DST_DIR})
-SET(GLOO_ROOT          ${GLOO_INSTALL_DIR})
-SET(GLOO_INC_DIR       ${GLOO_ROOT}/include)
-SET(GLOO_LIB_DIR       ${GLOO_ROOT}/lib)
-SET(GLOO_LIB           ${GLOO_LIB_DIR}/libgloo.a)
-#SET(GLOO_IOMP_LIB      ${GLOO_LIB_DIR}/libiomp5.so) #todo what is this
-SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${GLOO_ROOT}/lib")
-
-INCLUDE_DIRECTORIES(${GLOO_INC_DIR})
-
-FILE(WRITE ${GLOO_DOWNLOAD_DIR}/CMakeLists.txt
-  "PROJECT(GLOO)\n"
-  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${GLOO_NAME}/include ${GLOO_NAME}/lib \n"
-  "        DESTINATION ${GLOO_DST_DIR})\n")
+SET(GLOO_PREFIX_DIR    ${THIRD_PARTY_PATH}/gloo)
+SET(GLOO_SOURCE_DIR    ${THIRD_PARTY_PATH}/gloo/src/extern_gloo/gloo)
+SET(GLOO_INSTALL_DIR   ${THIRD_PARTY_PATH}/install/gloo)
+SET(GLOO_INCLUDE_DIR   "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE)
+SET(GLOO_LIBRARY_DIR   "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE)
+# As we add extra features for gloo, we use the non-official repo
+SET(GLOO_REPOSITORY    https://github.com/sandyhouse/gloo.git)
+SET(GLOO_TAG           v0.0.2)
+SET(GLOO_LIBRARIES     "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library." FORCE)
+
+INCLUDE_DIRECTORIES(${GLOO_INCLUDE_DIR})
+
+cache_third_party(extern_gloo
+    REPOSITORY    ${GLOO_REPOSITORY}
+    TAG           ${GLOO_TAG}
+    DIR           GLOO_SOURCE_DIR)
 
 ExternalProject_Add(
-    ${GLOO_PROJECT}
+    extern_gloo
     ${EXTERNAL_PROJECT_LOG_ARGS}
-    PREFIX                ${GLOO_SOURCE_DIR}
-    DOWNLOAD_DIR          ${GLOO_DOWNLOAD_DIR}
-    DOWNLOAD_COMMAND      wget --no-check-certificate ${GLOO_URL} -c -q -O ${GLOO_NAME}.tar.gz
-                          && tar zxvf ${GLOO_NAME}.tar.gz
-    DOWNLOAD_NO_PROGRESS  1
+    ${SHALLOW_CLONE}
+    "${GLOO_DOWNLOAD_CMD}"
+    PREFIX                "${GLOO_PREFIX_DIR}"
+    SOURCE_DIR            "${GLOO_SOURCE_DIR}"
     UPDATE_COMMAND        ""
-    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${GLOO_INSTALL_ROOT}
-    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GLOO_INSTALL_ROOT}
+    CONFIGURE_COMMAND     ""
+    BUILD_COMMAND         ${CMAKE_COMMAND} -E make_directory ${GLOO_SOURCE_DIR}/build
+        && cd ${GLOO_SOURCE_DIR}/build && ${CMAKE_COMMAND} .. && ${CMAKE_COMMAND} --build .
+        && ${CMAKE_COMMAND} -E make_directory ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
+    INSTALL_COMMAND      ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
+    COMMAND              ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
 )
 
-ADD_LIBRARY(gloo SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIB})
+
+ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
 ADD_DEPENDENCIES(gloo ${GLOO_PROJECT})
@@ -84,19 +84,6 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
       VLOG(3) << "do not perform " + type() + "+bias fuse";
       return;
     }
-    if (conv->Op()->HasAttr("dilations")) {
-      auto dilations =
-          BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
-      for (const auto& d : dilations) {
-        if (d != 1) {
-          LOG(WARNING)
-              << "dilation conv not supported in MKLDNN, fuse not apply "
-              << "and set conv attribute use_mkldnn = false";
-          conv->Op()->SetAttr("use_mkldnn", false);
-          return;
-        }
-      }
-    }
 
     auto* eltwise_bias_tensor =
         scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();
 
@@ -84,6 +84,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
   }
 #endif
 #ifdef PADDLE_WITH_CUDA
+  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
+                 size);
+  }
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
@@ -285,6 +291,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
   }
 #endif
 #ifdef PADDLE_WITH_CUDA
+  else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
+           platform::is_cuda_pinned_place(dst_place)) {
+    memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
+                 BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
+                 size);
+  }
   else if (platform::is_cuda_pinned_place(src_place) &&  // NOLINT
            platform::is_cpu_place(dst_place)) {
     memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
 
@@ -66,6 +66,7 @@ function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir
              --infer_data=${data_path}
              --warmup_batch_size=${WARMUP_BATCH_SIZE}
              --batch_size=50
+             --enable_int8=true
              --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
 	     --iterations=2)
 endfunction()
@@ -81,6 +82,7 @@ function(inference_analysis_api_bfloat16_test_run TARGET_NAME test_binary model_
         ARGS --infer_model=${model_dir}/model
              --infer_data=${data_path}
              --batch_size=50
+             --enable_bf16=true
              --paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
 	     --iterations=2)
 endfunction()
@@ -92,6 +94,7 @@ function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_bi
              --infer_data=${data_path}
              --warmup_batch_size=10
              --batch_size=300
+             --enable_int8=true
              --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
 	     --iterations=1)
 endfunction()
@@ -115,6 +118,7 @@ function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_mode
              --int8_model=${int8_model_dir}
              --infer_data=${data_path}
              --batch_size=50
+             --enable_int8=true
              --cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
              --with_accuracy_layer=false
              --iterations=2)
 
@@ -50,8 +50,8 @@ DEFINE_bool(ernie_large, false, "Test ernie large");
 DEFINE_bool(with_accuracy_layer, true,
             "Calculate the accuracy while label is in the input");
 DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction");
-DEFINE_bool(enable_bf16, true, "Enable BF16 type prediction");
-DEFINE_bool(enable_int8, true, "Enable INT8 type prediction");
+DEFINE_bool(enable_bf16, false, "Enable BF16 type prediction");
+DEFINE_bool(enable_int8, false, "Enable INT8 type prediction");
 DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
 // setting iterations to 0 means processing the whole dataset
 DEFINE_int32(iterations, 0, "number of batches to process");
@@ -639,8 +639,9 @@ void TestPrediction(const PaddlePredictor::Config *config,
   }
 }
 
-void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8,
-                       int compared_idx) {
+void SummarizeAccuracy(float avg_acc_ref, float avg_acc, int compared_idx) {
+  std::string data_type_name = "INT8";
+  if (FLAGS_enable_bf16) data_type_name = "BF16";
   PADDLE_ENFORCE_LE(
       compared_idx, 2,
       platform::errors::InvalidArgument(
@@ -659,12 +660,12 @@ void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8,
   LOG(INFO) << "--- Accuracy summary --- ";
   LOG(INFO) << "Accepted " << prefix
             << "drop threshold: " << FLAGS_quantized_accuracy
-            << ". (condition: (FP32_" << prefix << " - INT8_" << prefix
-            << ") <= threshold)";
+            << ". (condition: (FP32_" << prefix << " - " << data_type_name
+            << "_" << prefix << ") <= threshold)";
   LOG(INFO) << "FP32: avg " << prefix << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc_fp32;
-  LOG(INFO) << "INT8: avg " << prefix << std::fixed << std::setw(6)
-            << std::setprecision(4) << avg_acc_int8;
+            << std::setprecision(4) << avg_acc_ref;
+  LOG(INFO) << data_type_name << ": avg " << prefix << std::fixed
+            << std::setw(6) << std::setprecision(4) << avg_acc;
 }
 
 void SummarizePerformance(const char *title, float sample) {
@@ -677,8 +678,9 @@ void SummarizePerformance(const char *title, float sample) {
 
 void SummarizePerformance(const char *title_fp32, float sample_latency_fp32,
                           const char *title, float sample_latency) {
-  SummarizePerformance(title_fp32, sample_latency_fp32);
-  SummarizePerformance(title, sample_latency);
+  if (FLAGS_enable_fp32) SummarizePerformance(title_fp32, sample_latency_fp32);
+  if (FLAGS_enable_int8 || FLAGS_enable_bf16)
+    SummarizePerformance(title, sample_latency);
 }
 
 float CompareAccuracyOne(
@@ -733,15 +735,15 @@ void CompareAccuracy(
     const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
     const std::vector<std::vector<PaddleTensor>> &output_slots_ref,
     int compared_idx) {
-  if ((FLAGS_enable_fp32 && FLAGS_enable_int8) &&
-      (output_slots_quant.size() == 0 || output_slots_ref.size()) == 0)
+  if ((FLAGS_enable_fp32 && (FLAGS_enable_int8 || FLAGS_enable_bf16)) &&
+      (output_slots_quant.empty() || output_slots_ref.empty()))
     throw std::invalid_argument(
         "CompareAccuracy: output_slots vector is empty.");
 
   float avg_acc_quant = 0.0;
   float avg_acc_ref = 0.0;
 
-  if (FLAGS_enable_int8)
+  if (FLAGS_enable_int8 || FLAGS_enable_bf16)
     avg_acc_quant = CompareAccuracyOne(output_slots_quant, compared_idx);
 
   if (FLAGS_enable_fp32)
@@ -751,9 +753,9 @@ void CompareAccuracy(
 
   if (FLAGS_enable_fp32) CHECK_GT(avg_acc_ref, 0.0);
 
-  if (FLAGS_enable_int8) CHECK_GT(avg_acc_quant, 0.0);
+  if (FLAGS_enable_int8 || FLAGS_enable_bf16) CHECK_GT(avg_acc_quant, 0.0);
 
-  if (FLAGS_enable_fp32 && FLAGS_enable_int8)
+  if (FLAGS_enable_fp32 && (FLAGS_enable_int8 || FLAGS_enable_bf16))
     CHECK_LE(avg_acc_ref - avg_acc_quant, FLAGS_quantized_accuracy);
 }