Skip to content

Commit 9f45e75

Browse files
committed
2 parents a19950c + b6eff44 commit 9f45e75

File tree

115 files changed

+3379
-6424
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

115 files changed

+3379
-6424
lines changed

cmake/external/gloo.cmake

Lines changed: 30 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -14,55 +14,41 @@
1414

1515
INCLUDE(ExternalProject)
1616

17-
execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
18-
1917
SET(GLOO_PROJECT "extern_gloo")
20-
IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
21-
MESSAGE(STATUS "use pre defined download url")
22-
SET(GLOO_VER "master" CACHE STRING "" FORCE)
23-
SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
24-
25-
if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
26-
SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
27-
else()
28-
SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
29-
endif()
30-
ENDIF()
31-
32-
MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
33-
SET(GLOO_SOURCE_DIR "${THIRD_PARTY_PATH}/gloo")
34-
SET(GLOO_DOWNLOAD_DIR "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
35-
SET(GLOO_DST_DIR "gloo")
36-
SET(GLOO_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
37-
SET(GLOO_INSTALL_DIR ${GLOO_INSTALL_ROOT}/${GLOO_DST_DIR})
38-
SET(GLOO_ROOT ${GLOO_INSTALL_DIR})
39-
SET(GLOO_INC_DIR ${GLOO_ROOT}/include)
40-
SET(GLOO_LIB_DIR ${GLOO_ROOT}/lib)
41-
SET(GLOO_LIB ${GLOO_LIB_DIR}/libgloo.a)
42-
#SET(GLOO_IOMP_LIB ${GLOO_LIB_DIR}/libiomp5.so) #todo what is this
43-
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${GLOO_ROOT}/lib")
44-
45-
INCLUDE_DIRECTORIES(${GLOO_INC_DIR})
46-
47-
FILE(WRITE ${GLOO_DOWNLOAD_DIR}/CMakeLists.txt
48-
"PROJECT(GLOO)\n"
49-
"cmake_minimum_required(VERSION 3.0)\n"
50-
"install(DIRECTORY ${GLOO_NAME}/include ${GLOO_NAME}/lib \n"
51-
" DESTINATION ${GLOO_DST_DIR})\n")
18+
SET(GLOO_PREFIX_DIR ${THIRD_PARTY_PATH}/gloo)
19+
SET(GLOO_SOURCE_DIR ${THIRD_PARTY_PATH}/gloo/src/extern_gloo/gloo)
20+
SET(GLOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gloo)
21+
SET(GLOO_INCLUDE_DIR "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE)
22+
SET(GLOO_LIBRARY_DIR "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE)
23+
# Because we added extra features to gloo, we use a non-official fork of the repository
24+
SET(GLOO_REPOSITORY https://github.com/sandyhouse/gloo.git)
25+
SET(GLOO_TAG v0.0.2)
26+
SET(GLOO_LIBRARIES "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library." FORCE)
27+
28+
INCLUDE_DIRECTORIES(${GLOO_INCLUDE_DIR})
29+
30+
cache_third_party(extern_gloo
31+
REPOSITORY ${GLOO_REPOSITORY}
32+
TAG ${GLOO_TAG}
33+
DIR GLOO_SOURCE_DIR)
5234

5335
ExternalProject_Add(
54-
${GLOO_PROJECT}
36+
extern_gloo
5537
${EXTERNAL_PROJECT_LOG_ARGS}
56-
PREFIX ${GLOO_SOURCE_DIR}
57-
DOWNLOAD_DIR ${GLOO_DOWNLOAD_DIR}
58-
DOWNLOAD_COMMAND wget --no-check-certificate ${GLOO_URL} -c -q -O ${GLOO_NAME}.tar.gz
59-
&& tar zxvf ${GLOO_NAME}.tar.gz
60-
DOWNLOAD_NO_PROGRESS 1
38+
${SHALLOW_CLONE}
39+
"${GLOO_DOWNLOAD_CMD}"
40+
PREFIX "${GLOO_PREFIX_DIR}"
41+
SOURCE_DIR "${GLOO_SOURCE_DIR}"
6142
UPDATE_COMMAND ""
62-
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOO_INSTALL_ROOT}
63-
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOO_INSTALL_ROOT}
43+
CONFIGURE_COMMAND ""
44+
BUILD_COMMAND mkdir -p ${GLOO_SOURCE_DIR}/build
45+
&& cd ${GLOO_SOURCE_DIR}/build && cmake .. && make
46+
&& mkdir -p ${GLOO_LIBRARY_DIR} ${GLOO_INCLUDE_DIR}/gloo
47+
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy ${GLOO_SOURCE_DIR}/build/gloo/libgloo.a ${GLOO_LIBRARY_DIR}
48+
COMMAND ${CMAKE_COMMAND} -E copy_directory "${GLOO_SOURCE_DIR}/gloo/" "${GLOO_INCLUDE_DIR}/gloo"
6449
)
6550

66-
ADD_LIBRARY(gloo SHARED IMPORTED GLOBAL)
67-
SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIB})
51+
52+
ADD_LIBRARY(gloo STATIC IMPORTED GLOBAL)
53+
SET_PROPERTY(TARGET gloo PROPERTY IMPORTED_LOCATION ${GLOO_LIBRARIES})
6854
ADD_DEPENDENCIES(gloo ${GLOO_PROJECT})

paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc

Lines changed: 0 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -84,19 +84,6 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
8484
VLOG(3) << "do not perform " + type() + "+bias fuse";
8585
return;
8686
}
87-
if (conv->Op()->HasAttr("dilations")) {
88-
auto dilations =
89-
BOOST_GET_CONST(std::vector<int>, conv->Op()->GetAttr("dilations"));
90-
for (const auto& d : dilations) {
91-
if (d != 1) {
92-
LOG(WARNING)
93-
<< "dilation conv not supported in MKLDNN, fuse not apply "
94-
<< "and set conv attribute use_mkldnn = false";
95-
conv->Op()->SetAttr("use_mkldnn", false);
96-
return;
97-
}
98-
}
99-
}
10087

10188
auto* eltwise_bias_tensor =
10289
scope->FindVar(eltwise_bias->Name())->GetMutable<LoDTensor>();

paddle/fluid/framework/tensor_util.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,12 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
8484
}
8585
#endif
8686
#ifdef PADDLE_WITH_CUDA
87+
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
88+
platform::is_cuda_pinned_place(dst_place)) {
89+
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
90+
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
91+
size);
92+
}
8793
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
8894
platform::is_cpu_place(dst_place)) {
8995
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
@@ -285,6 +291,12 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
285291
}
286292
#endif
287293
#ifdef PADDLE_WITH_CUDA
294+
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
295+
platform::is_cuda_pinned_place(dst_place)) {
296+
memory::Copy(BOOST_GET_CONST(platform::CUDAPinnedPlace, dst_place), dst_ptr,
297+
BOOST_GET_CONST(platform::CUDAPinnedPlace, src_place), src_ptr,
298+
size);
299+
}
288300
else if (platform::is_cuda_pinned_place(src_place) && // NOLINT
289301
platform::is_cpu_place(dst_place)) {
290302
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,

paddle/fluid/inference/tests/api/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ function(inference_analysis_api_int8_test_run TARGET_NAME test_binary model_dir
6666
--infer_data=${data_path}
6767
--warmup_batch_size=${WARMUP_BATCH_SIZE}
6868
--batch_size=50
69+
--enable_int8=true
6970
--cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
7071
--iterations=2)
7172
endfunction()
@@ -81,6 +82,7 @@ function(inference_analysis_api_bfloat16_test_run TARGET_NAME test_binary model_
8182
ARGS --infer_model=${model_dir}/model
8283
--infer_data=${data_path}
8384
--batch_size=50
85+
--enable_bf16=true
8486
--paddle_num_threads=${CPU_NUM_THREADS_ON_CI}
8587
--iterations=2)
8688
endfunction()
@@ -92,6 +94,7 @@ function(inference_analysis_api_object_dection_int8_test_run TARGET_NAME test_bi
9294
--infer_data=${data_path}
9395
--warmup_batch_size=10
9496
--batch_size=300
97+
--enable_int8=true
9598
--cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
9699
--iterations=1)
97100
endfunction()
@@ -115,6 +118,7 @@ function(inference_analysis_api_quant_test_run TARGET_NAME test_binary fp32_mode
115118
--int8_model=${int8_model_dir}
116119
--infer_data=${data_path}
117120
--batch_size=50
121+
--enable_int8=true
118122
--cpu_num_threads=${CPU_NUM_THREADS_ON_CI}
119123
--with_accuracy_layer=false
120124
--iterations=2)

paddle/fluid/inference/tests/api/tester_helper.h

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,8 @@ DEFINE_bool(ernie_large, false, "Test ernie large");
5050
DEFINE_bool(with_accuracy_layer, true,
5151
"Calculate the accuracy while label is in the input");
5252
DEFINE_bool(enable_fp32, true, "Enable FP32 type prediction");
53-
DEFINE_bool(enable_bf16, true, "Enable BF16 type prediction");
54-
DEFINE_bool(enable_int8, true, "Enable INT8 type prediction");
53+
DEFINE_bool(enable_bf16, false, "Enable BF16 type prediction");
54+
DEFINE_bool(enable_int8, false, "Enable INT8 type prediction");
5555
DEFINE_int32(warmup_batch_size, 100, "batch size for quantization warmup");
5656
// setting iterations to 0 means processing the whole dataset
5757
DEFINE_int32(iterations, 0, "number of batches to process");
@@ -639,8 +639,9 @@ void TestPrediction(const PaddlePredictor::Config *config,
639639
}
640640
}
641641

642-
void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8,
643-
int compared_idx) {
642+
void SummarizeAccuracy(float avg_acc_ref, float avg_acc, int compared_idx) {
643+
std::string data_type_name = "INT8";
644+
if (FLAGS_enable_bf16) data_type_name = "BF16";
644645
PADDLE_ENFORCE_LE(
645646
compared_idx, 2,
646647
platform::errors::InvalidArgument(
@@ -659,12 +660,12 @@ void SummarizeAccuracy(float avg_acc_fp32, float avg_acc_int8,
659660
LOG(INFO) << "--- Accuracy summary --- ";
660661
LOG(INFO) << "Accepted " << prefix
661662
<< "drop threshold: " << FLAGS_quantized_accuracy
662-
<< ". (condition: (FP32_" << prefix << " - INT8_" << prefix
663-
<< ") <= threshold)";
663+
<< ". (condition: (FP32_" << prefix << " - " << data_type_name
664+
<< "_" << prefix << ") <= threshold)";
664665
LOG(INFO) << "FP32: avg " << prefix << std::fixed << std::setw(6)
665-
<< std::setprecision(4) << avg_acc_fp32;
666-
LOG(INFO) << "INT8: avg " << prefix << std::fixed << std::setw(6)
667-
<< std::setprecision(4) << avg_acc_int8;
666+
<< std::setprecision(4) << avg_acc_ref;
667+
LOG(INFO) << data_type_name << ": avg " << prefix << std::fixed
668+
<< std::setw(6) << std::setprecision(4) << avg_acc;
668669
}
669670

670671
void SummarizePerformance(const char *title, float sample) {
@@ -677,8 +678,9 @@ void SummarizePerformance(const char *title, float sample) {
677678

678679
void SummarizePerformance(const char *title_fp32, float sample_latency_fp32,
679680
const char *title, float sample_latency) {
680-
SummarizePerformance(title_fp32, sample_latency_fp32);
681-
SummarizePerformance(title, sample_latency);
681+
if (FLAGS_enable_fp32) SummarizePerformance(title_fp32, sample_latency_fp32);
682+
if (FLAGS_enable_int8 || FLAGS_enable_bf16)
683+
SummarizePerformance(title, sample_latency);
682684
}
683685

684686
float CompareAccuracyOne(
@@ -733,15 +735,15 @@ void CompareAccuracy(
733735
const std::vector<std::vector<PaddleTensor>> &output_slots_quant,
734736
const std::vector<std::vector<PaddleTensor>> &output_slots_ref,
735737
int compared_idx) {
736-
if ((FLAGS_enable_fp32 && FLAGS_enable_int8) &&
738+
if ((FLAGS_enable_fp32 && (FLAGS_enable_int8 || FLAGS_enable_bf16)) &&
737739
(output_slots_quant.size() == 0 || output_slots_ref.size()) == 0)
738740
throw std::invalid_argument(
739741
"CompareAccuracy: output_slots vector is empty.");
740742

741743
float avg_acc_quant = 0.0;
742744
float avg_acc_ref = 0.0;
743745

744-
if (FLAGS_enable_int8)
746+
if (FLAGS_enable_int8 || FLAGS_enable_bf16)
745747
avg_acc_quant = CompareAccuracyOne(output_slots_quant, compared_idx);
746748

747749
if (FLAGS_enable_fp32)
@@ -751,9 +753,9 @@ void CompareAccuracy(
751753

752754
if (FLAGS_enable_fp32) CHECK_GT(avg_acc_ref, 0.0);
753755

754-
if (FLAGS_enable_int8) CHECK_GT(avg_acc_quant, 0.0);
756+
if (FLAGS_enable_int8 || FLAGS_enable_bf16) CHECK_GT(avg_acc_quant, 0.0);
755757

756-
if (FLAGS_enable_fp32 && FLAGS_enable_int8)
758+
if (FLAGS_enable_fp32 && (FLAGS_enable_int8 || FLAGS_enable_bf16))
757759
CHECK_LE(avg_acc_ref - avg_acc_quant, FLAGS_quantized_accuracy);
758760
}
759761

0 commit comments

Comments
 (0)