diff --git a/CMakeLists.txt b/CMakeLists.txt index 30f9e3a3dcdd2c..f30671bd3a87e8 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License -cmake_minimum_required(VERSION 3.15) +cmake_minimum_required(VERSION 3.10) cmake_policy(VERSION 3.10) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake") set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) @@ -22,9 +22,6 @@ include(system) project(paddle CXX C) -include(init) -include(generic) # simplify cmake module - # enable language CUDA # TODO(Shibo Tao): remove find_package(CUDA) completely. find_package(CUDA QUIET) @@ -34,10 +31,14 @@ option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) option(WITH_ASCEND "Compile PaddlePaddle with ASCEND" OFF) option(WITH_ROCM "Compile PaddlePaddle with ROCM platform" OFF) -# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON +# NOTE(zhiqiu): WITH_ASCEND_CL can be compile on x86_64, so we can set WITH_ASCEND=OFF and WITH_ASCEND_CL=ON # to develop some acl related functionality on x86 option(WITH_ASCEND_CL "Compile PaddlePaddle with ASCEND CL" ${WITH_ASCEND}) option(WITH_ASCEND_CXX11 "Compile PaddlePaddle with ASCEND and CXX11 ABI" OFF) +# Note(zhouwei): It use option above, so put here +include(init) +include(generic) # simplify cmake module + if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() @@ -65,7 +66,7 @@ if(WITH_MUSL) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=deprecated-declarations -Wno-deprecated-declarations -Wno-error=pessimizing-move -Wno-error=deprecated-copy") endif() -if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) +if(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D_GLIBCXX_USE_CXX11_ABI=0") endif() @@ 
-103,9 +104,9 @@ if(WIN32) endif() endforeach(flag_var) endif() - - # NOTE(Avin0323): Less parallel count result in faster compilation. + math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") + # windows build turn off warnings, use parallel compiling. foreach(flag_var CMAKE_CXX_FLAGS CMAKE_CXX_FLAGS_DEBUG CMAKE_CXX_FLAGS_RELEASE @@ -113,7 +114,10 @@ if(WIN32) CMAKE_C_FLAGS CMAKE_C_FLAGS_DEBUG CMAKE_C_FLAGS_RELEASE CMAKE_C_FLAGS_MINSIZEREL CMAKE_C_FLAGS_RELWITHDEBINFO) string(REGEX REPLACE "/W[1-4]" " /W0 " ${flag_var} "${${flag_var}}") - set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling + if(NOT WITH_GPU) + set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") + endif() endforeach(flag_var) foreach(flag_var CMAKE_CXX_FLAGS CMAKE_C_FLAGS) set(${flag_var} "${${flag_var}} /w") @@ -133,6 +137,9 @@ if(WIN32) foreach(flag_var CMAKE_SHARED_LINKER_FLAGS CMAKE_STATIC_LINKER_FLAGS CMAKE_EXE_LINKER_FLAGS CMAKE_LINKER_FLAGS) set(${flag_var} "${${flag_var}} /ignore:4049 /ignore:4217 /ignore:4006 /ignore:4221") + if(MSVC_STATIC_CRT) + set(${flag_var} "${${flag_var}} /NODEFAULTLIB:MSVCRT.LIB") + endif() endforeach(flag_var) if (WITH_WIN_DUMP_DBG) @@ -182,7 +189,6 @@ option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_BOX_PS "Compile with box_ps support" OFF) option(WITH_XBYAK "Compile with xbyak support" ON) option(WITH_CONTRIB "Compile the third-party contributation" OFF) -option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_PSCORE "Compile with parameter server support" ${WITH_DISTRIBUTE}) option(WITH_HETERPS "Compile with heterps" OFF}) option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) @@ -199,6 +205,7 @@ option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MIPS "Compile PaddlePaddle with mips support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) option(WITH_UNITY_BUILD 
"Compile with UnityBuild mode" OFF) +option(WITH_STRIP "Strip so files of Whl packages" OFF) # PY_VERSION if(NOT PY_VERSION) @@ -259,9 +266,6 @@ endif() if(WITH_BRPC_RDMA) message(STATUS "Use brpc with rdma.") - if(WITH_GRPC) - message(FATAL_ERROR "Can't use grpc with brpc rdma.") - endif() if(NOT WITH_DISTRIBUTE) message(FATAL_ERROR "Can't use brpc rdma in no distribute env.") endif() @@ -349,6 +353,11 @@ if (WITH_MIPS) add_definitions(-DPADDLE_WITH_MIPS) endif() +if (WITH_HETERPS) + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -faligned-new") + endif() +endif() set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") @@ -366,6 +375,13 @@ else() message(WARNING "On inference mode, will take place some specific optimization. Turn on the ON_INFER flag when building inference_lib only.") endif() +if(WITH_STRIP) + find_program(STRIP_PATH strip) + if(NOT STRIP_PATH OR NOT LINUX) + set(WITH_STRIP OFF CACHE STRING "Command strip is only used on Linux when it exists." FORCE) + endif() +endif() + add_subdirectory(paddle) if(WITH_PYTHON) add_subdirectory(python) diff --git a/README.md b/README.md index e8a7013d0b4432..8b437e4115abe8 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ - -

+

diff --git a/cmake/configure.cmake b/cmake/configure.cmake index bf1352d4e11479..e7f125269be1f5 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -177,10 +177,6 @@ if(WITH_HETERPS) add_definitions(-DPADDLE_WITH_HETERPS) endif() -if(WITH_GRPC) - add_definitions(-DPADDLE_WITH_GRPC) -endif(WITH_GRPC) - if(WITH_BRPC_RDMA) add_definitions(-DPADDLE_WITH_BRPC_RDMA) endif(WITH_BRPC_RDMA) diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake index bddd2023b437b1..414b2a54be0342 100644 --- a/cmake/external/ascend.cmake +++ b/cmake/external/ascend.cmake @@ -21,7 +21,13 @@ else() set(ASCEND_DIR /usr/local/Ascend) endif() -if(WITH_ASCEND) +if(EXISTS ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include/graph/ascend_string.h) + # It means CANN 20.2 + + add_definitions(-DPADDLE_WITH_ASCEND_STRING) +endif() + + +if(WITH_ASCEND OR WITH_ASCEND_CL) set(ASCEND_DRIVER_DIR ${ASCEND_DIR}/driver/lib64) set(ASCEND_DRIVER_COMMON_DIR ${ASCEND_DIR}/driver/lib64/common) set(ASCEND_DRIVER_SHARE_DIR ${ASCEND_DIR}/driver/lib64/share) @@ -43,9 +49,6 @@ if(WITH_ASCEND) set(atlas_acl_lib ${ATLAS_RUNTIME_DIR}/libascendcl.so) INCLUDE_DIRECTORIES(${ATLAS_RUNTIME_INC_DIR}) - if(EXISTS ${ATLAS_RUNTIME_INC_DIR}/graph/ascend_string.h) - add_definitions(-DPADDLE_WITH_ASCEND_STRING) - endif() ADD_LIBRARY(ascend_ge SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascend_ge PROPERTY IMPORTED_LOCATION ${atlas_ge_runner_lib}) @@ -62,17 +65,23 @@ endif() if(WITH_ASCEND_CL) set(ASCEND_CL_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/lib64) + set(ascend_hccl_lib ${ASCEND_CL_DIR}/libhccl.so) set(ascendcl_lib ${ASCEND_CL_DIR}/libascendcl.so) set(acl_op_compiler_lib ${ASCEND_CL_DIR}/libacl_op_compiler.so) - set(ASCEND_CL_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(FWKACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/fwkacllib/include) + set(ACLLIB_INC_DIR ${ASCEND_DIR}/ascend-toolkit/latest/acllib/include) - message(STATUS "ASCEND_CL_INC_DIR 
${ASCEND_CL_INC_DIR}") + message(STATUS "FWKACLLIB_INC_DIR ${FWKACLLIB_INC_DIR}") message(STATUS "ASCEND_CL_DIR ${ASCEND_CL_DIR}") - INCLUDE_DIRECTORIES(${ASCEND_CL_INC_DIR}) + INCLUDE_DIRECTORIES(${FWKACLLIB_INC_DIR}) + INCLUDE_DIRECTORIES(${ACLLIB_INC_DIR}) ADD_LIBRARY(ascendcl SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ascendcl PROPERTY IMPORTED_LOCATION ${ascendcl_lib}) + ADD_LIBRARY(ascend_hccl SHARED IMPORTED GLOBAL) + SET_PROPERTY(TARGET ascend_hccl PROPERTY IMPORTED_LOCATION ${ascend_hccl_lib}) + ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib}) add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 4619f9f7b7e34c..aa471002eacb6a 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -33,7 +33,9 @@ elseif(LINUX) # which will cause compiler error of using __host__ funciont in __host__ __device__ file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/Meta.h native_src) file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/Eigen/src/Core/util/Meta.h native_dst) - set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst}) + file(TO_NATIVE_PATH ${PADDLE_SOURCE_DIR}/patches/eigen/TensorReductionGpu.h native_src1) + file(TO_NATIVE_PATH ${EIGEN_SOURCE_DIR}/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h native_dst1) + set(EIGEN_PATCH_COMMAND cp ${native_src} ${native_dst} && cp ${native_src1} ${native_dst1}) endif() endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c85654a5674a00..a5c74a46631e9d 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -447,9 +447,20 @@ function(cc_test TARGET_NAME) cc_test_build(${TARGET_NAME} SRCS ${cc_test_SRCS} DEPS ${cc_test_DEPS}) - cc_test_run(${TARGET_NAME} - COMMAND ${TARGET_NAME} - ARGS ${cc_test_ARGS}) + # we dont test hcom op, because it need complex configuration + # with more than one machine + if(NOT 
("${TARGET_NAME}" STREQUAL "c_broadcast_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allreduce_max_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reducescatter_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_allgather_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "send_v2_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "c_reduce_sum_op_npu_test" OR + "${TARGET_NAME}" STREQUAL "recv_v2_op_npu_test")) + cc_test_run(${TARGET_NAME} + COMMAND ${TARGET_NAME} + ARGS ${cc_test_ARGS}) + endif() endif() endfunction(cc_test) @@ -807,7 +818,7 @@ function(py_test TARGET_NAME) ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - + if (WIN32) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) endif() diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 4864e04fa05164..9694a7bc59c12a 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -211,11 +211,11 @@ set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") if(WIN32) set(paddle_inference_c_lib $/paddle_inference_c.*) else(WIN32) - set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi/libpaddle_inference_c.*) + set(paddle_inference_c_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/capi_exp/libpaddle_inference_c.*) endif(WIN32) copy(inference_lib_dist - SRCS ${src_dir}/inference/capi/paddle_c_api.h ${paddle_inference_c_lib} + SRCS ${src_dir}/inference/capi_exp/pd_*.h ${paddle_inference_c_lib} DSTS ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/include ${PADDLE_INFERENCE_C_INSTALL_DIR}/paddle/lib) # fluid library for both train and inference diff --git a/cmake/init.cmake b/cmake/init.cmake index 19fdb6c601a112..b11156d2e9986f 100644 --- a/cmake/init.cmake +++ b/cmake/init.cmake @@ -18,10 +18,10 @@ if(NOT WIN32) set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") else() - # It has not been used now, it can 
specify CUDA compile flag manualy, + # It can specify CUDA compile flag manualy, # its use is to remvoe /Zi to reduce GPU static library size. But it's dangerous # because CUDA will update by nvidia, then error will occur. - # Now, it's used in CUDA:[10.0, 10.2] + # Now, it's only used in VS2015 + CUDA:[10.0, 10.2] set(WIN_PROPS ${CMAKE_SOURCE_DIR}/cmake/paddle_win.props) endif() diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7dac91e531e4cf..75b1100caa915e 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -180,8 +180,8 @@ function(op_library TARGET) list(REMOVE_ITEM miopen_cu_cc_srcs "affine_grid_cudnn_op.cu.cc") list(REMOVE_ITEM miopen_cu_cc_srcs "grid_sampler_cudnn_op.cu.cc") list(REMOVE_ITEM hip_srcs "cholesky_op.cu") - list(REMOVE_ITEM hip_srcs "correlation_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") + list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cc_srcs} ${miopen_cu_cc_srcs} ${miopen_cu_srcs} ${mkldnn_cc_srcs} ${hip_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 296940dc3f50cc..3c069bd2981c43 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -88,4 +88,3 @@ set CUDAFE_FLAGS=--sdk_dir "$(WindowsSdkDir)" - diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 81fa7d0dfa98f0..f90fa3509d63d4 100644 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -29,9 +29,9 @@ set(third_party_deps) # 2. REPOSITORY: specify git REPOSITORY of 3rd party # 3. TAG: specify git tag/branch/commitID of 3rd party # 4. 
DIR: overwrite the original SOURCE_DIR when cache directory -# +# # The function Return 1 PARENT_SCOPE variables: -# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, +# - ${TARGET}_DOWNLOAD_CMD: Simply place "${TARGET}_DOWNLOAD_CMD" in ExternalProject_Add, # and you no longer need to set any donwnload steps in ExternalProject_Add. # For example: # Cache_third_party(${TARGET} @@ -52,7 +52,7 @@ FUNCTION(cache_third_party TARGET) SET(${TARGET_NAME}_DOWNLOAD_CMD GIT_REPOSITORY ${cache_third_party_REPOSITORY}) IF(cache_third_party_TAG) - LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD + LIST(APPEND ${TARGET_NAME}_DOWNLOAD_CMD GIT_TAG ${cache_third_party_TAG}) ENDIF() ELSEIF(cache_third_party_URL) @@ -130,7 +130,7 @@ ENDFUNCTION() # Correction of flags on different Platform(WIN/MAC) and Print Warning Message if (APPLE) if(WITH_MKL) - MESSAGE(WARNING + MESSAGE(WARNING "Mac is not supported with MKL in Paddle yet. Force WITH_MKL=OFF.") set(WITH_MKL OFF CACHE STRING "Disable MKL for building on mac" FORCE) endif() @@ -141,7 +141,7 @@ if(WIN32 OR APPLE) SET(WITH_XBYAK OFF CACHE STRING "Disable XBYAK in Windows and MacOS" FORCE) if(WITH_LIBXSMM) - MESSAGE(WARNING + MESSAGE(WARNING "Windows, Mac are not supported with libxsmm in Paddle yet." 
"Force WITH_LIBXSMM=OFF") SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM in Windows and MacOS" FORCE) @@ -276,7 +276,7 @@ endif(WITH_BOX_PS) if(WITH_ASCEND OR WITH_ASCEND_CL) include(external/ascend) - if(WITH_ASCEND) + if(WITH_ASCEND OR WITH_ASCEND_CL) list(APPEND third_party_deps extern_ascend) endif() if(WITH_ASCEND_CL) @@ -290,7 +290,7 @@ if (WITH_PSCORE) include(external/leveldb) list(APPEND third_party_deps extern_leveldb) - + include(external/brpc) list(APPEND third_party_deps extern_brpc) diff --git a/go/demo/mobilenet_c_exp.cc b/go/demo/mobilenet_c_exp.cc new file mode 100644 index 00000000000000..b4f42dab6790bf --- /dev/null +++ b/go/demo/mobilenet_c_exp.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include + +void ReadData(float* data, int size); + +int main(int argc, char* argv[]) { + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, "data/model/__model__", "data/model/__params__"); + PD_ConfigDisableGlogInfo(config); + + PD_Predictor* predictor = PD_PredictorCreate(config); + // config has destroyed in PD_PredictorCreate + config = NULL; + + int input_num = PD_PredictorGetInputNum(predictor); + printf("Input num: %d\n", input_num); + int output_num = PD_PredictorGetOutputNum(predictor); + printf("Output num: %d\n", output_num); + + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* input_tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + PD_OneDimArrayCstrDestroy(input_names); + input_names = NULL; + + int32_t shape[] = {1, 3, 300, 300}; + float* data = (float*)malloc(sizeof(float) * 1 * 3 * 300 * 300); // NOLINT + ReadData(data, 1 * 3 * 300 * 300); // NOLINT + PD_TensorReshape(input_tensor, 4, shape); + PD_TensorCopyFromCpuFloat(input_tensor, data); + free(data); + data = NULL; + PD_PredictorRun(predictor); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayCstrDestroy(output_names); + output_names = nullptr; + + PD_OneDimArrayInt32* out_shape = PD_TensorGetShape(output_tensor); + int32_t size = 1; + for (size_t index = 0; index < out_shape->size; ++index) { + size = size * out_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(out_shape); + out_shape = NULL; + + data = (float*)malloc(sizeof(float) * size); // NOLINT + PD_TensorCopyToCpuFloat(output_tensor, data); + free(data); + data = NULL; + + PD_TensorDestroy(output_tensor); + output_tensor = NULL; + PD_TensorDestroy(input_tensor); + input_tensor = NULL; + PD_PredictorDestroy(predictor); + predictor = NULL; + + return 0; +} + +void ReadData(float* data, int n) { + FILE* 
fp = fopen("data/data.txt", "r"); + for (int i = 0; i < n; i++) { + fscanf(fp, "%f", &data[i]); + } + fclose(fp); +} diff --git a/paddle/extension.h b/paddle/extension.h index 71469576853a33..98d4bfd0326c5c 100644 --- a/paddle/extension.h +++ b/paddle/extension.h @@ -15,4 +15,4 @@ limitations under the License. */ #pragma once // All paddle apis in C++ frontend -#include "paddle/fluid/extension/include/ext_all.h" +#include "paddle/extension/include/ext_all.h" diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index a2062d82c8130b..905347d031b35b 100644 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -11,8 +11,8 @@ if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) "${DISTRIBUTE_COMPILE_FLAGS} -faligned-new") endif() -add_subdirectory(table) add_subdirectory(service) +add_subdirectory(table) add_subdirectory(test) add_subdirectory(index_dataset) diff --git a/paddle/fluid/distributed/fleet.cc b/paddle/fluid/distributed/fleet.cc index 9aafdd769ed4a0..dfd55f16e1a065 100644 --- a/paddle/fluid/distributed/fleet.cc +++ b/paddle/fluid/distributed/fleet.cc @@ -146,6 +146,44 @@ void FleetWrapper::CreateClient2ClientConnection() { client2client_max_retry_); } +std::future FleetWrapper::PullSparseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, std::vector* fea_keys, + std::vector>* fea_values, int fea_value_dim) { + fea_keys->clear(); + fea_keys->resize(0); + fea_keys->reserve(MAX_FEASIGN_NUM); + for (auto name : var_names) { + Variable* var = scope.FindVar(name); + if (var == nullptr) { + continue; + } + LoDTensor* tensor = var->GetMutable(); + CHECK(tensor != nullptr) << "tensor of var " << name << " is null"; + int64_t* ids = tensor->data(); + size_t len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + if (ids[i] == 0u) { + continue; + } + fea_keys->push_back(static_cast(ids[i])); + } + } + fea_values->resize(fea_keys->size() + 1); + 
for (auto& t : *fea_values) { + t.resize(fea_value_dim); + } + std::vector pull_result_ptr; + for (auto& t : *fea_values) { + pull_result_ptr.push_back(t.data()); + } + + bool training = true; + return pserver_ptr_->_worker_ptr->pull_sparse(pull_result_ptr.data(), + table_id, fea_keys->data(), + fea_keys->size(), training); +} + void FleetWrapper::PullSparseVarsSync( const Scope& scope, const uint64_t table_id, const std::vector& var_names, std::vector* fea_keys, diff --git a/paddle/fluid/distributed/fleet.h b/paddle/fluid/distributed/fleet.h index 863440180a808d..0da5d1e2bf987f 100644 --- a/paddle/fluid/distributed/fleet.h +++ b/paddle/fluid/distributed/fleet.h @@ -84,6 +84,15 @@ class FleetWrapper { int fea_dim, const std::vector& var_emb_names); + // Pull sparse variables from server in async mode + // Param: scope, table_id, var_names, fea_keys, fea_dim + // Param: fea_values std::future + std::future PullSparseVarsAsync( + const Scope& scope, const uint64_t table_id, + const std::vector& var_names, + std::vector* fea_keys, + std::vector>* fea_values, int fea_dim); + // Pull sparse variables from server in sync mode // pull immediately to tensors // is_training is true means training, false means inference, the behavior is diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.cc b/paddle/fluid/distributed/index_dataset/index_sampler.cc index 58f85d98fb09c6..3e573bbdd2de97 100644 --- a/paddle/fluid/distributed/index_dataset/index_sampler.cc +++ b/paddle/fluid/distributed/index_dataset/index_sampler.cc @@ -13,13 +13,10 @@ // limitations under the License. 
#include "paddle/fluid/distributed/index_dataset/index_sampler.h" -#include "paddle/fluid/operators/math/sampler.h" namespace paddle { namespace distributed { -using Sampler = paddle::operators::math::Sampler; - std::vector> LayerWiseSampler::sample( const std::vector>& user_inputs, const std::vector& target_ids, bool with_hierarchy) { @@ -30,22 +27,7 @@ std::vector> LayerWiseSampler::sample( std::vector(user_feature_num + 2)); auto max_layer = tree_->Height(); - std::vector sampler_vec(max_layer - start_sample_layer_); - std::vector> layer_ids(max_layer - - start_sample_layer_); - - auto layer_index = max_layer - 1; size_t idx = 0; - while (layer_index >= start_sample_layer_) { - auto layer_codes = tree_->GetLayerCodes(layer_index); - layer_ids[idx] = tree_->GetNodes(layer_codes); - sampler_vec[idx] = new paddle::operators::math::UniformSampler( - layer_ids[idx].size() - 1, seed_); - layer_index--; - idx++; - } - - idx = 0; for (size_t i = 0; i < input_num; i++) { auto travel_codes = tree_->GetTravelCodes(target_ids[i], start_sample_layer_); @@ -76,18 +58,15 @@ std::vector> LayerWiseSampler::sample( for (int idx_offset = 0; idx_offset < layer_counts_[j]; idx_offset++) { int sample_res = 0; do { - sample_res = sampler_vec[j]->Sample(); - } while (layer_ids[j][sample_res].id() == travel_path[j].id()); + sample_res = sampler_vec_[j]->Sample(); + } while (layer_ids_[j][sample_res].id() == travel_path[j].id()); outputs[idx + idx_offset][user_feature_num] = - layer_ids[j][sample_res].id(); + layer_ids_[j][sample_res].id(); outputs[idx + idx_offset][user_feature_num + 1] = 0; } idx += layer_counts_[j]; } } - for (size_t i = 0; i < sampler_vec.size(); i++) { - delete sampler_vec[i]; - } return outputs; } diff --git a/paddle/fluid/distributed/index_dataset/index_sampler.h b/paddle/fluid/distributed/index_dataset/index_sampler.h index 66882bedc9b765..8813421446a21c 100644 --- a/paddle/fluid/distributed/index_dataset/index_sampler.h +++ 
b/paddle/fluid/distributed/index_dataset/index_sampler.h @@ -16,6 +16,7 @@ #include #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/sampler.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -83,6 +84,23 @@ class LayerWiseSampler : public IndexSampler { } reverse(layer_counts_.begin(), layer_counts_.end()); VLOG(3) << "sample counts sum: " << layer_counts_sum_; + + auto max_layer = tree_->Height(); + sampler_vec_.clear(); + layer_ids_.clear(); + + auto layer_index = max_layer - 1; + size_t idx = 0; + while (layer_index >= start_sample_layer_) { + auto layer_codes = tree_->GetLayerCodes(layer_index); + layer_ids_.push_back(tree_->GetNodes(layer_codes)); + auto sampler_temp = + std::make_shared( + layer_ids_[idx].size() - 1, seed_); + sampler_vec_.push_back(sampler_temp); + layer_index--; + idx++; + } } std::vector> sample( const std::vector>& user_inputs, @@ -94,6 +112,8 @@ class LayerWiseSampler : public IndexSampler { std::shared_ptr tree_{nullptr}; int seed_{0}; int start_sample_layer_{1}; + std::vector> sampler_vec_; + std::vector> layer_ids_; }; } // end namespace distributed diff --git a/paddle/fluid/distributed/service/brpc_ps_server.cc b/paddle/fluid/distributed/service/brpc_ps_server.cc index a9370561a540be..a1440260bf2e77 100644 --- a/paddle/fluid/distributed/service/brpc_ps_server.cc +++ b/paddle/fluid/distributed/service/brpc_ps_server.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/distributed/service/brpc_ps_server.h" #include // NOLINT +#include "butil/object_pool.h" #include "paddle/fluid/distributed/table/depends/sparse_utils.h" #include "paddle/fluid/distributed/table/table.h" #include "paddle/fluid/framework/archive.h" @@ -196,12 +197,13 @@ int32_t BrpcPsService::pull_dense(Table *table, const PsRequestMessage &request, return 0; } - std::vector res_data; - res_data.resize(num * table->value_accesor()->select_size() / 
sizeof(float)); - table->pull_dense(res_data.data(), num); + auto res_data = butil::get_object>(); + res_data->resize(num * table->value_accesor()->select_size() / sizeof(float)); + table->pull_dense(res_data->data(), num); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } @@ -367,12 +369,13 @@ int32_t BrpcPsService::pull_sparse(Table *table, value.DeserializeFromBytes(const_cast(data)); - std::vector res_data; - res_data.resize(num * dim); - table->pull_sparse(res_data.data(), value); + auto res_data = butil::get_object>(); + res_data->resize(num * dim); + table->pull_sparse(res_data->data(), value); - cntl->response_attachment().append((char *)res_data.data(), - res_data.size() * sizeof(float)); + cntl->response_attachment().append((char *)(res_data->data()), + res_data->size() * sizeof(float)); + butil::return_object(res_data); return 0; } diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index a6271cac83c9a9..eafb4d596cc167 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -135,7 +135,8 @@ std::future GraphBrpcClient::get_node_feat( closure->request(request_idx) ->add_params(joint_feature_name.c_str(), joint_feature_name.size()); - PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), closure->response(request_idx), closure); diff --git a/paddle/fluid/distributed/service/graph_py_service.h b/paddle/fluid/distributed/service/graph_py_service.h index e185f23e3d240f..c6657be96ba446 100644 --- 
a/paddle/fluid/distributed/service/graph_py_service.h +++ b/paddle/fluid/distributed/service/graph_py_service.h @@ -54,19 +54,7 @@ class GraphPyService { std::vector table_feat_conf_feat_dtype; std::vector table_feat_conf_feat_shape; - // std::thread *server_thread, *client_thread; - - // std::shared_ptr pserver_ptr; - - // std::shared_ptr worker_ptr; - public: - // std::shared_ptr get_ps_server() { - // return pserver_ptr; - // } - // std::shared_ptr get_ps_client() { - // return worker_ptr; - // } int get_shard_num() { return shard_num; } void set_shard_num(int shard_num) { this->shard_num = shard_num; } void GetDownpourSparseTableProto( diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index dde1f5ae8ee3a1..dab390958034af 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -13,7 +13,11 @@ set_source_files_properties(sparse_geo_table.cc PROPERTIES COMPILE_FLAGS ${DISTR set_source_files_properties(barrier_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(common_graph_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) +get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + +cc_library(common_table SRCS common_sparse_table.cc common_dense_table.cc +sparse_geo_table.cc barrier_table.cc common_graph_table.cc DEPS ${TABLE_DEPS} +${RPC_DEPS} graph_edge graph_node device_context string_helper simple_threadpool xxhash generator) set_source_files_properties(tensor_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(tensor_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git 
a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 020bcdcc52ef4b..0dc99de1bfe82a 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -171,7 +171,7 @@ int32_t GraphTable::load_nodes(const std::string &path, std::string node_type) { int32_t GraphTable::load_edges(const std::string &path, bool reverse_edge) { auto paths = paddle::string::split_string(path, ";"); - int count = 0; + int64_t count = 0; std::string sample_type = "random"; bool is_weighted = false; int valid_count = 0; diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index 8ddf3c8f904a6c..b18da82abe61c9 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -33,26 +33,11 @@ namespace paddle { namespace distributed { class GraphShard { public: - // static int bucket_low_bound; - // static int gcd(int s, int t) { - // if (s % t == 0) return t; - // return gcd(t, s % t); - // } size_t get_size(); GraphShard() {} - GraphShard(int shard_num) { - this->shard_num = shard_num; - // bucket_size = init_bucket_size(shard_num); - // bucket.resize(bucket_size); - } + GraphShard(int shard_num) { this->shard_num = shard_num; } std::vector &get_bucket() { return bucket; } std::vector get_batch(int start, int end, int step); - // int init_bucket_size(int shard_num) { - // for (int i = bucket_low_bound;; i++) { - // if (gcd(i, shard_num) == 1) return i; - // } - // return -1; - // } std::vector get_ids_by_range(int start, int end) { std::vector res; for (int i = start; i < end && i < bucket.size(); i++) { @@ -64,7 +49,6 @@ class GraphShard { FeatureNode *add_feature_node(uint64_t id); Node *find_node(uint64_t id); void add_neighboor(uint64_t id, uint64_t dst_id, float weight); - // std::unordered_map::iterator> std::unordered_map 
get_node_location() { return node_location; } @@ -131,7 +115,7 @@ class GraphTable : public SparseTable { protected: std::vector shards; size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; - const int task_pool_size_ = 11; + const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; std::vector feat_name; diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index 2e8c257b6aad47..718fce9950719f 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -125,34 +125,37 @@ void ProcessALine(const std::vector& columns, const Meta& meta, int64_t SaveToText(std::ostream* os, std::shared_ptr block, const int mode) { - int64_t not_save_num = 0; - for (auto value : block->values_) { - if (mode == SaveMode::delta && !value.second->need_save_) { - not_save_num++; - continue; - } - - auto* vs = value.second->data_.data(); - std::stringstream ss; - auto id = value.first; - ss << id << "\t" << value.second->count_ << "\t" - << value.second->unseen_days_ << "\t" << value.second->is_entry_ << "\t"; - - for (int i = 0; i < block->value_length_; i++) { - ss << vs[i]; - ss << ","; - } + int64_t save_num = 0; + for (auto& table : block->values_) { + for (auto& value : table) { + if (mode == SaveMode::delta && !value.second->need_save_) { + continue; + } + save_num += 1; + + auto* vs = value.second->data_.data(); + std::stringstream ss; + auto id = value.first; + ss << id << "\t" << value.second->count_ << "\t" + << value.second->unseen_days_ << "\t" << value.second->is_entry_ + << "\t"; + + for (int i = 0; i < block->value_length_; i++) { + ss << vs[i]; + ss << ","; + } - ss << "\n"; + ss << "\n"; - os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); + os->write(ss.str().c_str(), sizeof(char) * ss.str().size()); - if (mode == SaveMode::base || mode == SaveMode::delta) { - value.second->need_save_ = 
false; + if (mode == SaveMode::base || mode == SaveMode::delta) { + value.second->need_save_ = false; + } } } - return block->values_.size() - not_save_num; + return save_num; } int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, @@ -183,7 +186,7 @@ int64_t LoadFromText(const std::string& valuepath, const std::string& metapath, block->Init(id, false); - auto value_instant = block->GetValue(id); + VALUE* value_instant = block->GetValue(id); if (values.size() == 5) { value_instant->count_ = std::stoi(values[1]); value_instant->unseen_days_ = std::stoi(values[2]); @@ -373,8 +376,10 @@ std::pair CommonSparseTable::print_table_stat() { int64_t feasign_size = 0; int64_t mf_size = 0; - for (auto& value : shard_values_) { - feasign_size += value->values_.size(); + for (auto& shard : shard_values_) { + for (auto& table : shard->values_) { + feasign_size += table.size(); + } } return {feasign_size, mf_size}; diff --git a/paddle/fluid/distributed/table/depends/large_scale_kv.h b/paddle/fluid/distributed/table/depends/large_scale_kv.h index 68d252661edd53..5c10fca98cda4d 100644 --- a/paddle/fluid/distributed/table/depends/large_scale_kv.h +++ b/paddle/fluid/distributed/table/depends/large_scale_kv.h @@ -26,8 +26,10 @@ #include #include "gflags/gflags.h" +#include "butil/object_pool.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/rw_lock.h" @@ -47,6 +49,10 @@ namespace distributed { enum Mode { training, infer }; +static const int SPARSE_SHARD_BUCKET_NUM_BITS = 6; +static const size_t SPARSE_SHARD_BUCKET_NUM = (size_t)1 + << SPARSE_SHARD_BUCKET_NUM_BITS; + struct VALUE { explicit VALUE(size_t length) : length_(length), @@ -66,11 +72,11 @@ struct VALUE { bool is_entry_; // whether knock-in }; 
-inline bool count_entry(std::shared_ptr value, int threshold) { +inline bool count_entry(VALUE *value, int threshold) { return value->count_ >= threshold; } -inline bool probility_entry(std::shared_ptr value, float threshold) { +inline bool probility_entry(VALUE *value, float threshold) { UniformInitializer uniform = UniformInitializer({"uniform", "0", "0", "1"}); return uniform.GetValue() >= threshold; } @@ -145,7 +151,7 @@ class ValueBlock { const std::vector &value_dims) { auto pts = std::vector(); pts.reserve(value_names.size()); - auto &values = values_.at(id); + auto values = GetValue(id); for (int i = 0; i < static_cast(value_names.size()); i++) { PADDLE_ENFORCE_EQ( value_dims[i], value_dims_[i], @@ -159,35 +165,48 @@ class ValueBlock { // pull float *Init(const uint64_t &id, const bool with_update = true, const int counter = 1) { - if (!Has(id)) { - values_[id] = std::make_shared(value_length_); - } + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); - auto &value = values_.at(id); + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + + table[id] = value; + + } else { + value = res->second; + } if (with_update) { AttrUpdate(value, counter); } - return value->data_.data(); } VALUE *InitGet(const uint64_t &id, const bool with_update = true, const int counter = 1) { - if (!Has(id)) { - values_[id] = std::make_shared(value_length_); - } + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); - auto &value = values_.at(id); + auto &table = values_[bucket]; + auto res = table.find(id); - if (with_update) { - AttrUpdate(value, counter); + VALUE *value = nullptr; + if (res == table.end()) { + value = butil::get_object(value_length_); + // value = _alloc.acquire(value_length_); + table[id] = value; + } else { + value = (VALUE *)(void *)(res->second); } - - return value.get(); + return value; } - void AttrUpdate(std::shared_ptr 
value, const int counter) { + void AttrUpdate(VALUE *value, const int counter) { // update state value->unseen_days_ = 0; value->count_ += counter; @@ -211,42 +230,73 @@ class ValueBlock { // dont jude if (has(id)) float *Get(const uint64_t &id) { - auto &value = values_.at(id); + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + // auto &value = table.at(id); + // return value->data_.data(); + auto res = table.find(id); + VALUE *value = res->second; return value->data_.data(); } // for load, to reset count, unseen_days - std::shared_ptr GetValue(const uint64_t &id) { return values_.at(id); } + VALUE *GetValue(const uint64_t &id) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + return res->second; + } bool GetEntry(const uint64_t &id) { - auto &value = values_.at(id); + auto value = GetValue(id); return value->is_entry_; } void SetEntry(const uint64_t &id, const bool state) { - auto &value = values_.at(id); + auto value = GetValue(id); value->is_entry_ = state; } void Shrink(const int threshold) { - for (auto iter = values_.begin(); iter != values_.end();) { - auto &value = iter->second; - value->unseen_days_++; - if (value->unseen_days_ >= threshold) { - iter = values_.erase(iter); - } else { - ++iter; + for (auto &table : values_) { + for (auto iter = table.begin(); iter != table.end();) { + // VALUE* value = (VALUE*)(void*)(iter->second); + VALUE *value = iter->second; + value->unseen_days_++; + if (value->unseen_days_ >= threshold) { + butil::return_object(iter->second); + //_alloc.release(iter->second); + //_alloc.release(value); + iter = table.erase(iter); + } else { + ++iter; + } } } return; } float GetThreshold() { return threshold_; } + size_t compute_bucket(size_t hash) { + if (SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; + } else { + return hash >> (sizeof(size_t) * 8 - SPARSE_SHARD_BUCKET_NUM_BITS); + } + } 
private: bool Has(const uint64_t id) { - auto got = values_.find(id); - if (got == values_.end()) { + size_t hash = _hasher(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { return false; } else { return true; @@ -254,8 +304,9 @@ class ValueBlock { } public: - std::unordered_map> values_; + robin_hood::unordered_map values_[SPARSE_SHARD_BUCKET_NUM]; size_t value_length_ = 0; + std::hash _hasher; private: const std::vector &value_names_; @@ -263,7 +314,7 @@ class ValueBlock { const std::vector &value_offsets_; const std::unordered_map &value_idx_; - std::function)> entry_func_; + std::function entry_func_; std::vector> initializers_; float threshold_; }; diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index b756c740ac764c..af87e1b6cc61d1 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -1,8 +1,10 @@ set_source_files_properties(table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(table_test SRCS table_test.cc DEPS common_table table tensor_accessor +ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(dense_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) +cc_test(dense_table_test SRCS dense_table_test.cc DEPS common_table table +tensor_accessor ps_framework_proto ${COMMON_DEPS} ${RPC_DEPS}) set_source_files_properties(barrier_table_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(barrier_table_test SRCS barrier_table_test.cc DEPS common_table table tensor_accessor ps_framework_proto ${COMMON_DEPS}) diff --git 
a/paddle/fluid/distributed/thirdparty/round_robin.h b/paddle/fluid/distributed/thirdparty/round_robin.h new file mode 100644 index 00000000000000..f5075b4545af04 --- /dev/null +++ b/paddle/fluid/distributed/thirdparty/round_robin.h @@ -0,0 +1,2685 @@ +// ______ _____ ______ _________ +// ______________ ___ /_ ___(_)_______ ___ /_ ______ ______ ______ / +// __ ___/_ __ \__ __ \__ / __ __ \ __ __ \_ __ \_ __ \_ __ / +// _ / / /_/ /_ /_/ /_ / _ / / / _ / / // /_/ // /_/ // /_/ / +// /_/ \____/ /_.___/ /_/ /_/ /_/ ________/_/ /_/ \____/ \____/ \__,_/ +// _/_____/ +// +// Fast & memory efficient hashtable based on robin hood hashing for +// C++11/14/17/20 +// https://github.com/martinus/robin-hood-hashing +// +// Licensed under the MIT License . +// SPDX-License-Identifier: MIT +// Copyright (c) 2018-2021 Martin Ankerl +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all +// copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. 
+ +#ifndef ROBIN_HOOD_H_INCLUDED +#define ROBIN_HOOD_H_INCLUDED + +// see https://semver.org/ +#define ROBIN_HOOD_VERSION_MAJOR 3 // for incompatible API changes +#define ROBIN_HOOD_VERSION_MINOR \ + 11 // for adding functionality in a backwards-compatible manner +#define ROBIN_HOOD_VERSION_PATCH 1 // for backwards-compatible bug fixes + +#include +#include +#include +#include +#include // only to support hash of smart pointers +#include +#include +#include +#include +#if __cplusplus >= 201703L +#include +#endif + +// #define ROBIN_HOOD_LOG_ENABLED +#ifdef ROBIN_HOOD_LOG_ENABLED +#include +#define ROBIN_HOOD_LOG(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_LOG(x) +#endif + +// #define ROBIN_HOOD_TRACE_ENABLED +#ifdef ROBIN_HOOD_TRACE_ENABLED +#include +#define ROBIN_HOOD_TRACE(...) \ + std::cout << __FUNCTION__ << "@" << __LINE__ << ": " << __VA_ARGS__ \ + << std::endl; +#else +#define ROBIN_HOOD_TRACE(x) +#endif + +// #define ROBIN_HOOD_COUNT_ENABLED +#ifdef ROBIN_HOOD_COUNT_ENABLED +#include +#define ROBIN_HOOD_COUNT(x) ++counts().x; +namespace robin_hood { +struct Counts { + uint64_t shiftUp{}; + uint64_t shiftDown{}; +}; +inline std::ostream &operator<<(std::ostream &os, Counts const &c) { + return os << c.shiftUp << " shiftUp" << std::endl + << c.shiftDown << " shiftDown" << std::endl; +} + +static Counts &counts() { + static Counts counts{}; + return counts; +} +} // namespace robin_hood +#else +#define ROBIN_HOOD_COUNT(x) +#endif + +// all non-argument macros should use this facility. 
See +// https://www.fluentcpp.com/2019/05/28/better-macros-better-flags/ +#define ROBIN_HOOD(x) ROBIN_HOOD_PRIVATE_DEFINITION_##x() + +// mark unused members with this macro +#define ROBIN_HOOD_UNUSED(identifier) + +// bitness +#if SIZE_MAX == UINT32_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 32 +#elif SIZE_MAX == UINT64_MAX +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITNESS() 64 +#else +#error Unsupported bitness +#endif + +// endianess +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() 1 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_LITTLE_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) +#define ROBIN_HOOD_PRIVATE_DEFINITION_BIG_ENDIAN() \ + (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#endif + +// inline +#ifdef _MSC_VER +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __declspec(noinline) +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NOINLINE() __attribute__((noinline)) +#endif + +// exceptions +#if !defined(__cpp_exceptions) && !defined(__EXCEPTIONS) && !defined(_CPPUNWIND) +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 0 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_EXCEPTIONS() 1 +#endif + +// count leading/trailing bits +#if !defined(ROBIN_HOOD_DISABLE_INTRINSICS) +#ifdef _MSC_VER +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BITSCANFORWARD() _BitScanForward64 +#endif +#include +#pragma intrinsic(ROBIN_HOOD(BITSCANFORWARD)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + [](size_t mask) noexcept->int { \ + unsigned long index; \ + return ROBIN_HOOD(BITSCANFORWARD)(&index, mask) ? 
static_cast(index) \ + : ROBIN_HOOD(BITNESS); \ + } \ + (x) +#else +#if ROBIN_HOOD(BITNESS) == 32 +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzl +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzl +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_CTZ() __builtin_ctzll +#define ROBIN_HOOD_PRIVATE_DEFINITION_CLZ() __builtin_clzll +#endif +#define ROBIN_HOOD_COUNT_LEADING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CLZ)(x) : ROBIN_HOOD(BITNESS)) +#define ROBIN_HOOD_COUNT_TRAILING_ZEROES(x) \ + ((x) ? ROBIN_HOOD(CTZ)(x) : ROBIN_HOOD(BITNESS)) +#endif +#endif + +// fallthrough +#ifndef __has_cpp_attribute // For backwards compatibility +#define __has_cpp_attribute(x) 0 +#endif +#if __has_cpp_attribute(clang::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[clang::fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() [[gnu::fallthrough]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_FALLTHROUGH() +#endif + +// likely/unlikely +#ifdef _MSC_VER +#define ROBIN_HOOD_LIKELY(condition) condition +#define ROBIN_HOOD_UNLIKELY(condition) condition +#else +#define ROBIN_HOOD_LIKELY(condition) __builtin_expect(condition, 1) +#define ROBIN_HOOD_UNLIKELY(condition) __builtin_expect(condition, 0) +#endif + +// detect if native wchar_t type is availiable in MSVC +#ifdef _MSC_VER +#ifdef _NATIVE_WCHAR_T_DEFINED +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_HAS_NATIVE_WCHART() 1 +#endif + +// detect if MSVC supports the pair(std::piecewise_construct_t,...) 
consructor +// being constexpr +#ifdef _MSC_VER +#if _MSC_VER <= 1900 +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 1 +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_BROKEN_CONSTEXPR() 0 +#endif + +// workaround missing "is_trivially_copyable" in g++ < 5.0 +// See https://stackoverflow.com/a/31798726/48181 +#if defined(__GNUC__) && __GNUC__ < 5 +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) +#else +#define ROBIN_HOOD_IS_TRIVIALLY_COPYABLE(...) \ + std::is_trivially_copyable<__VA_ARGS__>::value +#endif + +// helpers for C++ versions, see +// https://gcc.gnu.org/onlinedocs/cpp/Standard-Predefined-Macros.html +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX() __cplusplus +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX98() 199711L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX11() 201103L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX14() 201402L +#define ROBIN_HOOD_PRIVATE_DEFINITION_CXX17() 201703L + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() [[nodiscard]] +#else +#define ROBIN_HOOD_PRIVATE_DEFINITION_NODISCARD() +#endif + +namespace robin_hood { + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) +#define ROBIN_HOOD_STD std +#else + +// c++11 compatibility layer +namespace ROBIN_HOOD_STD { +template +struct alignment_of + : std::integral_constant< + std::size_t, alignof(typename std::remove_all_extents::type)> {}; + +template +class integer_sequence { + public: + using value_type = T; + static_assert(std::is_integral::value, "not integral type"); + static constexpr std::size_t size() noexcept { return sizeof...(Ints); } +}; +template +using index_sequence = integer_sequence; + +namespace detail_ { +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0 && Begin < End, + "unexpected argument (Begin<0 || Begin<=End)"); + + template + struct 
IntSeqCombiner; + + template + struct IntSeqCombiner, + integer_sequence> { + using TResult = integer_sequence; + }; + + using TResult = typename IntSeqCombiner< + typename IntSeqImpl::TResult, + typename IntSeqImpl::TResult>::TResult; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; + +template +struct IntSeqImpl { + using TValue = T; + static_assert(std::is_integral::value, "not integral type"); + static_assert(Begin >= 0, "unexpected argument (Begin<0)"); + using TResult = integer_sequence; +}; +} // namespace detail_ + +template +using make_integer_sequence = + typename detail_::IntSeqImpl::TResult; + +template +using make_index_sequence = make_integer_sequence; + +template +using index_sequence_for = make_index_sequence; + +} // namespace ROBIN_HOOD_STD + +#endif + +namespace detail { + +// make sure we static_cast to the correct type for hash_int +#if ROBIN_HOOD(BITNESS) == 64 +using SizeT = uint64_t; +#else +using SizeT = uint32_t; +#endif + +template +T rotr(T x, unsigned k) { + return (x >> k) | (x << (8U * sizeof(T) - k)); +} + +// This cast gets rid of warnings like "cast from 'uint8_t*' {aka 'unsigned +// char*'} to +// 'uint64_t*' {aka 'long unsigned int*'} increases required alignment of target +// type". Use with +// care! +template +inline T reinterpret_cast_no_cast_align_warning(void *ptr) noexcept { + return reinterpret_cast(ptr); +} + +template +inline T reinterpret_cast_no_cast_align_warning(void const *ptr) noexcept { + return reinterpret_cast(ptr); +} + +// make sure this is not inlined as it is slow and dramatically enlarges code, +// thus making other +// inlinings more difficult. Throws are also generally the slow path. +template +[[noreturn]] ROBIN_HOOD(NOINLINE) +#if ROBIN_HOOD(HAS_EXCEPTIONS) + void doThrow(Args &&... 
args) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-array-to-pointer-decay) + throw E(std::forward(args)...); +} +#else + void doThrow(Args &&... ROBIN_HOOD_UNUSED(args) /*unused*/) { + abort(); +} +#endif + +template +T *assertNotNull(T *t, Args &&... args) { + if (ROBIN_HOOD_UNLIKELY(nullptr == t)) { + doThrow(std::forward(args)...); + } + return t; +} + +template +inline T unaligned_load(void const *ptr) noexcept { + // using memcpy so we don't get into unaligned load problems. + // compiler should optimize this very well anyways. + T t; + std::memcpy(&t, ptr, sizeof(T)); + return t; +} + +// Allocates bulks of memory for objects of type T. This deallocates the memory +// in the destructor, +// and keeps a linked list of the allocated memory around. Overhead per +// allocation is the size of a +// pointer. +template +class BulkPoolAllocator { + public: + BulkPoolAllocator() noexcept = default; + + // does not copy anything, just creates a new allocator. + BulkPoolAllocator(const BulkPoolAllocator &ROBIN_HOOD_UNUSED( + o) /*unused*/) noexcept : mHead(nullptr), + mListForFree(nullptr) {} + + BulkPoolAllocator(BulkPoolAllocator &&o) noexcept + : mHead(o.mHead), + mListForFree(o.mListForFree) { + o.mListForFree = nullptr; + o.mHead = nullptr; + } + + BulkPoolAllocator &operator=(BulkPoolAllocator &&o) noexcept { + reset(); + mHead = o.mHead; + mListForFree = o.mListForFree; + o.mListForFree = nullptr; + o.mHead = nullptr; + return *this; + } + + BulkPoolAllocator & + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + operator=(const BulkPoolAllocator &ROBIN_HOOD_UNUSED(o) /*unused*/) noexcept { + // does not do anything + return *this; + } + + ~BulkPoolAllocator() noexcept { reset(); } + + // Deallocates all allocated memory. 
+ void reset() noexcept { + while (mListForFree) { + T *tmp = *mListForFree; + ROBIN_HOOD_LOG("std::free") + std::free(mListForFree); + mListForFree = reinterpret_cast_no_cast_align_warning(tmp); + } + mHead = nullptr; + } + + // allocates, but does NOT initialize. Use in-place new constructor, e.g. + // T* obj = pool.allocate(); + // ::new (static_cast(obj)) T(); + T *allocate() { + T *tmp = mHead; + if (!tmp) { + tmp = performAllocation(); + } + + mHead = *reinterpret_cast_no_cast_align_warning(tmp); + return tmp; + } + + // does not actually deallocate but puts it in store. + // make sure you have already called the destructor! e.g. with + // obj->~T(); + // pool.deallocate(obj); + void deallocate(T *obj) noexcept { + *reinterpret_cast_no_cast_align_warning(obj) = mHead; + mHead = obj; + } + + // Adds an already allocated block of memory to the allocator. This allocator + // is from now on + // responsible for freeing the data (with free()). If the provided data is not + // large enough to + // make use of, it is immediately freed. Otherwise it is reused and freed in + // the destructor. + void addOrFree(void *ptr, const size_t numBytes) noexcept { + // calculate number of available elements in ptr + if (numBytes < ALIGNMENT + ALIGNED_SIZE) { + // not enough data for at least one element. Free and return. + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } else { + ROBIN_HOOD_LOG("add to buffer") + add(ptr, numBytes); + } + } + + void swap(BulkPoolAllocator &other) noexcept { + using std::swap; + swap(mHead, other.mHead); + swap(mListForFree, other.mListForFree); + } + + private: + // iterates the list of allocated memory to calculate how many to alloc next. + // Recalculating this each time saves us a size_t member. + // This ignores the fact that memory blocks might have been added manually + // with addOrFree. In + // practice, this should not matter much. 
+ ROBIN_HOOD(NODISCARD) size_t calcNumElementsToAlloc() const noexcept { + auto tmp = mListForFree; + size_t numAllocs = MinNumAllocs; + + while (numAllocs * 2 <= MaxNumAllocs && tmp) { + auto x = reinterpret_cast(tmp); + tmp = *x; + numAllocs *= 2; + } + + return numAllocs; + } + + // WARNING: Underflow if numBytes < ALIGNMENT! This is guarded in addOrFree(). + void add(void *ptr, const size_t numBytes) noexcept { + const size_t numElements = (numBytes - ALIGNMENT) / ALIGNED_SIZE; + + auto data = reinterpret_cast(ptr); + + // link free list + auto x = reinterpret_cast(data); + *x = mListForFree; + mListForFree = data; + + // create linked list for newly allocated data + auto *const headT = reinterpret_cast_no_cast_align_warning( + reinterpret_cast(ptr) + ALIGNMENT); + + auto *const head = reinterpret_cast(headT); + + // Visual Studio compiler automatically unrolls this loop, which is pretty + // cool + for (size_t i = 0; i < numElements; ++i) { + *reinterpret_cast_no_cast_align_warning( + head + i * ALIGNED_SIZE) = head + (i + 1) * ALIGNED_SIZE; + } + + // last one points to 0 + *reinterpret_cast_no_cast_align_warning( + head + (numElements - 1) * ALIGNED_SIZE) = mHead; + mHead = headT; + } + + // Called when no memory is available (mHead == 0). + // Don't inline this slow path. + ROBIN_HOOD(NOINLINE) T *performAllocation() { + size_t const numElementsToAlloc = calcNumElementsToAlloc(); + + // alloc new memory: [prev |T, T, ... 
T] + size_t const bytes = ALIGNMENT + ALIGNED_SIZE * numElementsToAlloc; + ROBIN_HOOD_LOG("std::malloc " << bytes << " = " << ALIGNMENT << " + " + << ALIGNED_SIZE << " * " + << numElementsToAlloc) + add(assertNotNull(std::malloc(bytes)), bytes); + return mHead; + } + +// enforce byte alignment of the T's +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX14) + static constexpr size_t ALIGNMENT = + (std::max)(std::alignment_of::value, std::alignment_of::value); +#else + static const size_t ALIGNMENT = + (ROBIN_HOOD_STD::alignment_of::value > + ROBIN_HOOD_STD::alignment_of::value) + ? ROBIN_HOOD_STD::alignment_of::value + : +ROBIN_HOOD_STD::alignment_of::value; // the + is for + // walkarround +#endif + + static constexpr size_t ALIGNED_SIZE = + ((sizeof(T) - 1) / ALIGNMENT + 1) * ALIGNMENT; + + static_assert(MinNumAllocs >= 1, "MinNumAllocs"); + static_assert(MaxNumAllocs >= MinNumAllocs, "MaxNumAllocs"); + static_assert(ALIGNED_SIZE >= sizeof(T *), "ALIGNED_SIZE"); + static_assert(0 == (ALIGNED_SIZE % sizeof(T *)), "ALIGNED_SIZE mod"); + static_assert(ALIGNMENT >= sizeof(T *), "ALIGNMENT"); + + T *mHead{nullptr}; + T **mListForFree{nullptr}; +}; + +template +struct NodeAllocator; + +// dummy allocator that does nothing +template +struct NodeAllocator { + // we are not using the data, so just free it. + void addOrFree(void *ptr, + size_t ROBIN_HOOD_UNUSED(numBytes) /*unused*/) noexcept { + ROBIN_HOOD_LOG("std::free") + std::free(ptr); + } +}; + +template +struct NodeAllocator + : public BulkPoolAllocator {}; + +// c++14 doesn't have is_nothrow_swappable, and clang++ 6.0.1 doesn't like it +// either, so I'm making +// my own here. 
+namespace swappable { +#if ROBIN_HOOD(CXX) < ROBIN_HOOD(CXX17) +using std::swap; +template +struct nothrow { + static const bool value = + noexcept(swap(std::declval(), std::declval())); +}; +#else +template +struct nothrow { + static const bool value = std::is_nothrow_swappable::value; +}; +#endif +} // namespace swappable + +} // namespace detail + +struct is_transparent_tag {}; + +// A custom pair implementation is used in the map because std::pair is not +// is_trivially_copyable, +// which means it would not be allowed to be used in std::memcpy. This struct +// is copyable, which is +// also tested. +template +struct pair { + using first_type = T1; + using second_type = T2; + + template ::value && + std::is_default_constructible::value>::type> + constexpr pair() noexcept(noexcept(U1()) && noexcept(U2())) + : first(), second() {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. + explicit constexpr pair(std::pair const &o) noexcept( + noexcept(T1(std::declval())) && + noexcept(T2(std::declval()))) + : first(o.first), second(o.second) {} + + // pair constructors are explicit so we don't accidentally call this ctor when + // we don't have to. 
+ explicit constexpr pair(std::pair &&o) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(o.first)), second(std::move(o.second)) {} + + constexpr pair(T1 &&a, T2 &&b) noexcept( + noexcept(T1(std::move(std::declval()))) && + noexcept(T2(std::move(std::declval())))) + : first(std::move(a)), second(std::move(b)) {} + + template + constexpr pair(U1 &&a, U2 &&b) noexcept( + noexcept(T1(std::forward(std::declval()))) && + noexcept(T2(std::forward(std::declval())))) + : first(std::forward(a)), second(std::forward(b)) {} + + template +// MSVC 2015 produces error "C2476: ‘constexpr’ constructor does not initialize +// all members" +// if this constructor is constexpr +#if !ROBIN_HOOD(BROKEN_CONSTEXPR) + constexpr +#endif + pair(std::piecewise_construct_t /*unused*/, std::tuple a, + std::tuple + b) noexcept(noexcept(pair(std::declval &>(), + std::declval &>(), + ROBIN_HOOD_STD::index_sequence_for< + U1...>(), + ROBIN_HOOD_STD::index_sequence_for< + U2...>()))) + : pair(a, b, ROBIN_HOOD_STD::index_sequence_for(), + ROBIN_HOOD_STD::index_sequence_for()) { + } + + // constructor called from the std::piecewise_construct_t ctor + template + pair( + std::tuple &a, std::tuple &b, + ROBIN_HOOD_STD::index_sequence /*unused*/, + ROBIN_HOOD_STD::index_sequence< + I2...> /*unused*/) noexcept(noexcept(T1(std:: + forward(std::get( + std::declval< + std::tuple + &>()))...)) && + noexcept(T2(std::forward(std::get( + std::declval< + std::tuple &>()))...))) + : first(std::forward(std::get(a))...), + second(std::forward(std::get(b))...) { + // make visual studio compiler happy about warning about unused a & b. + // Visual studio's pair implementation disables warning 4100. 
+ (void)a; + (void)b; + } + + void swap(pair &o) noexcept((detail::swappable::nothrow::value) && + (detail::swappable::nothrow::value)) { + using std::swap; + swap(first, o.first); + swap(second, o.second); + } + + T1 first; // NOLINT(misc-non-private-member-variables-in-classes) + T2 second; // NOLINT(misc-non-private-member-variables-in-classes) +}; + +template +inline void swap(pair &a, pair &b) noexcept( + noexcept(std::declval &>().swap(std::declval &>()))) { + a.swap(b); +} + +template +inline constexpr bool operator==(pair const &x, pair const &y) { + return (x.first == y.first) && (x.second == y.second); +} +template +inline constexpr bool operator!=(pair const &x, pair const &y) { + return !(x == y); +} +template +inline constexpr bool +operator<(pair const &x, pair const &y) noexcept( + noexcept(std::declval() < std::declval()) && + noexcept(std::declval() < std::declval())) { + return x.first < y.first || (!(y.first < x.first) && x.second < y.second); +} +template +inline constexpr bool operator>(pair const &x, pair const &y) { + return y < x; +} +template +inline constexpr bool operator<=(pair const &x, pair const &y) { + return !(x > y); +} +template +inline constexpr bool operator>=(pair const &x, pair const &y) { + return !(x < y); +} + +inline size_t hash_bytes(void const *ptr, size_t len) noexcept { + static constexpr uint64_t m = UINT64_C(0xc6a4a7935bd1e995); + static constexpr uint64_t seed = UINT64_C(0xe17a1465); + static constexpr unsigned int r = 47; + + auto const *const data64 = static_cast(ptr); + uint64_t h = seed ^ (len * m); + + size_t const n_blocks = len / 8; + for (size_t i = 0; i < n_blocks; ++i) { + auto k = detail::unaligned_load(data64 + i); + + k *= m; + k ^= k >> r; + k *= m; + + h ^= k; + h *= m; + } + + auto const *const data8 = + reinterpret_cast(data64 + n_blocks); + switch (len & 7U) { + case 7: + h ^= static_cast(data8[6]) << 48U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 6: + h ^= static_cast(data8[5]) << 40U; + 
ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 5: + h ^= static_cast(data8[4]) << 32U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 4: + h ^= static_cast(data8[3]) << 24U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 3: + h ^= static_cast(data8[2]) << 16U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 2: + h ^= static_cast(data8[1]) << 8U; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + case 1: + h ^= static_cast(data8[0]); + h *= m; + ROBIN_HOOD(FALLTHROUGH); // FALLTHROUGH + default: + break; + } + + h ^= h >> r; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // h *= m; + // h ^= h >> r; + return static_cast(h); +} + +inline size_t hash_int(uint64_t x) noexcept { + // tried lots of different hashes, let's stick with murmurhash3. It's simple, + // fast, well tested, + // and doesn't need any special 128bit operations. + x ^= x >> 33U; + x *= UINT64_C(0xff51afd7ed558ccd); + x ^= x >> 33U; + + // not doing the final step here, because this will be done by keyToIdx + // anyways + // x *= UINT64_C(0xc4ceb9fe1a85ec53); + // x ^= x >> 33U; + return static_cast(x); +} + +// A thin wrapper around std::hash, performing an additional simple mixing step +// of the result. 
+template +struct hash : public std::hash { + size_t operator()(T const &obj) const noexcept(noexcept( + std::declval>().operator()(std::declval()))) { + // call base hash + auto result = std::hash::operator()(obj); + // return mixed of that, to be save against identity has + return hash_int(static_cast(result)); + } +}; + +template +struct hash> { + size_t operator()(std::basic_string const &str) const noexcept { + return hash_bytes(str.data(), sizeof(CharT) * str.size()); + } +}; + +#if ROBIN_HOOD(CXX) >= ROBIN_HOOD(CXX17) +template +struct hash> { + size_t operator()(std::basic_string_view const &sv) const noexcept { + return hash_bytes(sv.data(), sizeof(CharT) * sv.size()); + } +}; +#endif + +template +struct hash { + size_t operator()(T *ptr) const noexcept { + return hash_int(reinterpret_cast(ptr)); + } +}; + +template +struct hash> { + size_t operator()(std::unique_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash> { + size_t operator()(std::shared_ptr const &ptr) const noexcept { + return hash_int(reinterpret_cast(ptr.get())); + } +}; + +template +struct hash::value>::type> { + size_t operator()(Enum e) const noexcept { + using Underlying = typename std::underlying_type::type; + return hash{}(static_cast(e)); + } +}; + +#define ROBIN_HOOD_HASH_INT(T) \ + template <> \ + struct hash { \ + size_t operator()(T const &obj) const noexcept { \ + return hash_int(static_cast(obj)); \ + } \ + } + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wuseless-cast" +#endif +// see https://en.cppreference.com/w/cpp/utility/hash +ROBIN_HOOD_HASH_INT(bool); +ROBIN_HOOD_HASH_INT(char); +ROBIN_HOOD_HASH_INT(signed char); +ROBIN_HOOD_HASH_INT(unsigned char); +ROBIN_HOOD_HASH_INT(char16_t); +ROBIN_HOOD_HASH_INT(char32_t); +#if ROBIN_HOOD(HAS_NATIVE_WCHART) +ROBIN_HOOD_HASH_INT(wchar_t); +#endif +ROBIN_HOOD_HASH_INT(short); +ROBIN_HOOD_HASH_INT(unsigned 
short); +ROBIN_HOOD_HASH_INT(int); +ROBIN_HOOD_HASH_INT(unsigned int); +ROBIN_HOOD_HASH_INT(long); +ROBIN_HOOD_HASH_INT(long long); +ROBIN_HOOD_HASH_INT(unsigned long); +ROBIN_HOOD_HASH_INT(unsigned long long); +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif +namespace detail { + +template +struct void_type { + using type = void; +}; + +template +struct has_is_transparent : public std::false_type {}; + +template +struct has_is_transparent::type> + : public std::true_type {}; + +// using wrapper classes for hash and key_equal prevents the diamond problem +// when the same type +// is used. see https://stackoverflow.com/a/28771920/48181 +template +struct WrapHash : public T { + WrapHash() = default; + explicit WrapHash(T const &o) noexcept(noexcept(T(std::declval()))) + : T(o) {} +}; + +template +struct WrapKeyEqual : public T { + WrapKeyEqual() = default; + explicit WrapKeyEqual(T const &o) noexcept( + noexcept(T(std::declval()))) + : T(o) {} +}; + +// A highly optimized hashmap implementation, using the Robin Hood algorithm. +// +// In most cases, this map should be usable as a drop-in replacement for +// std::unordered_map, but +// be about 2x faster in most cases and require much less allocations. +// +// This implementation uses the following memory layout: +// +// [Node, Node, ... Node | info, info, ... infoSentinel ] +// +// * Node: either a DataNode that directly has the std::pair as +// member, +// or a DataNode with a pointer to std::pair. Which DataNode +// representation to use +// depends on how fast the swap() operation is. Heuristically, this is +// automatically choosen +// based on sizeof(). there are always 2^n Nodes. +// +// * info: Each Node in the map has a corresponding info byte, so there are 2^n +// info bytes. +// Each byte is initialized to 0, meaning the corresponding Node is empty. Set +// to 1 means the +// corresponding node contains data. 
Set to 2 means the corresponding Node is +// filled, but it +// actually belongs to the previous position and was pushed out because that +// place is already +// taken. +// +// * infoSentinel: Sentinel byte set to 1, so that iterator's ++ can stop at +// end() without the +// need for a idx variable. +// +// According to STL, order of templates has effect on throughput. That's why +// I've moved the +// boolean to the front. +// https://www.reddit.com/r/cpp/comments/ahp6iu/compile_time_binary_size_reductions_and_cs_future/eeguck4/ +template +class Table + : public WrapHash, + public WrapKeyEqual, + detail::NodeAllocator< + typename std::conditional< + std::is_void::value, Key, + robin_hood::pair< + typename std::conditional::type, + T>>::type, + 4, 16384, IsFlat> { + public: + static constexpr bool is_flat = IsFlat; + static constexpr bool is_map = !std::is_void::value; + static constexpr bool is_set = !is_map; + static constexpr bool is_transparent = + has_is_transparent::value && has_is_transparent::value; + + using key_type = Key; + using mapped_type = T; + using value_type = typename std::conditional< + is_set, Key, + robin_hood::pair::type, + T>>::type; + using size_type = size_t; + using hasher = Hash; + using key_equal = KeyEqual; + using Self = + Table; + + private: + static_assert(MaxLoadFactor100 > 10 && MaxLoadFactor100 < 100, + "MaxLoadFactor100 needs to be >10 && < 100"); + + using WHash = WrapHash; + using WKeyEqual = WrapKeyEqual; + + // configuration defaults + + // make sure we have 8 elements, needed to quickly rehash mInfo + static constexpr size_t InitialNumElements = sizeof(uint64_t); + static constexpr uint32_t InitialInfoNumBits = 5; + static constexpr uint8_t InitialInfoInc = 1U << InitialInfoNumBits; + static constexpr size_t InfoMask = InitialInfoInc - 1U; + static constexpr uint8_t InitialInfoHashShift = 0; + using DataPool = detail::NodeAllocator; + + // type needs to be wider than uint8_t. 
+ using InfoType = uint32_t; + + // DataNode //////////////////////////////////////////////////////// + + // Primary template for the data node. We have special implementations for + // small and big + // objects. For large objects it is assumed that swap() is fairly slow, so we + // allocate these + // on the heap so swap merely swaps a pointer. + template + class DataNode {}; + + // Small: just allocate on the stack. + template + class DataNode final { + public: + template + explicit DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + Args &&... args) noexcept(noexcept(value_type(std:: + forward( + args)...))) + : mData(std::forward(args)...) {} + + DataNode( + M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode + &&n) noexcept(std::is_nothrow_move_constructible::value) + : mData(std::move(n.mData)) {} + + // doesn't do anything + void destroy(M &ROBIN_HOOD_UNUSED(map) /*unused*/) noexcept {} + void destroyDoNotDeallocate() noexcept {} + + value_type const *operator->() const noexcept { return &mData; } + value_type *operator->() noexcept { return &mData; } + + const value_type &operator*() const noexcept { return mData; } + + value_type &operator*() noexcept { return mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData.first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData.second; + } + + void swap(DataNode &o) noexcept( + noexcept(std::declval().swap(std::declval()))) { 
+ mData.swap(o.mData); + } + + private: + value_type mData; + }; + + // big object: allocate on heap. + template + class DataNode { + public: + template + explicit DataNode(M &map, Args &&... args) : mData(map.allocate()) { + ::new (static_cast(mData)) + value_type(std::forward(args)...); + } + + DataNode(M &ROBIN_HOOD_UNUSED(map) /*unused*/, + DataNode &&n) noexcept : mData(std::move(n.mData)) {} + + void destroy(M &map) noexcept { + // don't deallocate, just put it into list of datapool. + mData->~value_type(); + map.deallocate(mData); + } + + void destroyDoNotDeallocate() noexcept { mData->~value_type(); } + + value_type const *operator->() const noexcept { return mData; } + + value_type *operator->() noexcept { return mData; } + + const value_type &operator*() const { return *mData; } + + value_type &operator*() { return *mData; } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type + getFirst() const noexcept { + return mData->first; + } + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getFirst() const + noexcept { + return *mData; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() noexcept { + return mData->second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::type getSecond() const + noexcept { + return mData->second; + } + + void swap(DataNode &o) noexcept { + using std::swap; + swap(mData, o.mData); + } + + private: + value_type *mData; + }; + + using Node = DataNode; + + // helpers for insertKeyPrepareEmptySpot: extract first entry (only const + // required) + ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(Node const &n) const noexcept { + return n.getFirst(); + } + + // in case we have void mapped_type, we are not using a pair, thus 
we just + // route k through. + // No need to disable this because it's just not used if not applicable. + ROBIN_HOOD(NODISCARD) + key_type const &getFirstConst(key_type const &k) const noexcept { return k; } + + // in case we have non-void mapped_type, we have a standard robin_hood::pair + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, key_type const &>::type + getFirstConst(value_type const &vt) const noexcept { + return vt.first; + } + + // Cloner ////////////////////////////////////////////////////////// + + template + struct Cloner; + + // fast path: Just copy data, without allocating anything. + template + struct Cloner { + void operator()(M const &source, M &target) const { + auto const *const src = reinterpret_cast(source.mKeyVals); + auto *tgt = reinterpret_cast(target.mKeyVals); + auto const numElementsWithBuffer = + target.calcNumElementsWithBuffer(target.mMask + 1); + std::copy(src, src + target.calcNumBytesTotal(numElementsWithBuffer), + tgt); + } + }; + + template + struct Cloner { + void operator()(M const &s, M &t) const { + auto const numElementsWithBuffer = + t.calcNumElementsWithBuffer(t.mMask + 1); + std::copy(s.mInfo, s.mInfo + t.calcNumBytesInfo(numElementsWithBuffer), + t.mInfo); + + for (size_t i = 0; i < numElementsWithBuffer; ++i) { + if (t.mInfo[i]) { + ::new (static_cast(t.mKeyVals + i)) Node(t, *s.mKeyVals[i]); + } + } + } + }; + + // Destroyer /////////////////////////////////////////////////////// + + template + struct Destroyer {}; + + template + struct Destroyer { + void nodes(M &m) const noexcept { m.mNumElements = 0; } + + void nodesDoNotDeallocate(M &m) const noexcept { m.mNumElements = 0; } + }; + + template + struct Destroyer { + void nodes(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. 
+ auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroy(m); + n.~Node(); + } + } + } + + void nodesDoNotDeallocate(M &m) const noexcept { + m.mNumElements = 0; + // clear also resets mInfo to 0, that's sometimes not necessary. + auto const numElementsWithBuffer = + m.calcNumElementsWithBuffer(m.mMask + 1); + for (size_t idx = 0; idx < numElementsWithBuffer; ++idx) { + if (0 != m.mInfo[idx]) { + Node &n = m.mKeyVals[idx]; + n.destroyDoNotDeallocate(); + n.~Node(); + } + } + } + }; + + // Iter //////////////////////////////////////////////////////////// + + struct fast_forward_tag {}; + + // generic iterator for both const_iterator and iterator. + template + // NOLINTNEXTLINE(hicpp-special-member-functions,cppcoreguidelines-special-member-functions) + class Iter { + private: + using NodePtr = + typename std::conditional::type; + + public: + using difference_type = std::ptrdiff_t; + using value_type = typename Self::value_type; + using reference = typename std::conditional::type; + using pointer = typename std::conditional::type; + using iterator_category = std::forward_iterator_tag; + + // default constructed iterator can be compared to itself, but WON'T return + // true when + // compared to end(). + Iter() = default; + + // Rule of zero: nothing specified. The conversion constructor is only + // enabled for + // iterator to const_iterator, so it doesn't accidentally work as a copy + // ctor. + + // Conversion constructor from iterator to const_iterator. 
+ template ::type> + // NOLINTNEXTLINE(hicpp-explicit-conversions) + Iter(Iter const &other) noexcept : mKeyVals(other.mKeyVals), + mInfo(other.mInfo) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr) noexcept : mKeyVals(valPtr), + mInfo(infoPtr) {} + + Iter(NodePtr valPtr, uint8_t const *infoPtr, + fast_forward_tag ROBIN_HOOD_UNUSED(tag) /*unused*/) noexcept + : mKeyVals(valPtr), + mInfo(infoPtr) { + fastForward(); + } + + template ::type> + Iter &operator=(Iter const &other) noexcept { + mKeyVals = other.mKeyVals; + mInfo = other.mInfo; + return *this; + } + + // prefix increment. Undefined behavior if we are at end()! + Iter &operator++() noexcept { + mInfo++; + mKeyVals++; + fastForward(); + return *this; + } + + Iter operator++(int)noexcept { + Iter tmp = *this; + ++(*this); + return tmp; + } + + reference operator*() const { return **mKeyVals; } + + pointer operator->() const { return &**mKeyVals; } + + template + bool operator==(Iter const &o) const noexcept { + return mKeyVals == o.mKeyVals; + } + + template + bool operator!=(Iter const &o) const noexcept { + return mKeyVals != o.mKeyVals; + } + + private: + // fast forward to the next non-free info byte + // I've tried a few variants that don't depend on intrinsics, but + // unfortunately they are + // quite a bit slower than this one. So I've reverted that change again. See + // map_benchmark. + void fastForward() noexcept { + size_t n = 0; + while (0U == (n = detail::unaligned_load(mInfo))) { + mInfo += sizeof(size_t); + mKeyVals += sizeof(size_t); + } +#if defined(ROBIN_HOOD_DISABLE_INTRINSICS) + // we know for certain that within the next 8 bytes we'll find a non-zero + // one. 
+ if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 4; + mKeyVals += 4; + } + if (ROBIN_HOOD_UNLIKELY(0U == detail::unaligned_load(mInfo))) { + mInfo += 2; + mKeyVals += 2; + } + if (ROBIN_HOOD_UNLIKELY(0U == *mInfo)) { + mInfo += 1; + mKeyVals += 1; + } +#else +#if ROBIN_HOOD(LITTLE_ENDIAN) + auto inc = ROBIN_HOOD_COUNT_TRAILING_ZEROES(n) / 8; +#else + auto inc = ROBIN_HOOD_COUNT_LEADING_ZEROES(n) / 8; +#endif + mInfo += inc; + mKeyVals += inc; +#endif + } + + friend class Table; + NodePtr mKeyVals{nullptr}; + uint8_t const *mInfo{nullptr}; + }; + + //////////////////////////////////////////////////////////////////// + + // highly performance relevant code. + // Lower bits are used for indexing into the array (2^n size) + // The upper 1-5 bits need to be a reasonable good hash, to save comparisons. + template + void keyToIdx(HashKey &&key, size_t *idx, InfoType *info) const { + // In addition to whatever hash is used, add another mul & shift so we get + // better hashing. + // This serves as a bad hash prevention, if the given data is + // badly mixed. + auto h = static_cast(WHash::operator()(key)); + + h *= mHashMultiplier; + h ^= h >> 33U; + + // the lower InitialInfoNumBits are reserved for info. + *info = mInfoInc + static_cast((h & InfoMask) >> mInfoHashShift); + *idx = (static_cast(h) >> InitialInfoNumBits) & mMask; + } + + // forwards the index by one, wrapping around at the end + void next(InfoType *info, size_t *idx) const noexcept { + *idx = *idx + 1; + *info += mInfoInc; + } + + void nextWhileLess(InfoType *info, size_t *idx) const noexcept { + // unrolling this by hand did not bring any speedups. + while (*info < mInfo[*idx]) { + next(info, idx); + } + } + + // Shift everything up by one element. Tries to move stuff around. 
+ void shiftUp(size_t startIdx, size_t const insertion_idx) noexcept( + std::is_nothrow_move_assignable::value) { + auto idx = startIdx; + ::new (static_cast(mKeyVals + idx)) + Node(std::move(mKeyVals[idx - 1])); + while (--idx != insertion_idx) { + mKeyVals[idx] = std::move(mKeyVals[idx - 1]); + } + + idx = startIdx; + while (idx != insertion_idx) { + ROBIN_HOOD_COUNT(shiftUp) + mInfo[idx] = static_cast(mInfo[idx - 1] + mInfoInc); + if (ROBIN_HOOD_UNLIKELY(mInfo[idx] + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + --idx; + } + } + + void shiftDown(size_t idx) noexcept( + std::is_nothrow_move_assignable::value) { + // until we find one that is either empty or has zero offset. + // TODO(martinus) we don't need to move everything, just the last one for + // the same + // bucket. + mKeyVals[idx].destroy(*this); + + // until we find one that is either empty or has zero offset. + while (mInfo[idx + 1] >= 2 * mInfoInc) { + ROBIN_HOOD_COUNT(shiftDown) + mInfo[idx] = static_cast(mInfo[idx + 1] - mInfoInc); + mKeyVals[idx] = std::move(mKeyVals[idx + 1]); + ++idx; + } + + mInfo[idx] = 0; + // don't destroy, we've moved it + // mKeyVals[idx].destroy(*this); + mKeyVals[idx].~Node(); + } + + // copy of find(), except that it returns iterator instead of const_iterator. + template + ROBIN_HOOD(NODISCARD) + size_t findIdx(Other const &key) const { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + do { + // unrolling this twice gives a bit of a speedup. More unrolling did not + // help. + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + if (info == mInfo[idx] && ROBIN_HOOD_LIKELY(WKeyEqual::operator()( + key, mKeyVals[idx].getFirst()))) { + return idx; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found! + return mMask == 0 + ? 
0 + : static_cast(std::distance( + mKeyVals, + reinterpret_cast_no_cast_align_warning(mInfo))); + } + + void cloneData(const Table &o) { + Cloner()(o, *this); + } + + // inserts a keyval that is guaranteed to be new, e.g. when the hashmap is + // resized. + // @return True on success, false if something went wrong + void insert_move(Node &&keyval) { + // we don't retry, fail if overflowing + // don't need to check max num elements + if (0 == mMaxNumElementsAllowed && !try_increase_info()) { + throwOverflowError(); + } + + size_t idx{}; + InfoType info{}; + keyToIdx(keyval.getFirst(), &idx, &info); + + // skip forward. Use <= because we are certain that the element is not + // there. + while (info <= mInfo[idx]) { + idx = idx + 1; + info += mInfoInc; + } + + // key not found, so we are now exactly where we want to insert it. + auto const insertion_idx = idx; + auto const insertion_info = static_cast(info); + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + auto &l = mKeyVals[insertion_idx]; + if (idx == insertion_idx) { + ::new (static_cast(&l)) Node(std::move(keyval)); + } else { + shiftUp(idx, insertion_idx); + l = std::move(keyval); + } + + // put at empty spot + mInfo[insertion_idx] = insertion_info; + + ++mNumElements; + } + + public: + using iterator = Iter; + using const_iterator = Iter; + + Table() noexcept(noexcept(Hash()) && noexcept(KeyEqual())) + : WHash(), WKeyEqual() { + ROBIN_HOOD_TRACE(this) + } + + // Creates an empty hash map. Nothing is allocated yet, this happens at the + // first insert. + // This tremendously speeds up ctor & dtor of a map that never receives an + // element. The + // penalty is payed at the first insert, and not before. Lookup of this empty + // map works + // because everybody points to DummyInfoByte::b. parameter bucket_count is + // dictated by the + // standard, but we can ignore it. 
+ explicit Table( + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/, const Hash &h = Hash{}, + const KeyEqual &equal = KeyEqual{}) noexcept(noexcept(Hash(h)) && + noexcept(KeyEqual(equal))) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + } + + template + Table(Iter first, Iter last, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(first, last); + } + + Table(std::initializer_list initlist, + size_t ROBIN_HOOD_UNUSED(bucket_count) /*unused*/ = 0, + const Hash &h = Hash{}, const KeyEqual &equal = KeyEqual{}) + : WHash(h), WKeyEqual(equal) { + ROBIN_HOOD_TRACE(this) + insert(initlist.begin(), initlist.end()); + } + + Table(Table &&o) noexcept : WHash(std::move(static_cast(o))), + WKeyEqual(std::move(static_cast(o))), + DataPool(std::move(static_cast(o))) { + ROBIN_HOOD_TRACE(this) + if (o.mMask) { + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + // set other's mask to 0 so its destructor won't do anything + o.init(); + } + } + + Table &operator=(Table &&o) noexcept { + ROBIN_HOOD_TRACE(this) + if (&o != this) { + if (o.mMask) { + // only move stuff if the other map actually has some data + destroy(); + mHashMultiplier = std::move(o.mHashMultiplier); + mKeyVals = std::move(o.mKeyVals); + mInfo = std::move(o.mInfo); + mNumElements = std::move(o.mNumElements); + mMask = std::move(o.mMask); + mMaxNumElementsAllowed = std::move(o.mMaxNumElementsAllowed); + mInfoInc = std::move(o.mInfoInc); + mInfoHashShift = std::move(o.mInfoHashShift); + WHash::operator=(std::move(static_cast(o))); + WKeyEqual::operator=(std::move(static_cast(o))); + 
DataPool::operator=(std::move(static_cast(o))); + + o.init(); + + } else { + // nothing in the other map => just clear us. + clear(); + } + } + return *this; + } + + Table(const Table &o) + : WHash(static_cast(o)), + WKeyEqual(static_cast(o)), + DataPool(static_cast(o)) { + ROBIN_HOOD_TRACE(this) + if (!o.empty()) { + // not empty: create an exact copy. it is also possible to just iterate + // through all + // elements and insert them, but copying is probably faster. + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mHashMultiplier = o.mHashMultiplier; + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + // no need for calloc because clonData does memcpy + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + } + } + + // Creates a copy of the given map. Copy constructor of each entry is used. + // Not sure why clang-tidy thinks this doesn't handle self assignment, it does + // NOLINTNEXTLINE(bugprone-unhandled-self-assignment,cert-oop54-cpp) + Table &operator=(Table const &o) { + ROBIN_HOOD_TRACE(this) + if (&o == this) { + // prevent assigning of itself + return *this; + } + + // we keep using the old allocator and not assign the new one, because we + // want to keep + // the memory available. when it is the same size. + if (o.empty()) { + if (0 == mMask) { + // nothing to do, we are empty too + return *this; + } + + // not empty: destroy what we have there + // clear also resets mInfo to 0, that's sometimes not necessary. 
+ destroy(); + init(); + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + + return *this; + } + + // clean up old stuff + Destroyer::value>{} + .nodes(*this); + + if (mMask != o.mMask) { + // no luck: we don't have the same array size allocated, so we need to + // realloc. + if (0 != mMask) { + // only deallocate if we actually have data! + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(o.mMask + 1); + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::malloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = static_cast( + detail::assertNotNull(std::malloc(numBytesTotal))); + + // no need for calloc here because cloneData performs a memcpy. + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + // sentinel is set in cloneData + } + WHash::operator=(static_cast(o)); + WKeyEqual::operator=(static_cast(o)); + DataPool::operator=(static_cast(o)); + mHashMultiplier = o.mHashMultiplier; + mNumElements = o.mNumElements; + mMask = o.mMask; + mMaxNumElementsAllowed = o.mMaxNumElementsAllowed; + mInfoInc = o.mInfoInc; + mInfoHashShift = o.mInfoHashShift; + cloneData(o); + + return *this; + } + + // Swaps everything between the two maps. + void swap(Table &o) { + ROBIN_HOOD_TRACE(this) + using std::swap; + swap(o, *this); + } + + // Clears all data, without resizing. + void clear() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + // don't do anything! also important because we don't want to write to + // DummyInfoByte::b, even though we would just write 0 to it. 
+ return; + } + + Destroyer::value>{} + .nodes(*this); + + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + // clear everything, then set the sentinel again + uint8_t const z = 0; + std::fill(mInfo, mInfo + calcNumBytesInfo(numElementsWithBuffer), z); + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // Destroys the map and all it's contents. + ~Table() { + ROBIN_HOOD_TRACE(this) + destroy(); + } + + // Checks if both tables contain the same entries. Order is irrelevant. + bool operator==(const Table &other) const { + ROBIN_HOOD_TRACE(this) + if (other.size() != size()) { + return false; + } + for (auto const &otherEntry : other) { + if (!has(otherEntry)) { + return false; + } + } + + return true; + } + + bool operator!=(const Table &other) const { + ROBIN_HOOD_TRACE(this) + return !operator==(other); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + const key_type &key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, std::forward_as_tuple(key), + std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + typename std::enable_if::value, Q &>::type operator[]( + key_type &&key) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) Node( + *this, 
std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = Node( + *this, std::piecewise_construct, + std::forward_as_tuple(std::move(key)), std::forward_as_tuple()); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + } + + return mKeyVals[idxAndState.first].getSecond(); + } + + template + void insert(Iter first, Iter last) { + for (; first != last; ++first) { + // value_type ctor needed because this might be called with std::pair's + insert(value_type(*first)); + } + } + + void insert(std::initializer_list ilist) { + for (auto &&vt : ilist) { + insert(std::move(vt)); + } + } + + template + std::pair emplace(Args &&... args) { + ROBIN_HOOD_TRACE(this) + Node n{*this, std::forward(args)...}; + auto idxAndState = insertKeyPrepareEmptySpot(getFirstConst(n)); + switch (idxAndState.second) { + case InsertionState::key_found: + n.destroy(*this); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::move(n)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = std::move(n); + break; + + case InsertionState::overflow_error: + n.destroy(*this); + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair try_emplace(const key_type &key, Args &&... args) { + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(key_type &&key, Args &&... args) { + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, + const key_type &key, Args &&... 
args) { + (void)hint; + return try_emplace_impl(key, std::forward(args)...); + } + + template + std::pair try_emplace(const_iterator hint, key_type &&key, + Args &&... args) { + (void)hint; + return try_emplace_impl(std::move(key), std::forward(args)...); + } + + template + std::pair insert_or_assign(const key_type &key, + Mapped &&obj) { + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(key_type &&key, Mapped &&obj) { + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + const key_type &key, + Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(key, std::forward(obj)); + } + + template + std::pair insert_or_assign(const_iterator hint, + key_type &&key, Mapped &&obj) { + (void)hint; + return insertOrAssignImpl(std::move(key), std::forward(obj)); + } + + std::pair insert(const value_type &keyval) { + ROBIN_HOOD_TRACE(this) + return emplace(keyval); + } + + std::pair insert(value_type &&keyval) { + return emplace(std::move(keyval)); + } + + // Returns 1 if key is found, 0 otherwise. + size_t count(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type count( + const OtherKey &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv != reinterpret_cast_no_cast_align_warning(mInfo)) { + return 1; + } + return 0; + } + + bool contains(const key_type &key) const { // NOLINT(modernize-use-nodiscard) + return 1U == count(key); + } + + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::type contains( + const OtherKey &key) const { + return 1U == count(key); + } + + // Returns a reference to the value found for key. 
+ // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q &>::type at( + key_type const &key) { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + // Returns a reference to the value found for key. + // Throws std::out_of_range if element cannot be found + template + // NOLINTNEXTLINE(modernize-use-nodiscard) + typename std::enable_if::value, Q const &>::type at( + key_type const &key) const { + ROBIN_HOOD_TRACE(this) + auto kv = mKeyVals + findIdx(key); + if (kv == reinterpret_cast_no_cast_align_warning(mInfo)) { + doThrow("key not found"); + } + return kv->getSecond(); + } + + const_iterator find( + const key_type &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + const_iterator find(const OtherKey &key, + is_transparent_tag /*unused*/) const { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if< + Self_::is_transparent, // NOLINT(modernize-use-nodiscard) + const_iterator>::type // NOLINT(modernize-use-nodiscard) + find(const OtherKey &key) const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return const_iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator find(const key_type &key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + iterator find(const OtherKey &key, is_transparent_tag /*unused*/) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + template + typename std::enable_if::type find( + const OtherKey 
&key) { + ROBIN_HOOD_TRACE(this) + const size_t idx = findIdx(key); + return iterator{mKeyVals + idx, mInfo + idx}; + } + + iterator begin() { + ROBIN_HOOD_TRACE(this) + if (empty()) { + return end(); + } + return iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + const_iterator begin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cbegin(); + } + const_iterator cbegin() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + if (empty()) { + return cend(); + } + return const_iterator(mKeyVals, mInfo, fast_forward_tag{}); + } + + iterator end() { + ROBIN_HOOD_TRACE(this) + // no need to supply valid info pointer: end() must not be dereferenced, and + // only node + // pointer is compared. + return iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + const_iterator end() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return cend(); + } + const_iterator cend() const { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return const_iterator{reinterpret_cast_no_cast_align_warning(mInfo), + nullptr}; + } + + iterator erase(const_iterator pos) { + ROBIN_HOOD_TRACE(this) + // its safe to perform const cast here + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + return erase(iterator{const_cast(pos.mKeyVals), + const_cast(pos.mInfo)}); + } + + // Erases element at pos, returns iterator to the next element. + iterator erase(iterator pos) { + ROBIN_HOOD_TRACE(this) + // we assume that pos always points to a valid entry, and not end(). 
+ auto const idx = static_cast(pos.mKeyVals - mKeyVals); + + shiftDown(idx); + --mNumElements; + + if (*pos.mInfo) { + // we've backward shifted, return this again + return pos; + } + + // no backward shift, return next element + return ++pos; + } + + size_t erase(const key_type &key) { + ROBIN_HOOD_TRACE(this) + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + + // check while info matches with the source idx + do { + if (info == mInfo[idx] && + WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + shiftDown(idx); + --mNumElements; + return 1; + } + next(&info, &idx); + } while (info <= mInfo[idx]); + + // nothing found to delete + return 0; + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // exactly the same as reserve(c). + void rehash(size_t c) { + // forces a reserve + reserve(c, true); + } + + // reserves space for the specified number of elements. Makes sure the old + // data fits. + // Exactly the same as rehash(c). Use rehash(0) to shrink to fit. + void reserve(size_t c) { + // reserve, but don't force rehash + reserve(c, false); + } + + // If possible reallocates the map to a smaller one. This frees the underlying + // table. + // Does not do anything if load_factor is too large for decreasing the table's + // size. + void compact() { + ROBIN_HOOD_TRACE(this) + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < mNumElements && newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. 
+ if (newSize < mMask + 1) { + rehashPowerOfTwo(newSize, true); + } + } + + size_type size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return mNumElements; + } + + size_type max_size() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(-1); + } + + ROBIN_HOOD(NODISCARD) bool empty() const noexcept { + ROBIN_HOOD_TRACE(this) + return 0 == mNumElements; + } + + float max_load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return MaxLoadFactor100 / 100.0F; + } + + // Average number of elements per bucket. Since we allow only 1 per bucket + float load_factor() const noexcept { // NOLINT(modernize-use-nodiscard) + ROBIN_HOOD_TRACE(this) + return static_cast(size()) / static_cast(mMask + 1); + } + + ROBIN_HOOD(NODISCARD) size_t mask() const noexcept { + ROBIN_HOOD_TRACE(this) + return mMask; + } + + ROBIN_HOOD(NODISCARD) + size_t calcMaxNumElementsAllowed(size_t maxElements) const noexcept { + if (ROBIN_HOOD_LIKELY(maxElements <= + (std::numeric_limits::max)() / 100)) { + return maxElements * MaxLoadFactor100 / 100; + } + + // we might be a bit inprecise, but since maxElements is quite large that + // doesn't matter + return (maxElements / 100) * MaxLoadFactor100; + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumBytesInfo(size_t numElements) const noexcept { + // we add a uint64_t, which houses the sentinel (first byte) and padding so + // we can load + // 64bit types. 
+ return numElements + sizeof(uint64_t); + } + + ROBIN_HOOD(NODISCARD) + size_t calcNumElementsWithBuffer(size_t numElements) const noexcept { + auto maxNumElementsAllowed = calcMaxNumElementsAllowed(numElements); + return numElements + + (std::min)(maxNumElementsAllowed, (static_cast(0xFF))); + } + + // calculation only allowed for 2^n values + ROBIN_HOOD(NODISCARD) size_t calcNumBytesTotal(size_t numElements) const { +#if ROBIN_HOOD(BITNESS) == 64 + return numElements * sizeof(Node) + calcNumBytesInfo(numElements); +#else + // make sure we're doing 64bit operations, so we are at least safe against + // 32bit overflows. + auto const ne = static_cast(numElements); + auto const s = static_cast(sizeof(Node)); + auto const infos = static_cast(calcNumBytesInfo(numElements)); + + auto const total64 = ne * s + infos; + auto const total = static_cast(total64); + + if (ROBIN_HOOD_UNLIKELY(static_cast(total) != total64)) { + throwOverflowError(); + } + return total; +#endif + } + + private: + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + auto it = find(e.first); + return it != end() && it->second == e.second; + } + + template + ROBIN_HOOD(NODISCARD) + typename std::enable_if::value, bool>::type + has(const value_type &e) const { + ROBIN_HOOD_TRACE(this) + return find(e) != end(); + } + + void reserve(size_t c, bool forceRehash) { + ROBIN_HOOD_TRACE(this) + auto const minElementsAllowed = (std::max)(c, mNumElements); + auto newSize = InitialNumElements; + while (calcMaxNumElementsAllowed(newSize) < minElementsAllowed && + newSize != 0) { + newSize *= 2; + } + if (ROBIN_HOOD_UNLIKELY(newSize == 0)) { + throwOverflowError(); + } + + ROBIN_HOOD_LOG("newSize > mMask + 1: " << newSize << " > " << mMask + << " + 1") + + // only actually do anything when the new size is bigger than the old one. + // This prevents to + // continuously allocate for each reserve() call. 
+ if (forceRehash || newSize > mMask + 1) { + rehashPowerOfTwo(newSize, false); + } + } + + // reserves space for at least the specified number of elements. + // only works if numBuckets if power of two + // True on success, false otherwise + void rehashPowerOfTwo(size_t numBuckets, bool forceFree) { + ROBIN_HOOD_TRACE(this) + + Node *const oldKeyVals = mKeyVals; + uint8_t const *const oldInfo = mInfo; + + const size_t oldMaxElementsWithBuffer = + calcNumElementsWithBuffer(mMask + 1); + + // resize operation: move stuff + initData(numBuckets); + if (oldMaxElementsWithBuffer > 1) { + for (size_t i = 0; i < oldMaxElementsWithBuffer; ++i) { + if (oldInfo[i] != 0) { + // might throw an exception, which is really bad since we are in the + // middle of + // moving stuff. + insert_move(std::move(oldKeyVals[i])); + // destroy the node but DON'T destroy the data. + oldKeyVals[i].~Node(); + } + } + + // this check is not necessary as it's guarded by the previous if, but it + // helps + // silence g++'s overeager "attempt to free a non-heap object 'map' + // [-Werror=free-nonheap-object]" warning. + if (oldKeyVals != + reinterpret_cast_no_cast_align_warning(&mMask)) { + // don't destroy old data: put it into the pool instead + if (forceFree) { + std::free(oldKeyVals); + } else { + DataPool::addOrFree(oldKeyVals, + calcNumBytesTotal(oldMaxElementsWithBuffer)); + } + } + } + } + + ROBIN_HOOD(NOINLINE) void throwOverflowError() const { +#if ROBIN_HOOD(HAS_EXCEPTIONS) + throw std::overflow_error("robin_hood::map overflow"); +#else + abort(); +#endif + } + + template + std::pair try_emplace_impl(OtherKey &&key, Args &&... 
args) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(args)...)); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + template + std::pair insertOrAssignImpl(OtherKey &&key, Mapped &&obj) { + ROBIN_HOOD_TRACE(this) + auto idxAndState = insertKeyPrepareEmptySpot(key); + switch (idxAndState.second) { + case InsertionState::key_found: + mKeyVals[idxAndState.first].getSecond() = std::forward(obj); + break; + + case InsertionState::new_node: + ::new (static_cast(&mKeyVals[idxAndState.first])) + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overwrite_node: + mKeyVals[idxAndState.first] = + Node(*this, std::piecewise_construct, + std::forward_as_tuple(std::forward(key)), + std::forward_as_tuple(std::forward(obj))); + break; + + case InsertionState::overflow_error: + throwOverflowError(); + break; + } + + return std::make_pair( + iterator(mKeyVals + idxAndState.first, mInfo + idxAndState.first), + InsertionState::key_found != idxAndState.second); + } + + void initData(size_t max_elements) { + mNumElements = 0; + mMask = max_elements - 1; + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(max_elements); + + auto const numElementsWithBuffer = 
calcNumElementsWithBuffer(max_elements); + + // calloc also zeroes everything + auto const numBytesTotal = calcNumBytesTotal(numElementsWithBuffer); + ROBIN_HOOD_LOG("std::calloc " << numBytesTotal << " = calcNumBytesTotal(" + << numElementsWithBuffer << ")") + mKeyVals = reinterpret_cast( + detail::assertNotNull(std::calloc(1, numBytesTotal))); + mInfo = reinterpret_cast(mKeyVals + numElementsWithBuffer); + + // set sentinel + mInfo[numElementsWithBuffer] = 1; + + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + enum class InsertionState { + overflow_error, + key_found, + new_node, + overwrite_node + }; + + // Finds key, and if not already present prepares a spot where to pot the key + // & value. + // This potentially shifts nodes out of the way, updates mInfo and number of + // inserted + // elements, so the only operation left to do is create/assign a new node at + // that spot. + template + std::pair insertKeyPrepareEmptySpot(OtherKey &&key) { + for (int i = 0; i < 256; ++i) { + size_t idx{}; + InfoType info{}; + keyToIdx(key, &idx, &info); + nextWhileLess(&info, &idx); + + // while we potentially have a match + while (info == mInfo[idx]) { + if (WKeyEqual::operator()(key, mKeyVals[idx].getFirst())) { + // key already exists, do NOT insert. + // see http://en.cppreference.com/w/cpp/container/unordered_map/insert + return std::make_pair(idx, InsertionState::key_found); + } + next(&info, &idx); + } + + // unlikely that this evaluates to true + if (ROBIN_HOOD_UNLIKELY(mNumElements >= mMaxNumElementsAllowed)) { + if (!increase_size()) { + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + continue; + } + + // key not found, so we are now exactly where we want to insert it. 
+ auto const insertion_idx = idx; + auto const insertion_info = info; + if (ROBIN_HOOD_UNLIKELY(insertion_info + mInfoInc > 0xFF)) { + mMaxNumElementsAllowed = 0; + } + + // find an empty spot + while (0 != mInfo[idx]) { + next(&info, &idx); + } + + if (idx != insertion_idx) { + shiftUp(idx, insertion_idx); + } + // put at empty spot + mInfo[insertion_idx] = static_cast(insertion_info); + ++mNumElements; + return std::make_pair( + insertion_idx, idx == insertion_idx ? InsertionState::new_node + : InsertionState::overwrite_node); + } + + // enough attempts failed, so finally give up. + return std::make_pair(size_t(0), InsertionState::overflow_error); + } + + bool try_increase_info() { + ROBIN_HOOD_LOG("mInfoInc=" << mInfoInc << ", numElements=" << mNumElements + << ", maxNumElementsAllowed=" + << calcMaxNumElementsAllowed(mMask + 1)) + if (mInfoInc <= 2) { + // need to be > 2 so that shift works (otherwise undefined behavior!) + return false; + } + // we got space left, try to make info smaller + mInfoInc = static_cast(mInfoInc >> 1U); + + // remove one bit of the hash, leaving more space for the distance info. + // This is extremely fast because we can operate on 8 bytes at once. + ++mInfoHashShift; + auto const numElementsWithBuffer = calcNumElementsWithBuffer(mMask + 1); + + for (size_t i = 0; i < numElementsWithBuffer; i += 8) { + auto val = unaligned_load(mInfo + i); + val = (val >> 1U) & UINT64_C(0x7f7f7f7f7f7f7f7f); + std::memcpy(mInfo + i, &val, sizeof(val)); + } + // update sentinel, which might have been cleared out! + mInfo[numElementsWithBuffer] = 1; + + mMaxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + return true; + } + + // True if resize was possible, false otherwise + bool increase_size() { + // nothing allocated yet? 
just allocate InitialNumElements + if (0 == mMask) { + initData(InitialNumElements); + return true; + } + + auto const maxNumElementsAllowed = calcMaxNumElementsAllowed(mMask + 1); + if (mNumElements < maxNumElementsAllowed && try_increase_info()) { + return true; + } + + ROBIN_HOOD_LOG("mNumElements=" + << mNumElements + << ", maxNumElementsAllowed=" << maxNumElementsAllowed + << ", load=" << (static_cast(mNumElements) * 100.0 / + (static_cast(mMask) + 1))) + + nextHashMultiplier(); + if (mNumElements * 2 < calcMaxNumElementsAllowed(mMask + 1)) { + // we have to resize, even though there would still be plenty of space + // left! + // Try to rehash instead. Delete freed memory so we don't steadyily + // increase mem in case + // we have to rehash a few times + rehashPowerOfTwo(mMask + 1, true); + } else { + // Each resize use a different hash so we don't so easily overflow. + // Make sure we only have odd numbers, so that the multiplication is + // reversible! + rehashPowerOfTwo((mMask + 1) * 2, false); + } + return true; + } + + void nextHashMultiplier() { + // adding an *even* number, so that the multiplier will always stay odd. + // This is necessary + // so that the hash stays a mixing function (and thus doesn't have any + // information loss). + mHashMultiplier += UINT64_C(0xc4ceb9fe1a85ec54); + } + + void destroy() { + if (0 == mMask) { + // don't deallocate! 
+ return; + } + + Destroyer::value>{} + .nodesDoNotDeallocate(*this); + + // This protection against not deleting mMask shouldn't be needed as it's + // sufficiently + // protected with the 0==mMask check, but I have this anyways because g++ 7 + // otherwise + // reports a compile error: attempt to free a non-heap object 'fm' + // [-Werror=free-nonheap-object] + if (mKeyVals != reinterpret_cast_no_cast_align_warning(&mMask)) { + ROBIN_HOOD_LOG("std::free") + std::free(mKeyVals); + } + } + + void init() noexcept { + mKeyVals = reinterpret_cast_no_cast_align_warning(&mMask); + mInfo = reinterpret_cast(&mMask); + mNumElements = 0; + mMask = 0; + mMaxNumElementsAllowed = 0; + mInfoInc = InitialInfoInc; + mInfoHashShift = InitialInfoHashShift; + } + + // members are sorted so no padding occurs + uint64_t mHashMultiplier = UINT64_C(0xc4ceb9fe1a85ec53); // 8 byte 8 + Node *mKeyVals = + reinterpret_cast_no_cast_align_warning(&mMask); // 8 byte 16 + uint8_t *mInfo = reinterpret_cast(&mMask); // 8 byte 24 + size_t mNumElements = 0; // 8 byte 32 + size_t mMask = 0; // 8 byte 40 + size_t mMaxNumElementsAllowed = 0; // 8 byte 48 + InfoType mInfoInc = InitialInfoInc; // 4 byte 52 + InfoType mInfoHashShift = InitialInfoHashShift; // 4 byte 56 + // 16 byte 56 if NodeAllocator +}; + +} // namespace detail + +// map + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_map = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_map = detail::Table< + sizeof(robin_hood::pair) <= sizeof(size_t) * 6 && + std::is_nothrow_move_constructible>::value && + std::is_nothrow_move_assignable>::value, + MaxLoadFactor100, Key, T, Hash, KeyEqual>; + +// set + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_flat_set = + 
detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_node_set = + detail::Table; + +template , + typename KeyEqual = std::equal_to, size_t MaxLoadFactor100 = 80> +using unordered_set = + detail::Table::value && + std::is_nothrow_move_assignable::value, + MaxLoadFactor100, Key, void, Hash, KeyEqual>; + +} // namespace robin_hood + +#endif diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 24bed277280839..e55fca403af3ac 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -301,8 +301,14 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS fast_threaded_ssa_graph_executor variable_helper) cc_library(executor_cache SRCS executor_cache.cc DEPS executor) -cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS - conditional_block_op executor) +if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor ${RPC_DEPS}) +else() + cc_test(dist_multi_trainer_test SRCS dist_multi_trainer_test.cc DEPS + conditional_block_op executor) +endif() cc_library(prune SRCS prune.cc DEPS framework_proto boost) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry @@ -369,36 +375,3 @@ cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) if(WITH_TESTING AND TEST selected_rows_test) set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) endif() - -##### 2.0 New custom op extension mechanism related ##### - -# if not deps `layer`, will cause: undefined symbol: _ZN6paddle10imperative7VarBase9name_set_ -if (WIN32) - set(PADDLE_CUSTOM_OP_MODULES custom_tensor op_meta_info custom_operator layer) - - set(PADDLE_CUSTOM_OP_SRCS - ${CMAKE_CURRENT_SOURCE_DIR}/custom_operator.cc - 
${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_tensor.cc - ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/ext_op_meta_info.cc - ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc) - set(PADDLE_CUSTOM_OP_SRCS ${PADDLE_CUSTOM_OP_SRCS} PARENT_SCOPE) - - cc_library(paddle_custom_op_shared - SHARED SRCS ${PADDLE_CUSTOM_OP_SRCS} DEPS ${PADDLE_CUSTOM_OP_MODULES}) - - get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - set_target_properties(paddle_custom_op_shared PROPERTIES OUTPUT_NAME paddle_custom_op) - target_link_libraries(paddle_custom_op_shared ${os_dependency_modules}) - - if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}) - else() - set(paddle_custom_op_lib_path ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}) - endif() - set(PADDLE_CUSTOM_OP_IMPORT_LIB - ${paddle_custom_op_lib_path}/paddle_custom_op.lib - CACHE INTERNAL "Paddle custom op import lib") - set(PADDLE_CUSTOM_OP_SHARED_LIB - ${paddle_custom_op_lib_path}/paddle_custom_op.dll - CACHE INTERNAL "Paddle custom op dll") -endif() diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 97d58df6dc5738..c4b833ec94c294 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -246,7 +246,7 @@ class CustomOperator : public OperatorWithKernel { * it can only be determined at runtime. 
*/ framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { + const framework::ExecutionContext& ctx) const override { return framework::OpKernelType(proto::VarType::RAW, ctx.GetPlace()); } @@ -257,7 +257,7 @@ class CustomOperator : public OperatorWithKernel { */ framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, - const OpKernelType& expected_kernel_type) { + const OpKernelType& expected_kernel_type) const override { return OpKernelType(expected_kernel_type.data_type_, expected_kernel_type.place_, tensor.layout()); } diff --git a/paddle/fluid/framework/custom_operator.h b/paddle/fluid/framework/custom_operator.h index 117841f80cf47e..259901c09f3e00 100644 --- a/paddle/fluid/framework/custom_operator.h +++ b/paddle/fluid/framework/custom_operator.h @@ -28,5 +28,8 @@ void LoadOpMetaInfoAndRegisterOp(const std::string& dso_name); void RegisterOperatorWithMetaInfoMap( const paddle::OpMetaInfoMap& op_meta_info_map); +// Interface for selective register custom op. 
+void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index 103dd0c5ae599b..0fdb97db20af99 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -354,8 +354,36 @@ void CheckVarHasNanOrInf(const std::string& op_type, var_name)); #endif return; - } + } else if (platform::is_npu_place(tensor->place())) { +#ifdef PADDLE_WITH_ASCEND_CL + if (tensor->type() != proto::VarType::FP32) { + return; + } + + framework::LoDTensor cpu_tensor; + cpu_tensor.Resize(tensor->dims()); + float* cpu_data = static_cast( + cpu_tensor.mutable_data(platform::CPUPlace(), tensor->type())); + framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor); + bool flag = false; + for (int i = 0; i < cpu_tensor.numel(); i++) { + if (isnan(cpu_data[i]) || isinf(cpu_data[i])) { + flag = true; + break; + } + } + PADDLE_ENFORCE_NE( + flag, true, + platform::errors::Fatal("Operator %s output Tensor %s contains Inf.", + op_type, var_name)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Tensor[%s] use npu place. 
PaddlePaddle must compile with NPU.", + var_name)); +#endif + return; + } tensor_check(op_type, var_name, *tensor, place); } diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 628b9f0d70f598..cd5de19bdc0887 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -638,7 +638,8 @@ class PSGPUWorker : public HogwildWorker { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) class SectionWorker : public DeviceWorker { public: SectionWorker() {} diff --git a/paddle/fluid/framework/device_worker_factory.cc b/paddle/fluid/framework/device_worker_factory.cc index a539a5d5f96b52..fb2323d96e2916 100644 --- a/paddle/fluid/framework/device_worker_factory.cc +++ b/paddle/fluid/framework/device_worker_factory.cc @@ -79,7 +79,8 @@ REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker); REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) REGISTER_DEVICE_WORKER_CLASS(SectionWorker); #endif } // namespace framework diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto old mode 100755 new mode 100644 index e6a7d74cc43433..654b88920acaf6 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -139,6 +139,10 @@ message PipelineConfig { optional string schedule_mode = 3 [ default = '1F1B' ]; } +message TensorParallelConfig { + optional int32 tensor_parallel_degree = 1 [ default = 1 ]; +} + message DistributedStrategy { // bool options optional Mode mode = 1 [ default = COLLECTIVE ]; @@ -169,6 +173,7 @@ message DistributedStrategy { optional bool sharding = 26 [ default = false ]; optional float last_comm_group_size_MB = 
27 [ default = 1 ]; optional bool find_unused_parameters = 28 [ default = true ]; + optional bool tensor_parallel = 29 [ default = false ]; optional RecomputeConfig recompute_configs = 101; optional AMPConfig amp_configs = 102; @@ -182,6 +187,7 @@ message DistributedStrategy { optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110; optional ShardingConfig sharding_configs = 111; optional HybridConfig hybrid_configs = 112; + optional TensorParallelConfig tensor_parallel_configs = 113; optional BuildStrategy build_strategy = 201; optional ExecutionStrategy execution_strategy = 202; } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 101991d2c1ba00..de007c128d7543 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -72,7 +72,7 @@ Executor::~Executor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } @@ -169,6 +169,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, bool force_disable_gc, bool keep_kid_scopes) { platform::RecordBlock b(block_id); if (FLAGS_use_mkldnn) EnableMKLDNN(pdesc); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif auto ctx = Prepare(pdesc, block_id, skip_ref_cnt_vars, force_disable_gc); RunPreparedContext(ctx.get(), scope, create_local_scope, create_vars, keep_kid_scopes); @@ -294,6 +297,9 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, const std::string& fetch_holder_name) { platform::RecordBlock b(kProgramId); if (FLAGS_use_mkldnn) EnableMKLDNN(program); +#ifdef PADDLE_WITH_MKLDNN + platform::AttachPointerHashToMKLDNNKey(this, place_); +#endif bool has_feed_ops = has_feed_operators(program.Block(0), *feed_targets, feed_holder_name); bool has_fetch_ops = @@ -456,11 +462,22 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx, 
#endif } else if (platform::is_npu_place(place_)) { #ifdef PADDLE_WITH_ASCEND_CL - // TODO(ascendrc): Support garbage collector on NPUPlace - VLOG(4) << "Skip NPU gc because it is not implemented now."; + if (IsFastEagerDeletionModeEnabled()) { + VLOG(4) << "Use unsafe fast gc for NPU."; + gc.reset(new NPUUnsafeFastGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Please set FLAGS_fast_eager_deletion_mode=true to use " + "GarbageCollector on NPU.")); + // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector. + VLOG(4) << "Use default stream gc for NPU."; + gc.reset(new NPUDefaultStreamGarbageCollector( + BOOST_GET_CONST(platform::NPUPlace, place_), max_memory_size)); + } #else - PADDLE_THROW(platform::errors::Unimplemented( - "No NPU gc found in CPU/GPU/XPU paddle")); + PADDLE_THROW( + platform::errors::Unimplemented("No NPU gc found in CPU/NPU paddle")); #endif } } @@ -565,7 +582,6 @@ void Executor::EnableMKLDNN(const ProgramDesc& program) { } } } - platform::AttachPointerHashToMKLDNNKey(this, place_); #else LOG(WARNING) << "'MKLDNN' is not supported, Please re-compile with WITH_MKLDNN option"; diff --git a/paddle/fluid/framework/fleet/CMakeLists.txt b/paddle/fluid/framework/fleet/CMakeLists.txt index c8517b9503741b..03dd2cff655c06 100644 --- a/paddle/fluid/framework/fleet/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/CMakeLists.txt @@ -43,6 +43,6 @@ cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_conte cc_test(test_fleet_cc SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell) -if(WITH_ASCEND) +if(WITH_ASCEND OR WITH_ASCEND_CL) cc_library(ascend_wrapper SRCS ascend_wrapper.cc DEPS framework_proto lod_tensor ascend_ge ascend_graph) -endif(WITH_ASCEND) +endif() diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.cc b/paddle/fluid/framework/fleet/ascend_wrapper.cc index d1b2f51f700363..273939f6bee613 100644 --- 
a/paddle/fluid/framework/fleet/ascend_wrapper.cc +++ b/paddle/fluid/framework/fleet/ascend_wrapper.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/fleet/ascend_wrapper.h b/paddle/fluid/framework/fleet/ascend_wrapper.h index baa2fd126a4b77..f749ee8cfa0baa 100644 --- a/paddle/fluid/framework/fleet/ascend_wrapper.h +++ b/paddle/fluid/framework/fleet/ascend_wrapper.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -29,7 +29,6 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" #include "ge/ge_api.h" -#include "ge/ge_api_types.h" #include "graph/attr_value.h" #include "graph/tensor.h" #include "graph/types.h" diff --git a/paddle/fluid/framework/fleet/heter_context.h b/paddle/fluid/framework/fleet/heter_context.h index 6f063e830c2da7..1fb2f0fab4aff9 100644 --- a/paddle/fluid/framework/fleet/heter_context.h +++ b/paddle/fluid/framework/fleet/heter_context.h @@ -77,6 +77,21 @@ class HeterContext { mutex_[i] = new std::mutex(); } } + + void Reset() { + for (size_t i = 0; i < feature_keys_.size(); ++i) { + feature_keys_[i].clear(); + } + for (size_t i = 0; i < value_ptr_.size(); ++i) { + value_ptr_[i].clear(); + } + for (size_t i = 0; i < device_values_.size(); ++i) { + device_values_[i].clear(); + } + for (size_t i = 0; i < device_keys_.size(); ++i) { + device_keys_[i].clear(); + } + } void batch_add_keys( const std::vector>& thread_keys) { assert(thread_keys.size() == feature_keys_.size()); @@ -90,6 +105,15 @@ class HeterContext { } } + void batch_add_keys(int shard_num, + const std::unordered_set& shard_keys) { + int idx = feature_keys_[shard_num].size(); + 
feature_keys_[shard_num].resize(feature_keys_[shard_num].size() + + shard_keys.size()); + std::copy(shard_keys.begin(), shard_keys.end(), + feature_keys_[shard_num].begin() + idx); + } + void UniqueKeys() { std::vector threads; auto unique_func = [this](int i) { diff --git a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt index 6df2cd52bb401d..67c44368b7aea4 100644 --- a/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt +++ b/paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt @@ -1,5 +1,13 @@ IF(WITH_GPU) - nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS cub device_context) + SET(HETERPS_DEPS device_context) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + SET(HETERPS_DEPS ${HETERPS_DEPS} cub) + endif() + if(WITH_PSCORE) + get_property(RPC_DEPS GLOBAL PROPERTY RPC_DEPS) + SET(HETERPS_DEPS ${HETERPS_DEPS} ${RPC_DEPS}) + endif() + nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc heter_resource.h hashtable.h DEPS ${HETERPS_DEPS}) nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm) nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm) ENDIF() diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index b7bb5110744649..67ff6b6acaefb2 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -103,12 +103,26 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, timeline.Start(); + threads.clear(); // merge thread_keys to shard_keys - for (size_t i = 0; i < thread_keys_.size(); i++) { - gpu_task->batch_add_keys(thread_keys_[i]); - for (int j = 0; j < thread_keys_thread_num_; j++) { - thread_keys_[i][j].clear(); + auto merge_ins_func = [this, gpu_task](int shard_num) { + for (int i = 0; i < thread_keys_thread_num_; ++i) { + gpu_task->batch_add_keys(shard_num, 
thread_keys_[i][shard_num]); + thread_keys_[i][shard_num].clear(); } + }; + + // for (size_t i = 0; i < thread_keys_.size(); i++) { + // gpu_task->batch_add_keys(thread_keys_[i]); + // for (int j = 0; j < thread_keys_thread_num_; j++) { + // thread_keys_[i][j].clear(); + // } + //} + for (int i = 0; i < thread_keys_shard_num_; ++i) { + threads.push_back(std::thread(merge_ins_func, i)); + } + for (auto& t : threads) { + t.join(); } timeline.Pause(); @@ -261,6 +275,7 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task, void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { int device_num = heter_devices_.size(); std::shared_ptr gpu_task = gpu_task_pool_.Get(); + gpu_task->Reset(); BuildTask(gpu_task, table_id, feature_dim); platform::Timer timeline; timeline.Start(); @@ -273,8 +288,8 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { size_max = std::max(size_max, feature_keys_count[i]); } if (HeterPs_) { - HeterPs_->show_one_table(0); - return; + delete HeterPs_; + HeterPs_ = nullptr; } std::vector threads(device_num); HeterPs_ = HeterPsBase::get_instance(size_max, resource_); @@ -295,6 +310,7 @@ void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim) { timeline.Pause(); VLOG(1) << "GpuPs build table total costs: " << timeline.ElapsedSec() << " s."; + gpu_task_pool_.Push(gpu_task); } void PSGPUWrapper::PullSparse(const paddle::platform::Place& place, diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc index 8dfbd3c268b866..9ab6b5d8c178b9 100644 --- a/paddle/fluid/framework/garbage_collector.cc +++ b/paddle/fluid/framework/garbage_collector.cc @@ -122,6 +122,32 @@ void CUDAPinnedGarbageCollector::ClearCallback( } #endif +#ifdef PADDLE_WITH_ASCEND_CL +NPUDefaultStreamGarbageCollector::NPUDefaultStreamGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUDefaultStreamGarbageCollector::Wait() 
const { + static_cast<platform::NPUDeviceContext *>(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void NPUDefaultStreamGarbageCollector::ClearCallback( + const std::function<void()> &callback) { + static_cast<platform::NPUDeviceContext *>(this->dev_ctx_) + ->AddStreamCallback(callback); +} +NPUUnsafeFastGarbageCollector::NPUUnsafeFastGarbageCollector( + const platform::NPUPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void NPUUnsafeFastGarbageCollector::ClearCallback( + const std::function<void()> &callback) { + callback(); +} + +#endif + int64_t GetEagerDeletionThreshold() { return FLAGS_eager_delete_tensor_gb < 0 ? -1 diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 572c79d21a045b..2c2b57bbe420a8 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -131,6 +131,28 @@ class CUDAPinnedGarbageCollector : public GarbageCollector { }; #endif +#ifdef PADDLE_WITH_ASCEND_CL +class NPUDefaultStreamGarbageCollector : public GarbageCollector { + public: + NPUDefaultStreamGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + void Wait() const override; + + protected: + void ClearCallback(const std::function<void()> &callback) override; +}; + +class NPUUnsafeFastGarbageCollector : public GarbageCollector { + public: + NPUUnsafeFastGarbageCollector(const platform::NPUPlace &place, + size_t max_memory_size); + + protected: + void ClearCallback(const std::function<void()> &callback) override; +}; +#endif + template <typename Container> void GarbageCollector::Add(Container &&objs) { Add(std::forward<Container>(objs), []() {}); diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0ca78c679aecaa..ab69170322ce3e 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -86,6 +86,7 @@ pass_library(quant_conv2d_dequant_fuse_pass inference) pass_library(shuffle_channel_detect_pass inference) pass_library(delete_quant_dequant_op_pass
inference) pass_library(delete_quant_dequant_filter_op_pass inference) +pass_library(delete_dropout_op_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.cc b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc new file mode 100644 index 00000000000000..09962239a01b18 --- /dev/null +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.cc @@ -0,0 +1,96 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include <string> + +#include "paddle/fluid/framework/ir/delete_dropout_op_pass.h" + +namespace paddle { +namespace framework { +class LoDTensor; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(any_op_out); \ + GET_IR_NODE(dropout_op); \ + GET_IR_NODE(dropout_op_out); \ + GET_IR_NODE(dropout_op_outmask); \ + GET_IR_NODE(any_op2); + +void DeleteDropoutOpPass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "delete_dropout_op_pattern"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::DeleteDropoutOpPattern pattern(gpd.mutable_pattern(), pattern_name); + pattern(); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + IR_NODE_LINK_TO(any_op_out, any_op2); + std::string any_op_out_name = any_op_out->Var()->Name(); + std::string dropout_op_out_name = dropout_op_out->Var()->Name(); + + auto* any_op2_desc = any_op2->Op(); + auto var_map = any_op2_desc->Inputs(); + std::string arg_name = ""; + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + arg_name = name_m.first; + } + } + if (arg_name.size() == 0) { + LOG(INFO) << "Delete dropout op pass: can not find the input " + << dropout_op_out_name; + return; + } + + // modify the any_op2's inputs + for (auto& name_m : var_map) { + if (std::find(name_m.second.begin(), name_m.second.end(), + dropout_op_out_name) != name_m.second.end()) { + std::vector<std::string> new_inputs; + for (auto& i_n : name_m.second) { + if (i_n != dropout_op_out_name) { + new_inputs.push_back(i_n); + } + } + new_inputs.push_back(any_op_out_name); + any_op2_desc->SetInput(name_m.first, new_inputs); + any_op2_desc->Flush(); + } + } + any_op2_desc->Flush(); + // Delete the unneeded nodes.
+ GraphSafeRemoveNodes(graph, + {dropout_op, dropout_op_out, dropout_op_outmask}); + }; + + gpd(graph, handler); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(delete_dropout_op_pass, + paddle::framework::ir::DeleteDropoutOpPass); diff --git a/paddle/fluid/framework/ir/delete_dropout_op_pass.h b/paddle/fluid/framework/ir/delete_dropout_op_pass.h new file mode 100644 index 00000000000000..c49abf3c871ced --- /dev/null +++ b/paddle/fluid/framework/ir/delete_dropout_op_pass.h @@ -0,0 +1,37 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class DeleteDropoutOpPass : public FusePassBase { + public: + virtual ~DeleteDropoutOpPass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 59d071e1034590..48f79e63b4f0ea 100644 --- a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -290,10 +290,20 @@ static int BuildFusion(Graph* graph, const std::string& name_scope ids.push_back(inner_pattern_ins[js[iter]].first->Name()); embs.push_back(inner_pattern_ins[js[iter]].second->Name()); } + OpDesc new_op_desc; new_op_desc.SetType("fused_embedding_eltwise_layernorm"); new_op_desc.SetInput("Ids", ids); new_op_desc.SetInput("Embs", embs); + + new_op_desc.SetInput("WordId", {ids[0]}); + new_op_desc.SetInput("PosId", {ids[1]}); + new_op_desc.SetInput("SentId", {ids[2]}); + + new_op_desc.SetInput("WordEmbedding", {embs[0]}); + new_op_desc.SetInput("PosEmbedding", {embs[1]}); + new_op_desc.SetInput("SentEmbedding", {embs[2]}); + new_op_desc.SetInput("Bias", {end_pattern_biases[k]->Name()}); new_op_desc.SetInput("Scale", {end_pattern_scales[k]->Name()}); new_op_desc.SetOutput("Out", {end_pattern_out[k]->Name()}); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index d74e8e5f65cd20..064da3d941602e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2439,6 +2439,29 @@ PDNode *patterns::TransposeFlattenConcat::operator()( 
return concat_out; } +void patterns::DeleteDropoutOpPattern::operator()() { + auto any_op_out = pattern->NewNode(any_op_out_repr()) + ->assert_is_op_input("dropout", "X") + ->AsInput(); + + auto dropout_op = + pattern->NewNode(dropout_op_repr())->assert_is_op("dropout"); + + auto dropout_op_out = pattern->NewNode(dropout_op_out_repr()) + ->assert_is_op_output("dropout", "Out") + ->AsIntermediate(); + + auto dropout_op_outmask = pattern->NewNode(dropout_op_outmask_repr()) + ->assert_is_op_output("dropout", "Mask") + ->AsOutput(); + auto any_op2 = pattern->NewNode(any_op2_repr())->assert_is_op()->AsOutput(); + + dropout_op->LinksFrom({any_op_out}); + dropout_op_out->LinksFrom({dropout_op}); + dropout_op_outmask->LinksFrom({dropout_op}); + any_op2->LinksFrom({dropout_op_out}); +} + void patterns::DeleteQuantOpFuse::operator()(PDNode *input_act_node, const std::string &quant_type) { auto *input_scale_node = pattern->NewNode(GetNodeName("input_scale_node")) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index cfac01ec9dedc8..13f65859954d58 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -1464,6 +1464,19 @@ struct ShuffleChannelPattern : public PatternBase { PATTERN_DECL_NODE(reshape2_out); }; +struct DeleteDropoutOpPattern : public PatternBase { + DeleteDropoutOpPattern(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "delete_dropout_op_pattern") {} + + void operator()(); + + PATTERN_DECL_NODE(any_op_out); + PATTERN_DECL_NODE(dropout_op); + PATTERN_DECL_NODE(dropout_op_out); + PATTERN_DECL_NODE(dropout_op_outmask); + PATTERN_DECL_NODE(any_op2); +}; + struct DeleteQuantDequantOpPattern : public PatternBase { DeleteQuantDequantOpPattern(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "delete_quantdequant_op_pattern") {} diff --git 
a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 0a70440765d44d..25bf03f426a1d9 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign"}; + "softsign", "silu"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc index 06df1caca35b92..4eb532b47cb4b5 100644 --- a/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/interpolate_mkldnn_pass.cc @@ -43,8 +43,9 @@ void InterpolateMKLDNNPass::ApplyImpl(ir::Graph* graph) const { int found_count = 0; const std::vector interpolate_op_types = { - "bilinear_interp", "nearest_interp", "trilinear_interp", "bicubic_interp", - "linear_interp"}; + "bilinear_interp", "nearest_interp", "trilinear_interp", + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "nearest_interp_v2"}; for (const Node* node : graph->Nodes()) { if (node->IsOp() && diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index 1e8349e878781d..57bee20247c964 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -753,7 +753,7 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul"); + transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul", "X"); auto* matmul_qk = 
pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); auto* matmul_qk_out_var = @@ -827,7 +827,7 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) ->assert_is_op_output("transpose2"); transpose2_1_out_var->AsIntermediate()->assert_is_op_input( - "matmul"); // link to matmul qk + "matmul", "Y"); // link to matmul qk // Third path to matmul auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); diff --git a/paddle/fluid/framework/ir/placement_pass_base.cc b/paddle/fluid/framework/ir/placement_pass_base.cc index fd604ffe7b5de4..35ba9200607799 100644 --- a/paddle/fluid/framework/ir/placement_pass_base.cc +++ b/paddle/fluid/framework/ir/placement_pass_base.cc @@ -77,7 +77,8 @@ bool PlacementPassBase::IsDefaultOpTypes(const std::string& op_type) const { // the corresponding pass. const std::vector not_default_op_types = { "bilinear_interp", "nearest_interp", "trilinear_interp", - "bicubic_interp", "linear_interp"}; + "bicubic_interp", "linear_interp", "bilinear_interp_v2", + "linear_interp_v2"}; bool is_interpolate_op = std::find(not_default_op_types.begin(), not_default_op_types.end(), op_type) != not_default_op_types.end(); diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5043fce8885cde..2fc39fd25d56c1 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -225,10 +225,13 @@ void FuseDequant(ir::Graph* graph, Scope* scope, quantized_op_type == "depthwise_conv2d") { PADDLE_ENFORCE_EQ( dequant_type, "fake_channel_wise_dequantize_max_abs", - platform::errors::InvalidArgument("conv2d op must be dequantized by " - "[fake_channel_wise_dequantize_max_" - "abs], but got %s", - dequant_type)); + platform::errors::InvalidArgument( + "conv2d op must be dequantized by " + "[fake_channel_wise_dequantize_max_abs], 
but got %s. " + "If you use PaddleSlim to generate the quantized " + "model, please set the 'weight_quantize_type' params as " + "'channel_wise_abs_max' and generate the quantized model again.", + dequant_type)); PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast<size_t>(w_dims[0]), platform::errors::InvalidArgument( diff --git a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc index 479df876fbe007..bf59c140005167 100644 --- a/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/repeated_fc_relu_fuse_pass.cc @@ -54,6 +54,17 @@ static bool IsFCWithAct(Node* n, const std::string& act_type = "relu") { return false; } +static bool IsFCWithPaddingWeights(Node* n) { + bool res = false; + if (n && n->IsOp() && n->Op() && n->Op()->Type() == "fc" && + n->inputs.size() == 3U && n->outputs.size() == 1U) { + if (n->Op()->HasAttr("padding_weights")) { + res = BOOST_GET_CONST(bool, n->Op()->GetAttr("padding_weights")); + } + } + return res; +} + static bool IsParamOfFC(Node* n, const std::string& param_name) { if (IsInputOfFC(n) && n->inputs.empty() && (n->Name() == n->outputs[0]->Op()->Input(param_name)[0])) { @@ -255,7 +266,7 @@ void BuildRepeatedFCReluPattern(PDPattern* pattern, fc_ops[i] = pattern->NewNode( [=](Node* x) { - if (!IsFCWithAct(x, "relu")) { + if (!IsFCWithAct(x, "relu") || IsFCWithPaddingWeights(x)) { return false; } auto* fc_out_var = x->outputs[0]; diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 3a79452e230ef4..0a6b5e44452fe1 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -268,6 +268,21 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, TensorToStream(os, static_cast<Tensor>(tensor), dev_ctx); } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const
platform::DeviceContext *dev_ctx; + auto place = tensor.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, tensor, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream &os, LoDTensor *tensor) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext *dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, tensor, *dev_ctx); +} + void DeserializeFromStream(std::istream &is, LoDTensor *tensor, const platform::DeviceContext &dev_ctx, const size_t &seek, diff --git a/paddle/fluid/framework/lod_tensor.h b/paddle/fluid/framework/lod_tensor.h index b8911154e6bf7b..6b357aba1c5f9a 100644 --- a/paddle/fluid/framework/lod_tensor.h +++ b/paddle/fluid/framework/lod_tensor.h @@ -14,16 +14,11 @@ limitations under the License. */ #pragma once +#include #include #include #include #include -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -#include -#include -#endif - -#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/mixed_vector.h" @@ -260,5 +255,9 @@ LoD ConvertToLengthBasedLoD(const LoD& offset_lod); LoD ConvertToOffsetBasedLoD(const LoD& length_lod); +void SerializeToStream(std::ostream& os, const LoDTensor& tensor); + +void DeserializeFromStream(std::ifstream& os, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index f107321958ba7b..7d55d8c41e3e92 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -128,7 +128,7 @@ NaiveExecutor::~NaiveExecutor() { #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // this is needed to have mkl-dnn unit tests working - ClearMKLDNNCache(place_); + ClearMKLDNNCache(place_, this); #endif } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index 7af5c54ceed74f..519bf8c633a013 100644 --- 
a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -447,6 +447,11 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } +void OpDesc::RemoveOutput(const std::string &name) { + outputs_.erase(name); + need_update_ = true; +} + bool OpDesc::HasProtoAttr(const std::string &name) const { auto &op_info = OpInfoMap::Instance(); if (op_info.Has(desc_.type())) { diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 95c33bca6c7f1d..1bc1a308e453bb 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -65,6 +65,7 @@ class OpDesc { void SetOutput(const std::string ¶m_name, const std::vector &args); + void RemoveOutput(const std::string &name); bool HasAttr(const std::string &name) const { return attrs_.find(name) != attrs_.end(); diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 4c529329761227..593d4d839fa910 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -25,7 +25,8 @@ limitations under the License. 
*/ #include #define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#include "glog/logging.h" // For VLOG() +#include "gflags/gflags.h" +#include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" #include "paddle/fluid/framework/grad_op_desc_maker.h" @@ -67,6 +68,8 @@ class Version; } // namespace framework } // namespace paddle +DECLARE_bool(check_kernel_launch); + namespace paddle { namespace framework { @@ -134,6 +137,19 @@ class OpRegistry { static std::unique_ptr CreateOp(const OpDesc& op_desc); }; +template +inline void CheckKernelLaunch(const char* op_type) {} + +#ifdef PADDLE_WITH_CUDA +template <> +inline void CheckKernelLaunch<::paddle::platform::CUDAPlace>( + const char* op_type) { + if (FLAGS_check_kernel_launch) { + PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(op_type); + } +} +#endif + template struct OpKernelRegistrarFunctor; @@ -162,8 +178,9 @@ struct OpKernelRegistrarFunctor { RegisterKernelClass( op_type, library_type, customized_type_value, - [](const framework::ExecutionContext& ctx) { + [op_type](const framework::ExecutionContext& ctx) { KERNEL_TYPE().Compute(ctx); + CheckKernelLaunch(op_type); }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor @@ -223,8 +240,13 @@ struct OpKernelRegistrarFunctorEx(op_type, library_type, - customized_type_value, Functor()); + RegisterKernelClass( + op_type, library_type, customized_type_value, + + [op_type](const framework::ExecutionContext& ctx) { + Functor()(ctx); + CheckKernelLaunch(op_type); + }); constexpr auto size = std::tuple_size>::value; @@ -343,6 +365,12 @@ struct OpKernelRegistrarFunctorEx &places, InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); - member_->use_device_ = exec_strategy.use_device_; - member_->build_strategy_ = build_strategy; - member_->use_all_reduce_ = member_->build_strategy_.reduce_ == - 
BuildStrategy::ReduceStrategy::kAllReduce; - member_->nranks_ = build_strategy.num_trainers_ * places.size(); - if (!member_->use_all_reduce_ && member_->nranks_ == 1) { - LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," - "the number of places should be greater than 1."; - member_->build_strategy_.reduce_ = - BuildStrategy::ReduceStrategy::kAllReduce; - member_->use_all_reduce_ = true; - } -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::Unavailable("Windows can support Single GPU only.")); - } -#endif - -#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ - (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) - if (member_->IsUseCUDA(member_->use_device_)) { - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. " - "Please recompile and turn on the WITH_NCCL option.")); - } -#endif - - std::string device_name; - if (member_->use_device_ == p::kCPU) { - device_name = "CPU"; - } else if (member_->use_device_ == p::kCUDA) { - device_name = "CUDA"; - } else { - device_name = "XPU"; - } - - VLOG(1) << string::Sprintf( - "The Program will be executed on %s using ParallelExecutor, %lu " - "cards are used, so %lu programs are executed in parallel.", - device_name, places.size(), places.size()); - - // Step 1. Bcast the bcast_vars to devs. 
- // Create local scopes - if (local_scopes.empty()) { - member_->own_local_scope_ = true; - member_->local_scopes_.emplace_back(member_->global_scope_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&scope->NewScope()); - } - } else { - member_->own_local_scope_ = false; - PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), - platform::errors::PreconditionNotMet( - "member_->places_.size() = %d is not equal to " - "local_scopes.size() = %d", - member_->places_.size(), local_scopes.size())); - for (size_t i = 0; i < member_->places_.size(); ++i) { - member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); - } - } - - std::vector graphs; - if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, - platform::errors::Unavailable( - "gpu mode does not support async_mode_ now!")); - graphs.push_back(graph); - for (size_t i = 1; i < places.size(); ++i) { - auto *tmp_graph = new ir::Graph(graph->OriginProgram()); - async_graphs_.emplace_back(tmp_graph); - graphs.push_back(tmp_graph); - } - } - - // FIXME(Yancey1989): parallel graph mode get better performance - // in GPU allreduce distributed training. Need an elegant way to - // choice the execution strategy. - member_->build_strategy_.enable_parallel_graph_ = - EnableParallelGraphExecution(*graph, exec_strategy, - member_->build_strategy_); - if (member_->build_strategy_.enable_parallel_graph_) { - LOG(INFO) << "The Executor would execute the graph by ParallelGraph " - "Execution which can get better performance," - << "you can force it off by env FLAGS_enable_parallel_graph=0"; - } + // Initialize necessary info of member_ with strategy. 
+ InitExecutorPrivateMemberInfo(exec_strategy, build_strategy, places.size(), + *graph); - if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); - - // Initialize device context's nccl comm, will be used by normal - // Operators like sync_batch_norm, and collective ops. - // NOTE: more than one ParallelExecutor with same place, the nccl comm will - // be rewrite and there will be some problem. - // NOTE: NCCL group-calls and non-group-calls can not use the same - // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use - // same communicators. - auto *nccl_ctxs = - member_->nccl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_nccl_comm(nccl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with CUDA.")); -#endif - } - if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { -#if defined(PADDLE_WITH_XPU_BKCL) - member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_); + // Step 1. 
Create local scopes and Clone graph into multi device + CreateLocalScopes(scope, local_scopes, /*create_new*/ true); + std::vector graphs = CloneGraphToMultiDevices(graph); + PrepareNCCLCommunicator(scope); - auto *bkcl_ctxs = - member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); - auto &pool = platform::DeviceContextPool::Instance(); - for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { - auto *dev_ctx = static_cast( - pool.Get(member_->places_[dev_id])); - auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); - dev_ctx->set_bkcl_context(bkcl_ctx.comm()); - } -#else - PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with XPU.")); -#endif - } // broadcast parameters from the 0th device to others: auto need_broadcast = [&]() -> bool { if (member_->build_strategy_.num_trainers_ > 1) { @@ -778,257 +651,75 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, } return false; }; - // Bcast Parameters to all GPUs if (need_broadcast()) { BCastParamsToDevices(bcast_vars, member_->build_strategy_.trainer_id_); } - // Startup Program has been run. All local scopes has correct parameters. - // Step 2. Convert main_program to SSA form and dependency graph. 
Also, insert // ncclOp - std::vector async_graphs(places.size()); -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->nccl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->nccl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->nccl_ctxs_); - } -#elif defined(PADDLE_WITH_XPU_BKCL) - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_device_, - member_->bkcl_ctxs_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); - } -#else - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use local async mode"; - graph = member_->build_strategy_.Apply( - graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_device_); - for (size_t i = 1; i < member_->places_.size(); ++i) { - graphs[i] = member_->build_strategy_.Apply( - graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 
1, member_->use_device_); - async_graphs[i] = graphs[i]; - } - } else { - graph = member_->build_strategy_.Apply( - graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_device_); - } -#endif - + std::vector async_graphs = + CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name); graph = member_->ApplyMemoryOptimizePass(graph); - async_graphs[0] = graph; // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; - for (auto &node : graph->Nodes()) { - if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { - var_infos.emplace_back(); - var_infos.back().name_ = node->Var()->Name(); - var_infos.back().type_ = node->Var()->GetType(); - var_infos.back().persistable_ = node->Var()->Persistable(); - - member_->is_persistable_.emplace(node->Var()->Name(), - node->Var()->Persistable()); - } - } - - if (graph->Has(details::kFusedVars)) { - auto &fused_vars = graph->Get(details::kFusedVars); - for (auto &fused_var : fused_vars) { - var_infos.emplace_back(); - var_infos.back() = fused_var.second; + CreateVariableInfos(&var_infos, graph); + std::unordered_map scope_map = + CreateLocalExecScopes(member_->local_scopes_, /*create_new*/ true); - member_->is_persistable_.emplace(fused_var.first, - fused_var.second.persistable_); - } - } + // Step 4. 
Create SSAGraph executor + std::vector final_graphs = + CreateSSAGraphExecutor(exec_strategy, &async_graphs, graph); - std::unordered_map scope_map; - for (auto *scope : member_->local_scopes_) { - auto &local_exec_scope = scope->NewScope(); - member_->local_exec_scopes_.emplace_back(&local_exec_scope); - scope_map.emplace(scope, &local_exec_scope); + VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; + if (!member_->build_strategy_.async_mode_) { + member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + std::move(var_infos), member_->places_, std::move(member_->executor_))); } - PADDLE_ENFORCE_EQ( - member_->local_scopes_.size(), member_->local_exec_scopes_.size(), - platform::errors::PreconditionNotMet( - "member_->local_scopes_.size() = %d is not equal to " - "member_->local_exec_scopes_.size() = %d", - member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + ResetOpHandleScopeMapOfGraphs(final_graphs, scope_map); + SetReaderOpDeviceInfoOfGraphs(final_graphs); +} - std::vector final_graphs; +void ParallelExecutor::BCastParamsToDevices( + const std::vector &vars, int trainer_id) const { + VLOG(3) << "BCastParamsToDevices"; + // the initializing bcast, all vars would be bcast from device(0). 
+ for (auto &var : vars) { + framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); + if (main_var == nullptr || !main_var->IsType()) { + continue; + } - if (member_->build_strategy_.async_mode_) { - VLOG(3) << "use AsyncSSAGraphExecutor"; - member_->executor_.reset(new details::AsyncSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, async_graphs)); - final_graphs = async_graphs; - } else if (member_->build_strategy_.enable_parallel_graph_) { - VLOG(3) << "use ParallelSSAGraphExecutor"; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // TODO(Yancey1989): Remove passing in the main_program when - // allreduce_seq_pass doesn't need it as the attr. - bool is_inference = details::IsDataParallelInferenceGraph(*graph); - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto &main_tensor = main_var->Get(); + if (!main_tensor.IsInitialized()) { + VLOG(3) << "one in var not inited, return!"; + continue; + } + auto &dims = main_tensor.dims(); + if (paddle::platform::is_gpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph); - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - - if (is_inference && member_->places_.size() > 1) { - member_->inference_executor_ = pg_exe; - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - } -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Paddle 
should be compiled with CUDA for ParallelGraph Execution.")); -#endif - } else { - bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); - auto possible_inference_graphs = - details::TrySeparateToMultipleSingleDeviceGraphs(graph); - if (!possible_inference_graphs.empty()) { - VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; - auto *pg_exe = new details::ParallelSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, std::move(possible_inference_graphs)); - if (!has_drop_last_read_op) { - VLOG(5) << "Enable partial feed support in inference phase"; - pg_exe->EnablePartialFeedSupport(); - } - final_graphs = pg_exe->Graphs(); - member_->executor_.reset(pg_exe); - member_->inference_executor_ = pg_exe; - } else { - LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) - << "drop_last=False for DataLoader is not supported in training " - "network. It is automatically turned to drop_last=True."; - if (exec_strategy.type_ == ExecutionStrategy::kDefault) { - VLOG(3) << "use ThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - member_->places_, graph)); - } else { - if (member_->use_device_ == p::kXPU) { -#if defined(PADDLE_WITH_XPU) - VLOG(3) << "use BindThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::BindThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); -#else - PADDLE_THROW(platform::errors::PermissionDenied( - "Paddle can't use XPU device since it's not compiled with XPU," - "Please recompile or reinstall Paddle with XPU support.")); -#endif - } else { - VLOG(3) << "use FastThreadedSSAGraphExecutor"; - member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, - member_->local_exec_scopes_, member_->places_, graph)); - } - } - 
final_graphs.emplace_back(graph); - } - } - - VLOG(3) << "use ScopeBufferedSSAGraphExecutor"; - if (!member_->build_strategy_.async_mode_) { - member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, - std::move(var_infos), member_->places_, std::move(member_->executor_))); - } - - for (auto *g : final_graphs) { - auto ops = ir::FilterByNodeWrapper(*g); - for (auto *op : ops) { - op->SetLocalExecScopes(scope_map); - } - } - - if (final_graphs.size() == 1) { - ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); - } else { - for (size_t i = 0; i < final_graphs.size(); ++i) { - ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); - } - } -} - -void ParallelExecutor::BCastParamsToDevices( - const std::vector &vars, int trainer_id) const { - VLOG(3) << "BCastParamsToDevices"; - // the initializing bcast, all vars would be bcast from device(0). - for (auto &var : vars) { - framework::Variable *main_var = member_->local_scopes_[0]->FindVar(var); - if (main_var == nullptr || !main_var->IsType()) { - continue; - } - - auto &main_tensor = main_var->Get(); - if (!main_tensor.IsInitialized()) { - VLOG(3) << "one in var not inited, return!"; - continue; - } - auto &dims = main_tensor.dims(); - if (paddle::platform::is_gpu_place(main_tensor.place())) { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - std::vector buffers; - buffers.reserve(member_->places_.size()); - size_t numel = main_tensor.numel(); - ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type()); - for (size_t i = 0; i < member_->places_.size(); ++i) { - auto place = member_->places_[i]; - void *buffer; - - if (i == 0 && trainer_id == 0) { - buffer = const_cast(main_tensor.data()); - } else { - auto local_scope = member_->local_scopes_[i]; - auto *t = local_scope->Var(var)->GetMutable(); - t->Resize(dims); - buffer = t->mutable_data(place, main_tensor.type()); - } - 
buffers.push_back(buffer); - } + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), platform::errors::PreconditionNotMet( @@ -1367,6 +1058,399 @@ bool ParallelExecutor::EnableParallelGraphExecution( return enable_parallel_graph; } +void ParallelExecutor::InitExecutorPrivateMemberInfo( + const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy, + size_t device_count, const ir::Graph &graph) { + member_->use_device_ = exec_strategy.use_device_; + member_->build_strategy_ = build_strategy; + member_->use_all_reduce_ = member_->build_strategy_.reduce_ == + BuildStrategy::ReduceStrategy::kAllReduce; + member_->nranks_ = build_strategy.num_trainers_ * device_count; + if (!member_->use_all_reduce_ && member_->nranks_ == 1) { + LOG(INFO) << "If you set build_strategy.reduce with 'Reduce'," + "the number of places should be greater than 1."; + member_->build_strategy_.reduce_ = + BuildStrategy::ReduceStrategy::kAllReduce; + member_->use_all_reduce_ = true; + } +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && defined(_WIN32) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::Unavailable("Windows can support Single GPU only.")); + } +#endif + +#if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && \ + (!defined(PADDLE_WITH_NCCL) && !defined(PADDLE_WITH_RCCL)) + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + device_count, 1, + platform::errors::PermissionDenied( + "Your machine has multiple cards, " + "but the WITH_NCCL option is not turned on during compilation, " + "and you cannot use multi-card training or prediction. 
" + "Please recompile and turn on the WITH_NCCL option.")); + } +#endif + + std::string device_name; + if (member_->use_device_ == p::kCPU) { + device_name = "CPU"; + } else if (member_->use_device_ == p::kCUDA) { + device_name = "CUDA"; + } else { + device_name = "XPU"; + } + + VLOG(1) << string::Sprintf( + "The Program will be executed on %s using ParallelExecutor, %lu " + "cards are used, so %lu programs are executed in parallel.", + device_name, device_count, device_count); + + // FIXME(Yancey1989): parallel graph mode get better performance + // in GPU allreduce distributed training. Need an elegant way to + // choice the execution strategy. + member_->build_strategy_.enable_parallel_graph_ = + EnableParallelGraphExecution(graph, exec_strategy, + member_->build_strategy_); + if (member_->build_strategy_.enable_parallel_graph_) { + LOG(INFO) << "The Executor would execute the graph by ParallelGraph " + "Execution which can get better performance," + << "you can force it off by env FLAGS_enable_parallel_graph=0"; + } +} + +void ParallelExecutor::CreateLocalScopes( + Scope *global_scope, const std::vector &local_scopes, + bool create_new) { + if (local_scopes.empty()) { + member_->own_local_scope_ = true; + member_->local_scopes_.emplace_back(global_scope); + for (size_t i = 1; i < member_->places_.size(); ++i) { + member_->local_scopes_.emplace_back(&global_scope->NewScope()); + } + } else { + member_->own_local_scope_ = false; + PADDLE_ENFORCE_EQ(member_->places_.size(), local_scopes.size(), + platform::errors::PreconditionNotMet( + "member_->places_.size() = %d is not equal to " + "local_scopes.size() = %d", + member_->places_.size(), local_scopes.size())); + for (size_t i = 0; i < member_->places_.size(); ++i) { + if (create_new) { + member_->local_scopes_.emplace_back(&local_scopes[i]->NewScope()); + } else { + // Use local scopes directly + member_->local_scopes_.emplace_back(local_scopes[i]); + } + } + } +} + +std::unordered_map 
ParallelExecutor::CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new) { + std::unordered_map scope_map; + + for (auto *scope : local_scopes) { + Scope *local_exec_scope = scope; + if (create_new) { + local_exec_scope = &scope->NewScope(); + } + member_->local_exec_scopes_.emplace_back(local_exec_scope); + scope_map.emplace(scope, local_exec_scope); + } + + PADDLE_ENFORCE_EQ( + member_->local_scopes_.size(), member_->local_exec_scopes_.size(), + platform::errors::PreconditionNotMet( + "member_->local_scopes_.size() = %d is not equal to " + "member_->local_exec_scopes_.size() = %d", + member_->local_scopes_.size(), member_->local_exec_scopes_.size())); + + return scope_map; +} + +std::vector ParallelExecutor::CloneGraphToMultiDevices( + ir::Graph *graph) { + std::vector graphs; + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, + platform::errors::Unavailable( + "gpu mode does not support async_mode_ now!")); + graphs.push_back(graph); + for (size_t i = 1; i < member_->places_.size(); ++i) { + auto *tmp_graph = new ir::Graph(graph->OriginProgram()); + async_graphs_.emplace_back(tmp_graph); + graphs.push_back(tmp_graph); + } + } + + return graphs; +} + +void ParallelExecutor::PrepareNCCLCommunicator(Scope *global_scope) { + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + member_->InitOrGetNCCLCommunicator(global_scope, &member_->build_strategy_); + + // Initialize device context's nccl comm, will be used by normal + // Operators like sync_batch_norm, and collective ops. + // NOTE: more than one ParallelExecutor with same place, the nccl comm will + // be rewrite and there will be some problem. + // NOTE: NCCL group-calls and non-group-calls can not use the same + // NCCL communicator, so for ParallelGraph and Multi-Process mode, re-use + // same communicators. 
+ auto *nccl_ctxs = member_->nccl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_nccl_comm(nccl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } + if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_XPU_BKCL) + member_->InitOrGetBKCLCommunicator(global_scope, member_->build_strategy_); + + auto *bkcl_ctxs = member_->bkcl_ctxs_->GetSyncBatchNormCtx( + global_scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); +#endif + } +} + +std::vector ParallelExecutor::CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *device_graphs, + const std::string &loss_var_name) { + auto device_count = member_->places_.size(); + std::vector async_graphs(device_count); + + auto &graphs = *device_graphs; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->nccl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + 
graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->nccl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (member_->build_strategy_.async_mode_) { + PADDLE_ENFORCE_EQ(graphs.size(), device_count, + platform::errors::PreconditionNotMet( + "graphs.size() shoule be %d, but received %d", + device_count, graphs.size())); + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); + } +#else + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_); + for (size_t i = 1; i < device_count; ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_); + } +#endif + + return async_graphs; +} + +void 
ParallelExecutor::CreateVariableInfos( + std::vector *var_infos, ir::Graph *graph) { + PADDLE_ENFORCE_EQ( + var_infos->size(), 0, + platform::errors::PreconditionNotMet( + "var_infos->size() shoule be 0, but received %d", var_infos->size())); + PADDLE_ENFORCE_EQ( + member_->is_persistable_.size(), 0, + platform::errors::PreconditionNotMet( + "member_->is_persistable_.size() shoule be 0, but received %d", + member_->is_persistable_.size())); + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + var_infos->emplace_back(); + var_infos->back().name_ = node->Var()->Name(); + var_infos->back().type_ = node->Var()->GetType(); + var_infos->back().persistable_ = node->Var()->Persistable(); + + member_->is_persistable_.emplace(node->Var()->Name(), + node->Var()->Persistable()); + } + } + + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + var_infos->emplace_back(); + var_infos->back() = fused_var.second; + + member_->is_persistable_.emplace(fused_var.first, + fused_var.second.persistable_); + } + } +} + +std::vector ParallelExecutor::CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph) { + std::vector final_graphs; + + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use AsyncSSAGraphExecutor"; + member_->executor_.reset(new details::AsyncSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, *async_graphs)); + final_graphs = *async_graphs; + } else if (member_->build_strategy_.enable_parallel_graph_) { + VLOG(3) << "use ParallelSSAGraphExecutor"; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // TODO(Yancey1989): Remove passing in the main_program when + // allreduce_seq_pass doesn't need it as the attr. 
+ bool is_inference = details::IsDataParallelInferenceGraph(*graph); + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph); + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + + if (is_inference && member_->places_.size() > 1) { + member_->inference_executor_ = pg_exe; + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + } +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Paddle should be compiled with CUDA for ParallelGraph Execution.")); +#endif + } else { + bool has_drop_last_read_op = details::HasDropLastReadOp(*graph); + auto possible_inference_graphs = + details::TrySeparateToMultipleSingleDeviceGraphs(graph); + if (!possible_inference_graphs.empty()) { + VLOG(5) << "Use ParallelSSAGraphExecutor in inference phase"; + auto *pg_exe = new details::ParallelSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, std::move(possible_inference_graphs)); + if (!has_drop_last_read_op) { + VLOG(5) << "Enable partial feed support in inference phase"; + pg_exe->EnablePartialFeedSupport(); + } + final_graphs = pg_exe->Graphs(); + member_->executor_.reset(pg_exe); + member_->inference_executor_ = pg_exe; + } else { + LOG_IF(WARNING, details::HasKeepLastReadOp(*graph)) + << "drop_last=False for DataLoader is not supported in training " + "network. 
It is automatically turned to drop_last=True."; + if (exec_strategy.type_ == ExecutionStrategy::kDefault) { + VLOG(3) << "use ThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::ThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } else { + VLOG(3) << "use FastThreadedSSAGraphExecutor"; + member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( + exec_strategy, member_->local_scopes_, member_->local_exec_scopes_, + member_->places_, graph)); + } + final_graphs.emplace_back(graph); + } + } + return final_graphs; +} + +void ParallelExecutor::ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map) { + PADDLE_ENFORCE_GE( + final_graphs.size(), 1, + platform::errors::PreconditionNotMet( + "final_graphs shoule contain at least one graph, but received %d", + final_graphs.size())); + + PADDLE_ENFORCE_GT(scope_map.size(), 0, + platform::errors::PreconditionNotMet( + "scope_map shoule contain at least one " + "element, but received %d", + scope_map.size())); + for (auto *g : final_graphs) { + auto ops = ir::FilterByNodeWrapper(*g); + for (auto *op : ops) { + op->SetLocalExecScopes(scope_map); + } + } +} + +void ParallelExecutor::SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs) { + if (final_graphs.size() == 1) { + ir::SetReaderOpDeviceInfo(final_graphs[0], member_->places_.size()); + } else { + for (size_t i = 0; i < final_graphs.size(); ++i) { + ir::SetReaderOpDeviceInfo(final_graphs[i], member_->places_.size(), i); + } + } +} + const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 47de7dc48f4f2c..d4d0b534b55f05 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/build_strategy.h" #include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_info.h" @@ -41,6 +42,7 @@ namespace framework { class ParallelExecutorPrivate; +using details::VariableInfo; using details::BuildStrategy; using details::ExecutionStrategy; namespace p = paddle::platform; @@ -93,6 +95,40 @@ class ParallelExecutor { const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy) const; + void InitExecutorPrivateMemberInfo(const ExecutionStrategy &exec_strategy, + const BuildStrategy &build_strategy, + size_t device_count, + const ir::Graph &graph); + + void CreateLocalScopes(Scope *global_scope, + const std::vector &local_scopes, + bool create_new); + + std::unordered_map CreateLocalExecScopes( + const std::vector &local_scopes, bool create_new); + + std::vector CloneGraphToMultiDevices(ir::Graph *graph); + + void PrepareNCCLCommunicator(Scope *global_scope); + + std::vector CompileGraphWithBuildStrategy( + ir::Graph *graph, std::vector *graphs, + const std::string &loss_var_name); + + void CreateVariableInfos(std::vector *var_infos, + ir::Graph *graph); + + std::vector CreateSSAGraphExecutor( + const ExecutionStrategy &exec_strategy, + std::vector *async_graphs, ir::Graph *graph); + + void ResetOpHandleScopeMapOfGraphs( + const std::vector &final_graphs, + const std::unordered_map &scope_map); + + void SetReaderOpDeviceInfoOfGraphs( + const std::vector &final_graphs); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; }; diff --git a/paddle/fluid/framework/pipeline_trainer.cc b/paddle/fluid/framework/pipeline_trainer.cc index 5968df548dfb0f..75c42fa3e52736 100644 --- a/paddle/fluid/framework/pipeline_trainer.cc +++ 
b/paddle/fluid/framework/pipeline_trainer.cc @@ -12,7 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/framework/data_feed_factory.h" #include "paddle/fluid/framework/device_worker_factory.h" #include "paddle/fluid/framework/trainer.h" @@ -34,7 +35,11 @@ void PipelineTrainer::Initialize(const TrainerDesc& trainer_desc, ParseDumpConfig(trainer_desc); const auto& section_config = section_params.section_config(); int place_id = section_config.place_id(); +#if (defined PADDLE_WITH_NCCL) place_ = platform::CUDAPlace(place_id); +#elif (defined PADDLE_WITH_ASCEND_CL) // NOLINT + place_ = platform::NPUPlace(place_id); +#endif worker_ = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); auto this_worker = diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index e740771e5ca9fc..00ff50abadd185 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -9,7 +9,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) #include #include "paddle/fluid/framework/device_worker.h" #include "paddle/fluid/framework/executor_gc_helper.h" diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 4c30c40ad58375..7e48d0dc5f9620 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -113,6 +113,21 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, TensorToStream(os, selected_rows.value(), dev_ctx); } +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + auto place = selected_rows.place(); + dev_ctx = pool.Get(place); + SerializeToStream(os, selected_rows, *dev_ctx); +} + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + const platform::DeviceContext* dev_ctx; + dev_ctx = pool.Get(platform::CPUPlace()); + DeserializeFromStream(os, selected_rows, *dev_ctx); +} + void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx) { { diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 48353b43f56cac..e53e3d973c5246 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -173,5 +173,9 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows, const platform::DeviceContext& dev_ctx); +void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows); + +void DeserializeFromStream(std::ifstream& os, SelectedRows* selected_rows); + } 
// namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index d6882b25d22588..78fd1af09e2945 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -822,6 +822,29 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, #else PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); +#endif + } else if (platform::is_npu_place(tensor.place())) { +#ifdef PADDLE_WITH_ASCEND_CL + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto& npu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + BOOST_GET_CONST(platform::NPUPlace, tensor.place()), + reinterpret_cast(data), size_to_write, + npu_dev_ctx.stream()); + npu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); #endif } else { os.write(static_cast(data_ptr), @@ -877,9 +900,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(shape)); framework::VisitDataType( @@ -888,13 +912,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, 
is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace is not supported when not compiled with NPU")); } #endif } else { @@ -935,9 +965,10 @@ void TensorFromStream(std::istream& is, Tensor* tensor, auto ctx = platform::CPUDeviceContext(); size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace()) || - platform::is_xpu_place(dev_ctx.GetPlace())) { + platform::is_xpu_place(dev_ctx.GetPlace()) || + platform::is_npu_place(dev_ctx.GetPlace())) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \ - defined(PADDLE_WITH_XPU) + defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_ASCEND_CL) Tensor cpu_tensor; cpu_tensor.Resize(framework::make_ddim(dims)); framework::VisitDataType( @@ -946,13 +977,19 @@ void TensorFromStream(std::istream& is, Tensor* tensor, is.read(static_cast(buf), size); auto dst_place = dev_ctx.GetPlace(); framework::TensorCopy(cpu_tensor, dst_place, dev_ctx, tensor); + if (platform::is_npu_place(dev_ctx.GetPlace())) { + dev_ctx.Wait(); + } #else if (platform::is_gpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "CUDAPlace is not supported when not compiled with CUDA")); - } else { + } else if (platform::is_xpu_place(dev_ctx.GetPlace())) { PADDLE_THROW(platform::errors::Unimplemented( "XPUPlace is not supported when not compiled with XPU")); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "NPUPlace 
is not supported when not compiled with NPU")); } #endif } else { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index 85af9e50087024..22c8e1c1665f12 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -159,11 +159,15 @@ void TensorFromVector(const std::vector& src, } #endif #ifdef PADDLE_WITH_ASCEND_CL + // NOTE(zhiqiu): Becareful that aclrtMemcpyAsync is different from + // cudaMemcpyAsync. + // cudaMemcpyAsync is actually "sync" between cpu <-> gpu. + // aclrtMemcpyAsync is really "async" between cpu <-> npu. + // Since vector is on cpu, I think this function should be a "sync" operation, + // so pass nullptr as stream to memory::Copy(). else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, - src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); } #endif } @@ -202,10 +206,8 @@ inline void TensorFromVector(const std::vector& src, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(dst_place)) { // NOLINT - memory::Copy( - BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, src_place, - src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, dst_place), dst_ptr, + src_place, src_ptr, size, nullptr); } #endif delete[] array; @@ -265,10 +267,9 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), - src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); } #endif } @@ -301,10 +302,9 @@ inline void TensorToVector(const 
Tensor& src, #endif #ifdef PADDLE_WITH_ASCEND_CL else if (platform::is_npu_place(src.place())) { // NOLINT - memory::Copy( - dst_place, dst_ptr, BOOST_GET_CONST(platform::NPUPlace, src.place()), - src_ptr, size, - reinterpret_cast(ctx).stream()); + memory::Copy(dst_place, dst_ptr, + BOOST_GET_CONST(platform::NPUPlace, src.place()), src_ptr, + size, nullptr); } #endif for (unsigned int i = 0; i < src.numel(); i++) { diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 7efb89ad7d9d9c..3ac36bd2e4a248 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -26,7 +26,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_feed.h" #include "paddle/fluid/framework/data_set.h" #include "paddle/fluid/framework/device_worker.h" -#include "paddle/fluid/framework/fleet/heter_context.h" #include "paddle/fluid/framework/fleet/heter_wrapper.h" #include "paddle/fluid/framework/heter_service.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -332,7 +331,8 @@ class PSGPUTrainer : public TrainerBase { }; #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) class PipelineTrainer : public TrainerBase { public: PipelineTrainer() {} diff --git a/paddle/fluid/framework/trainer_factory.cc b/paddle/fluid/framework/trainer_factory.cc index 6b9dbece8974c2..15073b6f78c5b3 100644 --- a/paddle/fluid/framework/trainer_factory.cc +++ b/paddle/fluid/framework/trainer_factory.cc @@ -76,7 +76,8 @@ REGISTER_TRAINER_CLASS(HeterBoxTrainer); (defined PADDLE_WITH_PSLIB) REGISTER_TRAINER_CLASS(PSGPUTrainer); #endif -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) REGISTER_TRAINER_CLASS(PipelineTrainer); #endif } // namespace framework diff --git a/paddle/fluid/framework/unused_var_check.cc 
b/paddle/fluid/framework/unused_var_check.cc index d2adbdd34512b3..0f8465ab8948e4 100644 --- a/paddle/fluid/framework/unused_var_check.cc +++ b/paddle/fluid/framework/unused_var_check.cc @@ -53,27 +53,28 @@ static const std::unordered_set &GetOpWithUnusedVarAllowSet() { // Use pointer here for safe static deinitialization static auto *allow_set = new std::unordered_set({ // called once - "batch_norm", // 0 - "batch_norm_grad", // 0 - "sync_batch_norm", // 0 - "sync_batch_norm_grad", // 0 - "inplace_abn", // 0 - "inplace_abn_grad", // 0 - "dgc_momentum", // 0 - "fake_quantize_range_abs_max", // 0 - "rmsprop", // 0 - "sequence_conv_grad", // 0 - "roi_perspective_transform_grad", // 0 - "fill_zeros_like", // 1 - "fill_any_like", // 1 - "nce_grad", // 1 - "precision_recall", // 1 - "fusion_seqpool_cvm_concat", // 2 - "fused_batch_norm_act", // 2 - "fused_batch_norm_act_grad", // 2 - "data_norm", // 0 - "data_norm_grad", // 0 - "update_loss_scaling", // 0 + "batch_norm", // 0 + "batch_norm_grad", // 0 + "sync_batch_norm", // 0 + "sync_batch_norm_grad", // 0 + "inplace_abn", // 0 + "inplace_abn_grad", // 0 + "dgc_momentum", // 0 + "fake_quantize_range_abs_max", // 0 + "rmsprop", // 0 + "sequence_conv_grad", // 0 + "roi_perspective_transform_grad", // 0 + "fill_zeros_like", // 1 + "fill_any_like", // 1 + "nce_grad", // 1 + "precision_recall", // 1 + "fusion_seqpool_cvm_concat", // 2 + "fused_batch_norm_act", // 2 + "fused_batch_norm_act_grad", // 2 + "data_norm", // 0 + "data_norm_grad", // 0 + "update_loss_scaling", // 0 + "fused_embedding_eltwise_layernorm", // 0 }); return *allow_set; } diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index fc754cbaf177c9..473df85aa0421e 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -36,6 +36,11 @@ #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#endif + #if defined(PADDLE_WITH_XPU_BKCL) #include "xpu/bkcl.h" 
#endif @@ -50,6 +55,10 @@ class Communicator; class NCCLCommunicator; #endif #endif +#ifdef PADDLE_WITH_ASCEND_CL +class Communicator; +class HCCLCommunicator; +#endif #if defined(PADDLE_WITH_XPU_BKCL) class BKCLCommunicator; @@ -162,6 +171,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #endif operators::CudnnRNNCache, #endif +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo, +#endif #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index a24c0ac09c7587..6bee3d44b2edd7 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -4,7 +4,7 @@ cc_library(prepared_operator SRCS prepared_operator.cc DEPS proto_desc operator cc_library(layer SRCS layer.cc DEPS prepared_operator math_function imperative_flag variable_helper op_registry) add_subdirectory(jit) cc_library(amp SRCS amp_auto_cast.cc DEPS layer ) -cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp) +cc_library(tracer SRCS tracer.cc DEPS layer engine program_desc_tracer amp denormal) cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index a56458b21398b3..fd2bb6e5c99522 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -26,7 +26,24 @@ class VarBase; AmpOperators::AmpOperators() : allow_ops_(new std::unordered_set()), - block_ops_(new std::unordered_set()) {} + block_ops_(new std::unordered_set()), + unsupported_fp16_ops_(new std::unordered_set()) { + auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); + auto fp16_dtype = framework::proto::VarType::FP16; + for (auto it = 
all_kernels.begin(); it != all_kernels.end(); it++) { + bool supported = false; + for (auto& kernel_type : it->second) { + if (platform::is_gpu_place(kernel_type.first.place_) && + kernel_type.first.data_type_ == fp16_dtype) { + supported = true; + } + } + if (!supported) { + unsupported_fp16_ops_->insert(it->first); + } + } +} + AmpOperators::~AmpOperators() {} AmpOperators& AmpOperators::Instance() { @@ -44,16 +61,26 @@ AmpOperators::GetMutableBlockOps() { return block_ops_; } +std::shared_ptr> +AmpOperators::GetMutableUnsupportedFp16Ops() { + return unsupported_fp16_ops_; +} + std::ostream& operator<<(std::ostream& os, AmpOperators& ops) { os << "allow ops: "; auto allow_ops = ops.GetMutableAllowOps(); std::copy((*allow_ops).begin(), (*allow_ops).end(), std::ostream_iterator(os, " ")); - os << "; "; + os << "\n"; os << "block ops: "; auto block_ops = ops.GetMutableBlockOps(); std::copy((*block_ops).begin(), (*block_ops).end(), std::ostream_iterator(os, " ")); + os << "\n"; + os << "unsupported fp16 ops: "; + auto unsupported_fp16_ops = ops.GetMutableUnsupportedFp16Ops(); + std::copy((*unsupported_fp16_ops).begin(), (*unsupported_fp16_ops).end(), + std::ostream_iterator(os, " ")); return os; } @@ -156,6 +183,12 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, return new_ins; } else { auto dst_type = GetPromoteType(ins); + // NOTE(zhiqiu): if the op has op fp16 kernel, fall back to fp32. + if (dst_type == framework::proto::VarType::FP16 && + AmpOperators::Instance().GetMutableUnsupportedFp16Ops()->count( + op_type)) { + dst_type = framework::proto::VarType::FP32; + } for (auto& pair : new_ins) { // NOTE(zhiqiu): batch_norm and layer_norm support only input x is fp16. 
if ((op_type == "batch_norm" || op_type == "layer_norm") && diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 619c6b0baf896f..fa76c19688a693 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -40,6 +40,9 @@ class AmpOperators { std::shared_ptr> GetMutableBlockOps(); + std::shared_ptr> + GetMutableUnsupportedFp16Ops(); + private: AmpOperators(); // forbid calling default constructor @@ -50,6 +53,9 @@ class AmpOperators { // The set of ops that support fp16 calculation and are considered numerically // dangerous and whose effects may also be observed in downstream ops. std::shared_ptr> block_ops_; + + // The set of ops that has no fp16 CUDA kennel. + std::shared_ptr> unsupported_fp16_ops_; }; std::ostream& operator<<(std::ostream& os, AmpOperators& ops); diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index d5350744e4c553..7bcc3d6c608c94 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -408,7 +408,8 @@ void BasicEngine::Execute() { VLOG(10) << "create temporary var of " << var->Name() << " for sum gradient within this graph!"; } else if (!inplace_grad_name_map.empty() && - inplace_grad_name_map.count(pair.first)) { + inplace_grad_name_map.count(pair.first) && + bwd_ins.count(inplace_grad_name_map.at(pair.first))) { // When calculate Inplace grad op, create a new output var. // If a tmp var has been created, there is no need to create it // again. 
@@ -470,12 +471,20 @@ void BasicEngine::Execute() { { VLOG(3) << "Start to execute grad op " << cur_op.Type(); - if (tmp_ins_ptr == nullptr) { - OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), - cur_op.place()); - } else { - OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, cur_op.Attrs(), - cur_op.place()); + try { + if (tmp_ins_ptr == nullptr) { + OpBase::Run(cur_op.InnerOp(), bwd_ins, tmp_outs, cur_op.Attrs(), + cur_op.place()); + } else { + OpBase::Run(cur_op.InnerOp(), *tmp_ins_ptr, tmp_outs, + cur_op.Attrs(), cur_op.place()); + } + } catch (platform::EnforceNotMet& exception) { + Clear(); + throw std::move(exception); + } catch (std::exception& ex) { + Clear(); + PADDLE_THROW(platform::errors::External("%s", ex.what())); } } diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 70359dc3fd25bf..a4af3117d3e32e 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -187,6 +187,7 @@ size_t VarBase::GradOpNum() const { } void VarBase::ClearGradient() { + VLOG(4) << "ClearGradient " << Name(); if (grad_var_) { if (grad_var_->Var().IsType()) { auto* grad_t = diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index b91fc460781c79..9f036742f0f5dd 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -35,7 +35,7 @@ namespace imperative { void NCCLParallelContext::BcastNCCLId( std::vector &nccl_ids, // NOLINT - int root) { + int root, int server_fd) { if (strategy_.local_rank_ == root) { std::vector other_trainers; for (auto &ep : strategy_.trainer_endpoints_) { @@ -45,11 +45,14 @@ void NCCLParallelContext::BcastNCCLId( } platform::SendBroadCastCommID(other_trainers, &nccl_ids); } else { - platform::RecvBroadCastCommID(strategy_.current_endpoint_, &nccl_ids); + platform::RecvBroadCastCommID(server_fd, strategy_.current_endpoint_, + &nccl_ids); } } void NCCLParallelContext::Init() { + int 
server_fd = -1; + std::vector nccl_ids; nccl_ids.resize(strategy_.nrings_); @@ -58,8 +61,13 @@ void NCCLParallelContext::Init() { for (size_t i = 0; i < nccl_ids.size(); ++i) { platform::dynload::ncclGetUniqueId(&nccl_ids[i]); } + } else { + // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server + // on rank0. + server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); } - BcastNCCLId(nccl_ids, 0); + BcastNCCLId(nccl_ids, 0, server_fd); int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; for (int ring_id = 0; ring_id < strategy_.nrings_; ring_id++) { @@ -80,14 +88,20 @@ void NCCLParallelContext::Init() { } void NCCLParallelContext::InitWithRingID(int ring_id) { + int server_fd = -1; std::vector nccl_ids; nccl_ids.resize(1); if (strategy_.local_rank_ == 0) { // generate the unique ncclid on the root worker platform::dynload::ncclGetUniqueId(&nccl_ids[0]); + } else { + // FIXME(wangxi): gloo will use rank0 endpoint, so not create socket server + // on rank0. 
+ server_fd = platform::SocketServer::GetInstance(strategy_.current_endpoint_) + .socket(); } - BcastNCCLId(nccl_ids, 0); + BcastNCCLId(nccl_ids, 0, server_fd); int gpu_id = BOOST_GET_CONST(platform::CUDAPlace, place_).device; VLOG(0) << "init nccl context nranks: " << strategy_.nranks_ diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index bcaeb811b108c5..1eee393aa714bb 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -49,7 +49,8 @@ class NCCLParallelContext : public ParallelContext { ~NCCLParallelContext() override = default; - void BcastNCCLId(std::vector& nccl_ids, int root); // NOLINT + void BcastNCCLId(std::vector& nccl_ids, int root, // NOLINT + int server_fd); void Init() override; diff --git a/paddle/fluid/imperative/py_layer_fwd.h b/paddle/fluid/imperative/py_layer_fwd.h index bd132f2576fec1..de5f9d75e9173a 100644 --- a/paddle/fluid/imperative/py_layer_fwd.h +++ b/paddle/fluid/imperative/py_layer_fwd.h @@ -63,15 +63,16 @@ std::shared_ptr CreateGradOpNode( } } -py::object PyLayerApply(const platform::Place& place, const py::object& cls, +py::object PyLayerApply(const platform::Place& place, const py::handle& cls, const py::args args, const py::kwargs kwargs) { + py::gil_scoped_acquire guard; auto bk_function = cls.attr("_backward_function"); auto context = bk_function(); auto forward = cls.attr("forward"); auto result_forward = forward(context, *args, **kwargs); std::shared_ptr py_layer_ctx = - std::make_shared(context.release().ptr()); + std::make_shared(context.ptr()); // make inputs to varbase std::vector> input_vars; // process args,`input_vars` only collect `imperative::VarBase` @@ -115,12 +116,12 @@ py::object PyLayerApply(const platform::Place& place, const py::object& cls, tuple_result[i].cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` should be 
`Tensor`.")); + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` can not be `None`.")); + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. } } } else { @@ -130,14 +131,18 @@ py::object PyLayerApply(const platform::Place& place, const py::object& cls, result_forward.cast>(); output_vars.push_back(temp_out); } catch (py::cast_error&) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` should be `Tensor`.")); + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.forward` can not be `None`.")); + // Only collect Tensor type in 'kwargs' and pass them to backward. + // Ignore other types of input temporarily. 
} } + if (output_vars.size() == 0) { + PADDLE_THROW(platform::errors::InvalidArgument( + "At least one output of `PyLayer.forward` is a `Tensor`.")); + } NameVarBaseMap outs = {{"Out", output_vars}}; diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index a92704ce447dc1..e3dd0a2aa75b41 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -443,10 +443,6 @@ void Reducer::PrepareDeps(const std::unordered_set &init_nodes) { auto *cur_node = q.front(); q.pop(); - for (auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); - } - const auto &grad_pending_nodes = cur_node->GradPendingNodes(); for (auto &grad_pending_node : grad_pending_nodes) { PADDLE_ENFORCE_NOT_NULL( @@ -523,7 +519,6 @@ void Reducer::PrepareForBackward( q.pop(); for (const auto &cur_op : *cur_node) { - cur_op.EnforceHasInOut(); auto &bwd_outs = cur_op.GetOutsMap(); for (const auto &pair : bwd_outs) { if (!pair.second.IsGrad()) { @@ -762,10 +757,11 @@ void Reducer::MarkGroupReady(size_t group_index) { // TODO(liuyuhui): Add try catch to deal with exception later, // otherwise the main thread will continue to run when an exception is // thrown in comm_pool_. 
- comm_pool_->enqueue([&] { + auto next_group = next_group_; + comm_pool_->enqueue([this, run_order, next_group, &group] { auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place_).device; platform::SetXPUDeviceId(dev_id); - FusedAllReduceSchedule(run_order, group, next_group_); + FusedAllReduceSchedule(run_order, group, next_group); { std::lock_guard lock(mutex_); comm_op_count_ -= 1; // lock diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 4967df5341d355..2d8a08217b0b83 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -15,6 +15,7 @@ #include // NOLINT #include "paddle/fluid/imperative/nccl_context.h" +#include "paddle/fluid/platform/gen_comm_id_helper.h" #include "gtest/gtest.h" @@ -36,9 +37,13 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void BcastNCCLId(int local_rank, std::vector* nccl_ids) { auto strategy = GetStrategy(local_rank); + int server_fd = platform::CreateListenSocket(strategy.current_endpoint_); + platform::CUDAPlace gpu(local_rank); imperative::NCCLParallelContext ctx(strategy, gpu); - ctx.BcastNCCLId(*nccl_ids, 0); + ctx.BcastNCCLId(*nccl_ids, 0, server_fd); + + platform::CloseSocket(server_fd); } TEST(BcastNCCLId, Run) { diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 777cb10e0754c3..41ad70e5a5741b 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/op_base.h" +#include "paddle/fluid/platform/denormal.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/string/string_helper.h" @@ -83,7 +84,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( if 
(gcs_.count(place) == 0) { std::unique_ptr gc; if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::DefaultStreamGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place), 0)); @@ -94,7 +95,7 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists( "Please recompile or reinstall Paddle with GPU support.")); #endif } else if (platform::is_cuda_pinned_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gc.reset(new framework::CUDAPinnedGarbageCollector( BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0)); @@ -134,6 +135,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, const platform::Place& place, bool trace_backward, const std::map& inplace_map) { platform::RecordEvent op_type_record_event(type); + platform::ScopedFlushDenormal flush; VLOG(1) << "Trace Op: " << type; if (FLAGS_use_mkldnn) { // if both lists are empty all ops are enabled (default for diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 93fd85f13cbf08..c002c7a10cb7b3 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -33,7 +33,7 @@ if (WITH_LITE) add_subdirectory(lite) endif() -# fluid_modules exclude API-interface of inference/api and inference/capi +# fluid_modules exclude API-interface of inference/api and inference/capi_exp get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # Adapt to custom op mechanism: Include the header files related to the data type @@ -61,7 +61,7 @@ if(NOT APPLE) endif() # C inference API -add_subdirectory(capi) +add_subdirectory(capi_exp) if(WITH_TESTING AND WITH_INFERENCE_API_TEST) add_subdirectory(tests/api) diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index bd27b1f5f34475..255c6ca75dfd74 100644 --- 
a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -213,6 +213,11 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); + DECL_ARGUMENT_FIELD(use_dlnne, UseDlnne, bool); + DECL_ARGUMENT_FIELD(dlnne_min_subgraph_size, DlnneMinSubgraphSize, int); + DECL_ARGUMENT_FIELD(dlnne_max_batch_size, DlnneMaxBatchSize, int); + DECL_ARGUMENT_FIELD(dlnne_workspace_size, DlnneWorkspaceSize, int); + DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, std::vector); DECL_ARGUMENT_FIELD(lite_ops_filter, LiteOpsFilter, std::vector); @@ -222,6 +227,11 @@ struct Argument { DECL_ARGUMENT_FIELD(use_xpu, UseXpu, bool); DECL_ARGUMENT_FIELD(xpu_l3_workspace_size, XpuL3WorkspaceSize, int); + DECL_ARGUMENT_FIELD(xpu_locked, XpuLocked, bool); + DECL_ARGUMENT_FIELD(xpu_autotune, XpuAutotune, bool); + DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string); + DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); + DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); // Memory optimized related. 
DECL_ARGUMENT_FIELD(enable_memory_optim, EnableMemoryOptim, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index a4e263e2f464c4..4bb08dc96b1cf5 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -106,8 +106,8 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_static_engine = argument->tensorrt_use_static_engine(); bool model_from_memory = argument->model_from_memory(); std::string optim_cache_dir = argument->optim_cache_dir(); - bool int8_valid = - !(model_from_memory && optim_cache_dir.empty() && enable_int8); + bool int8_valid = !(model_from_memory && optim_cache_dir.empty() && + enable_int8 && use_calib_mode); PADDLE_ENFORCE_EQ( int8_valid, true, platform::errors::PreconditionNotMet( @@ -166,6 +166,11 @@ void IRPassManager::CreatePasses(Argument *argument, // run fp16. pass->Set("disable_trt_plugin_fp16", new bool(argument->disable_trt_plugin_fp16())); + } else if (pass_name == "dlnne_subgraph_pass") { + pass->Set("min_subgraph_size", + new int(argument->dlnne_min_subgraph_size())); + pass->Set("program", + new framework::ProgramDesc *(&argument->main_program())); } if (pass_name == "lite_subgraph_pass") { bool enable_int8 = @@ -183,6 +188,12 @@ void IRPassManager::CreatePasses(Argument *argument, new int(argument->xpu_l3_workspace_size())); pass->Set("cpu_math_library_num_threads", new int(argument->cpu_math_library_num_threads())); + pass->Set("locked", new bool(argument->xpu_locked())); + pass->Set("autotune", new bool(argument->xpu_autotune())); + pass->Set("autotune_file", + new std::string(argument->xpu_autotune_file())); + pass->Set("precision", new std::string(argument->xpu_precision())); + pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); } disable_logs_ = argument->disable_logs(); if (pass_name == "fc_fuse_pass") { diff --git 
a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt index e35178428cc7ba..330f7a99847344 100644 --- a/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/ir_passes/CMakeLists.txt @@ -20,3 +20,15 @@ if (WITH_LITE) set(INFER_IR_PASSES ${INFER_IR_PASSES} lite_subgraph_pass CACHE INTERNAL "") cc_test(lite_subgraph_pass_tester SRCS lite_subgraph_pass_tester.cc DEPS lite_subgraph_pass gtest glog) endif() + +MESSAGE("WITH_DLNNE:${WITH_DLNNE}") +if(WITH_DLNNE) + cc_library(dlnne_subgraph_pass SRCS dlnne_subgraph_pass.cc DEPS ${analysis_deps} subgraph_util) + set(analysis_deps ${analysis_deps} + subgraph_util dlnne_subgraph_pass + CACHE INTERNAL "") + + set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h.tmp) + file(APPEND ${pass_file} "USE_PASS(dlnne_subgraph_pass);\n") + set(INFER_IR_PASSES ${INFER_IR_PASSES} dlnne_subgraph_pass CACHE INTERNAL "") +endif() diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h new file mode 100644 index 00000000000000..ae977c1403a879 --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h @@ -0,0 +1,21 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once + +namespace paddle { +namespace inference { + +int RegisterPyFunc(const std::string& name, void* pfn); +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc new file mode 100644 index 00000000000000..8f789139af9bfc --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.cc @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#include +#include +#include + +#include +#include + +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_reg_py.h" +#include "paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace inference { + +int (*PyConvertGraph)(const char *graph_name); + +int RegisterPyFunc(const std::string &name, void *pfn) { + if (name.compare("convert_graph") == 0) { + PyConvertGraph = reinterpret_cast(pfn); + } + + return 0; +} +int ConvertGraph(std::string graph_name) { + LOG(INFO) << "starting doing convert_graph"; + + PyConvertGraph(graph_name.c_str()); + + return 0; +} + +namespace analysis { + +using framework::ir::Node; + +void analysis::DlnneSubgraphPass::ApplyImpl(framework::ir::Graph *graph) const { + static std::unordered_set teller_set{ + "mul", "matmul", "conv2d", "pool2d", "relu", "softmax", "sigmoid", + "hard_swish", "depthwise_conv2d", "batch_norm", "concat", "tanh", "pad", + "elementwise_add", "elementwise_mul", "dropout", "prelu", + "conv2d_transpose", "leaky_relu", + // "fc", + "shuffle_channel", "swish", "split", + // "instance_norm", + "gelu", + // "layer_norm", + // "scale", + // "stack", + "relu6", "reshape2", "transpose2", "concat", "slice", + }; + + framework::ir::FusePassBase::Init("dlnne_subgraph_pass", graph); + + auto teller = [&](const framework::ir::Node *node) { + if (!node->IsOp() || !node->Op()) return false; + return teller_set.find(node->Op()->Type()) != teller_set.end(); + }; + + framework::ir::SubGraphFuser fuser( + graph, teller, Get("min_subgraph_size") /*min subgraph size*/, + "dlnne_engine"); + fuser(); + + std::vector graph_param_names = + ExtractParameters(graph->Nodes()); + // those parameter already exist in dlnne, and 
should not have another copy in + // fluid. + std::vector repetitive_params; + + for (auto *node : graph->Nodes()) { + if (node->IsOp() && !framework::ir::Agent(node).subgraph()->empty()) { + CreateDlnneOp(node, graph, graph_param_names, &repetitive_params); + + std::unordered_set nodes2remove( + framework::ir::Agent(node).subgraph()->begin(), + framework::ir::Agent(node).subgraph()->end()); + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); + } + } + + std::unordered_set nodes2remove; + for (auto *node : graph->Nodes()) { + if (node->IsOp() && framework::ir::Agent(node).deleted()) { + nodes2remove.insert(node); + } + } + framework::ir::GraphSafeRemoveNodes(graph, nodes2remove); +} + +std::string GenerateEngineKey(const std::set &engine_inputs, + const std::set &engine_outputs, + const std::string &predictor_id) { + std::string engine_hash_key = ""; + for (auto name : engine_inputs) { + engine_hash_key += name; + } + for (auto name : engine_outputs) { + engine_hash_key += name; + } + engine_hash_key += predictor_id; + auto engine_key = std::to_string(std::hash()(engine_hash_key)); + return engine_key; +} +std::string replace_name(std::string name, const char *raw, + const char *new_char) { + std::string r_name = name; + int pos = r_name.find(raw); + while (pos >= 0) { + r_name = r_name.replace(pos, 1, new_char); + pos = r_name.find(raw); + } + return r_name; +} + +void DlnneSubgraphPass::CreateDlnneOp( + framework::ir::Node *node, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const { + auto *op_desc = node->Op(); + auto &subgraph = *framework::ir::Agent(node).subgraph(); + PADDLE_ENFORCE_EQ(subgraph.empty(), false, + platform::errors::PreconditionNotMet( + "The subgraph should not be empty.")); + + // A fake block desc. 
+ framework::proto::BlockDesc block_proto; + framework::BlockDesc block_desc(nullptr, &block_proto); + block_desc.Proto()->set_parent_idx(-1); + block_desc.Proto()->set_idx(0); + LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + // for debug + framework::ProgramDesc tmp_dump_program_desc; + auto *tmp_dump_main_block = tmp_dump_program_desc.MutableBlock(0); + + std::unordered_map name_var_desc; + std::set name_var_input_nodes; + std::set name_var_output_nodes; + std::set name_ops; + + for (auto *node : subgraph) { + auto *op = block_desc.AppendOp(); + *op->Proto() = *node->Op()->Proto(); + + // debug + { + name_ops.insert(node->Name()); + auto *tmp_dump_new_block_op = tmp_dump_main_block->AppendOp(); + + framework::OpDesc op_desc; + op_desc.CopyFrom(*node->Op()); + + for (auto argument_name : op_desc.InputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + for (auto argument_name : op_desc.OutputArgumentNames()) { + if (std::count(graph_params.begin(), graph_params.end(), + argument_name) > 0) { + op_desc.Rename(argument_name, replace_name(argument_name, "/", ".")); + } + } + *tmp_dump_new_block_op->Proto() = *op_desc.Proto(); + + for (auto *x : node->inputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_input_nodes.insert(x->Name()); + } + + for (auto *x : node->outputs) { + if (x->IsVar()) { + name_var_desc[x->Name()] = x->Var(); + } + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) == + 0) + name_var_output_nodes.insert(x->Name()); + } + } + } + std::set valid_input_names; + std::set valid_output_names; + for (auto name : name_var_output_nodes) { + if (name_var_input_nodes.find(name) == name_var_input_nodes.end()) { + valid_output_names.insert(name); + } + } + + for (auto name : 
name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + valid_input_names.insert(name); + } + } + + // Then, we will use the input_names_with_id and output_names_with_id to + // generate the engine key. + // So, We use set instead of unordered_set here to ensure that the engine key + // is unique. + std::set input_names; + std::set input_names_with_id; + std::vector params; + // if we delete fluid copy of params shared by more than 1 ops, there will be + // problem, so we filter them out. + + // The node->inputs contains input tensors and parameters. + for (auto *x : node->inputs) { + input_names.insert(x->Name()); + input_names_with_id.insert(x->Name() + std::to_string(x->id())); + if (std::count(graph_params.begin(), graph_params.end(), x->Name()) > 0) { + params.push_back(x->Name()); + } + } + + std::set output_names; + std::set output_names_with_id; + std::vector origin_output_dims; + for (auto *x : node->outputs) { + origin_output_dims.push_back(x->Var()->GetShape().size()); + output_names.insert(x->Name()); + output_names_with_id.insert(x->Name() + std::to_string(x->id())); + } + + std::unordered_map output_name_map; + std::unordered_map graph_var_map; + + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + graph_var_map[node->Name()] = node; + } + } + + // Set attrs + op_desc->SetType("dlnne_engine"); + op_desc->SetInput("Xs", std::vector(valid_input_names.begin(), + valid_input_names.end())); + + op_desc->SetOutput("Ys", std::vector(valid_output_names.begin(), + valid_output_names.end())); + + op_desc->SetAttr("parameters", params); + auto engine_key = GenerateEngineKey(input_names_with_id, output_names_with_id, + std::to_string(0)); + op_desc->SetAttr("engine_key", engine_key); + auto *scope = param_scope(); + + { + std::set input_names; + + for (auto name : name_var_input_nodes) { + if (name_var_output_nodes.find(name) == name_var_output_nodes.end()) { + 
input_names.insert(name); + } + } + + // add feed to subgraph: + int input_idx = 0; + for (auto input_name : input_names) { + auto *feed0 = tmp_dump_main_block->AppendOp(); + feed0->SetType("feed"); + feed0->SetInput("X", {"feed"}); + feed0->SetOutput("Out", {input_name}); + feed0->SetAttr("col", input_idx); + input_idx++; + } + // add fetch to subgraph: + int output_idx = 0; + for (auto output_name : valid_output_names) { + auto *fetch0 = tmp_dump_main_block->AppendOp(); + fetch0->SetType("fetch"); + fetch0->SetInput("X", {output_name}); + fetch0->SetOutput("Out", {"out"}); + fetch0->SetAttr("col", output_idx); + output_idx++; + } + + mkdir("./dump", 0777); + std::string dir_name = "./dump/" + engine_key; + mkdir(dir_name.c_str(), 0777); + ofstream m_stream; + m_stream.open(dir_name + "/__model__", ios::out); + + VLOG(4) << "name_var_desc size:" << name_var_desc.size(); + + for (auto &kv : name_var_desc) { + auto *new_add_var = tmp_dump_main_block->Proto()->add_vars(); + *new_add_var = *kv.second->Proto(); + auto *variable_tmp = scope->FindVar(kv.first); + if (variable_tmp != nullptr) { + *new_add_var->mutable_name() = replace_name(kv.first, "/", "."); + new_add_var->set_persistable(true); + } else { + new_add_var->set_persistable(false); + } + } + + for (auto param_name : params) { + auto *var = scope->FindVar(param_name); + if (var != nullptr) { + auto *var_t = var->GetMutable(); + ofstream p_stream; + p_stream.open(dir_name + "/" + replace_name(param_name, "/", "."), + ios::out); + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(var_t->place()); + framework::SerializeToStream(p_stream, *var_t, dev_ctx); + p_stream.close(); + } + } + + std::string model; + + tmp_dump_program_desc.Proto()->SerializeToString(&model); + m_stream << model; + m_stream.close(); + + op_desc->SetBlockAttr("sub_block", tmp_dump_main_block); + op_desc->SetAttr("subgraph", model); + op_desc->Flush(); + + ConvertGraph(engine_key); + 
} +} + +} // namespace analysis +} // namespace inference +} // namespace paddle + +REGISTER_PASS(dlnne_subgraph_pass, + paddle::inference::analysis::DlnneSubgraphPass); diff --git a/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h new file mode 100644 index 00000000000000..5a1d2506fdb09b --- /dev/null +++ b/paddle/fluid/inference/analysis/ir_passes/dlnne_subgraph_pass.h @@ -0,0 +1,55 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#pragma once +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/inference/analysis/ir_passes/subgraph_util.h" +#include "paddle/fluid/inference/api/paddle_analysis_config.h" + +namespace paddle { +namespace framework { +namespace ir { +class Graph; +class Node; +} // namespace ir +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { + +int ConvertGraph(std::string graph_name); + +namespace analysis { + +class DlnneSubgraphPass : public framework::ir::FusePassBase { + public: + void ApplyImpl(framework::ir::Graph *graph) const override; + + private: + void CleanIntermediateOutputs(framework::ir::Node *node); + void CreateDlnneOp(framework::ir::Node *x, framework::ir::Graph *graph, + const std::vector &graph_params, + std::vector *repetitive_params) const; +}; + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index c697914904b3e9..b8cac8992f4eed 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -245,6 +245,11 @@ void LiteSubgraphPass::SetUpEngine( bool use_xpu = Get("use_xpu"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); + bool locked = Get("locked"); + bool autotune = Get("autotune"); + std::string autotune_file = Get("autotune_file"); + std::string precision = Get("precision"); + bool adaptive_seqlen = Get("adaptive_seqlen"); lite_api::TargetType target_type; if (use_gpu) { @@ -282,6 +287,11 @@ void LiteSubgraphPass::SetUpEngine( }; config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; + 
config.locked = locked; + config.autotune = autotune; + config.autotune_file = autotune_file; + config.precision = precision; + config.adaptive_seqlen = adaptive_seqlen; if (dump_model) { lite::StrToBinaryFile("./model.bin", config.model); lite::StrToBinaryFile("./param.bin", config.param); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 60de4234b41a8b..f57f07883dcd70 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -20,6 +20,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" namespace paddle { @@ -321,11 +322,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( opt_input_shape = {}; } - if (min_input_shape.size() > 0 && TRT_VERSION > 6000) { + auto to_major_version = [&](int full_version) -> float { + return (full_version / 100) / 10.0; + }; + const float compile_time_trt_version = to_major_version(TRT_VERSION); + const float run_time_trt_version = + to_major_version(tensorrt::GetInferLibVersion()); + if (compile_time_trt_version != run_time_trt_version) { LOG_FIRST_N(WARNING, 1) - << "The Paddle lib links the " << TRT_VERSION << " version TensorRT, " - << "make sure the runtime TensorRT you are using is no less than this " - "version, otherwise, there might be Segfault!"; + << "The Paddle Inference library is compiled with " + << compile_time_trt_version << " version TensorRT, " + << "but the runtime TensorRT you are using is " << run_time_trt_version + << " version. " + "This might cause serious compatibility issues. 
We strongly " + "recommend using the same TRT version at runtime."; } // Setting the disable_trt_plugin_fp16 to true means that TRT plugin will not diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 03f86cc7ba6de6..82c95ba2c95712 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -32,10 +32,10 @@ cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) if(WITH_CRYPTO) cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto) + analysis_config zero_copy_tensor trainer_desc_proto paddle_crypto custom_operator) else() cc_library(paddle_inference_api SRCS api.cc api_impl.cc helper.cc DEPS lod_tensor scope reset_tensor_array - analysis_config zero_copy_tensor trainer_desc_proto) + analysis_config zero_copy_tensor trainer_desc_proto custom_operator) endif() if(WIN32) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 0622fb27d9e38c..853c1ac1da8742 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -26,6 +26,7 @@ namespace paddle { struct MkldnnQuantizerConfig; extern const std::vector kTRTSubgraphPasses; +extern const std::vector kDlnneSubgraphPasses; extern const std::vector kLiteSubgraphPasses; PassStrategy *AnalysisConfig::pass_builder() const { @@ -95,9 +96,17 @@ void AnalysisConfig::DisableFCPadding() { Update(); } -void AnalysisConfig::EnableXpu(int l3_workspace_size) { +void AnalysisConfig::EnableXpu(int l3_workspace_size, bool locked, + bool autotune, const std::string &autotune_file, + const std::string &precision, + bool adaptive_seqlen) { use_xpu_ = true; xpu_l3_workspace_size_ = l3_workspace_size; + xpu_locked_ = locked; + xpu_autotune_ = autotune; + xpu_autotune_file_ = autotune_file; + xpu_precision_ = 
precision; + xpu_adaptive_seqlen_ = adaptive_seqlen; Update(); } @@ -134,6 +143,9 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); CP_MEMBER(trt_use_oss_); + // Dlnne related + CP_MEMBER(use_dlnne_); + CP_MEMBER(dlnne_min_subgraph_size_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -157,6 +169,11 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(use_xpu_); CP_MEMBER(xpu_l3_workspace_size_); + CP_MEMBER(xpu_locked_); + CP_MEMBER(xpu_autotune_); + CP_MEMBER(xpu_autotune_file_); + CP_MEMBER(xpu_precision_); + CP_MEMBER(xpu_adaptive_seqlen_); // profile related. CP_MEMBER(with_profile_); @@ -211,6 +228,21 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { pass_builder_->DeletePass(ps); } } + if (use_dlnne_) { + auto all_passes = kDlnneSubgraphPasses; + auto other_passes = other.pass_builder()->AllPasses(); + // We should sort them, because the user may call the SwitchIrDebug + // interface, which will change the pass. 
+ std::sort(all_passes.begin(), all_passes.end()); + std::sort(other_passes.begin(), other_passes.end()); + std::vector deleted_passes; + std::set_difference(all_passes.begin(), all_passes.end(), + other_passes.begin(), other_passes.end(), + std::inserter(deleted_passes, deleted_passes.begin())); + for (auto ps : deleted_passes) { + pass_builder_->DeletePass(ps); + } + } +} void AnalysisConfig::EnableCUDNN() { @@ -309,6 +341,12 @@ void AnalysisConfig::EnableTensorRtEngine( #endif } +void AnalysisConfig::EnableDlnne(int min_subgraph_size) { + use_dlnne_ = true; + dlnne_min_subgraph_size_ = min_subgraph_size; + Update(); +} + void AnalysisConfig::SetTRTDynamicShapeInfo( std::map> min_input_shape, std::map> max_input_shape, @@ -383,6 +421,14 @@ void AnalysisConfig::Update() { pass_builder()->AppendPass(pass); } } + LOG(INFO) << "use_dlnne_:" << use_dlnne_; + if (use_dlnne_) { + pass_builder()->ClearPasses(); + for (const auto &pass : kDlnneSubgraphPasses) { + pass_builder()->AppendPass(pass); + } + } + if (use_gpu() && use_cudnn_) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!enable_ir_optim_) { @@ -479,6 +525,9 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << tensorrt_max_batchsize_; ss << tensorrt_min_subgraph_size_; + ss << use_dlnne_; + ss << dlnne_min_subgraph_size_; + for (auto &op : trt_disabled_ops_) ss << op.c_str(); ss << ";"; @@ -512,6 +561,11 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << use_lite_; ss << use_xpu_; ss << xpu_l3_workspace_size_; + ss << xpu_locked_; + ss << xpu_autotune_; + ss << xpu_autotune_file_; + ss << xpu_precision_; + ss << xpu_adaptive_seqlen_; ss << thread_local_stream_; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4b6c746d57525a..89c8c7902bac9f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -191,22 +191,8 @@ bool 
AnalysisPredictor::PrepareScope( status_is_cloned_ = true; } else { paddle::framework::InitDevices(); - scope_.reset(new paddle::framework::Scope(), [](framework::Scope *scope) { - delete scope; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - for (int dev_id = 0; dev_id < paddle::platform::GetCUDADeviceCount(); - ++dev_id) { - memory::Release(platform::CUDAPlace(dev_id)); - } -#endif -#ifdef PADDLE_WITH_XPU - for (int dev_id = 0; dev_id < paddle::platform::GetXPUDeviceCount(); - ++dev_id) { - memory::Release(platform::XPUPlace(dev_id)); - } -#endif - memory::Release(platform::CPUPlace()); - }); + // TODO(wilber): we need to release memory occupied by weights. + scope_.reset(new paddle::framework::Scope()); status_is_cloned_ = false; } sub_scope_ = &scope_->NewScope(); @@ -537,6 +523,12 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetCloseTrtPluginFp16(config_.disable_trt_plugin_fp16_); } + if (config_.dlnne_enabled()) { + LOG(INFO) << "Dlnne subgraph is enabled"; + argument_.SetUseDlnne(true); + argument_.SetDlnneMinSubgraphSize(config_.dlnne_min_subgraph_size_); + } + if (config_.lite_engine_enabled()) { argument_.SetCpuMathLibraryNumThreads( config_.cpu_math_library_num_threads()); @@ -546,6 +538,11 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetLiteZeroCopy(config_.lite_zero_copy_); argument_.SetUseXpu(config_.use_xpu_); argument_.SetXpuL3WorkspaceSize(config_.xpu_l3_workspace_size_); + argument_.SetXpuLocked(config_.xpu_locked_); + argument_.SetXpuAutotune(config_.xpu_autotune_); + argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); + argument_.SetXpuPrecision(config_.xpu_precision_); + argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); LOG(INFO) << "Lite subgraph engine is enabled"; } @@ -617,7 +614,7 @@ std::unique_ptr CreatePaddlePredictor< // This function can only be executed once per process. 
static std::once_flag custom_operators_registered; std::call_once(custom_operators_registered, - []() { paddle::RegisterAllCustomOperator(); }); + []() { inference::RegisterAllCustomOperator(); }); if (config.use_gpu()) { static std::once_flag gflags_initialized; diff --git a/paddle/fluid/inference/api/helper.cc b/paddle/fluid/inference/api/helper.cc index 9cc491e10d691a..d78560239de50e 100644 --- a/paddle/fluid/inference/api/helper.cc +++ b/paddle/fluid/inference/api/helper.cc @@ -13,6 +13,9 @@ // limitations under the License. #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/extension/include/ext_op_meta_info.h" +#include "paddle/fluid/framework/custom_operator.h" +#include "paddle/fluid/framework/operator.h" namespace paddle { namespace inference { @@ -40,5 +43,20 @@ std::string to_string>>( return ss.str(); } +void RegisterAllCustomOperator() { + auto &op_meta_info_map = OpMetaInfoMap::Instance(); + const auto &meta_info_map = op_meta_info_map.GetMap(); + for (auto &pair : meta_info_map) { + const auto &all_op_kernels{framework::OperatorWithKernel::AllOpKernels()}; + if (all_op_kernels.find(pair.first) == all_op_kernels.end()) { + framework::RegisterOperatorWithMetaInfo(pair.second); + } else { + LOG(INFO) << "The operator `" << pair.first + << "` has been registered. 
" + "Therefore, we will not repeat the registration here."; + } + } +} + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 14b968f5834da8..c6d25137594b76 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -398,5 +398,7 @@ static bool IsFileExists(const std::string &path) { return exists; } +void RegisterAllCustomOperator(); + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 793fc53d90b768..f6cdbb00b50453 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -411,7 +411,8 @@ void AnalysisPredictor::MkldnnQuantizer::ClearDeviceContext() const { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(predictor_.place_); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap( + paddle::platform::MKLDNNDeviceContext::tls().get_curr_exec()); } void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index e492b32cb6cbef..2bbd4bb837a22f 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -177,7 +177,10 @@ struct PD_INFER_DECL AnalysisConfig { /// void DisableGpu(); - void EnableXpu(int l3_workspace_size = 0xfffc00); + void EnableXpu(int l3_workspace_size = 0xfffc00, bool locked = false, + bool autotune = true, const std::string& autotune_file = "", + const std::string& precision = "int16", + bool adaptive_seqlen = false); /// /// \brief A boolean state telling whether the GPU is turned on. 
/// @@ -360,6 +363,9 @@ struct PD_INFER_DECL AnalysisConfig { /// bool tensorrt_dla_enabled() { return trt_use_dla_; } + void EnableDlnne(int min_subgraph_size = 3); + bool dlnne_enabled() const { return use_dlnne_; } + /// /// \brief Turn on the usage of Lite sub-graph engine. /// @@ -627,6 +633,10 @@ struct PD_INFER_DECL AnalysisConfig { std::vector trt_disabled_ops_{}; bool disable_trt_plugin_fp16_{false}; + // dlnne related. + bool use_dlnne_{false}; + int dlnne_min_subgraph_size_{3}; + // memory reuse related. bool enable_memory_optim_{false}; @@ -661,6 +671,11 @@ struct PD_INFER_DECL AnalysisConfig { bool thread_local_stream_{false}; bool use_xpu_{false}; int xpu_l3_workspace_size_; + bool xpu_locked_{false}; + bool xpu_autotune_{true}; + std::string xpu_autotune_file_; + std::string xpu_precision_{"int16"}; + bool xpu_adaptive_seqlen_{false}; // mkldnn related. int mkldnn_cache_capacity_{0}; diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 1d77ddaf73ef70..b2e3de63691c55 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -110,6 +110,16 @@ const std::vector kTRTSubgraphPasses({ "transpose_flatten_concat_fuse_pass", }); +const std::vector kDlnneSubgraphPasses({ + "is_test_pass", // + "delete_dropout_op_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "depthwise_conv_bn_fuse_pass", // + "shuffle_channel_detect_pass", // + "dlnne_subgraph_pass", // +}); + const std::vector kLiteSubgraphPasses({ #ifdef PADDLE_WITH_LITE "lite_subgraph_pass", diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index a725ebab35eada..d7556b50031b7d 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -242,6 +242,9 @@ class PD_INFER_DECL XpuPassStrategy final : public PassStrategy { /// \brief List of tensorRT subgraph 
passes. PD_INFER_DECL extern const std::vector kTRTSubgraphPasses; +/// \brief List of dlnne subgraph passes. +PD_INFER_DECL extern const std::vector kDlnneSubgraphPasses; + /// \brief List of lite subgraph passes. PD_INFER_DECL extern const std::vector kLiteSubgraphPasses; diff --git a/paddle/fluid/inference/capi/pd_config.cc b/paddle/fluid/inference/capi/pd_config.cc index 231639667244d8..9bb52ba5780251 100644 --- a/paddle/fluid/inference/capi/pd_config.cc +++ b/paddle/fluid/inference/capi/pd_config.cc @@ -260,6 +260,22 @@ bool PD_TensorrtEngineEnabled(const PD_AnalysisConfig* config) { return config->config.tensorrt_engine_enabled(); } +void PD_EnableDlnne(PD_AnalysisConfig* config, int min_subgraph_size) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + config->config.EnableDlnne(min_subgraph_size); +} + +bool PD_DlnneEnabled(const PD_AnalysisConfig* config) { + PADDLE_ENFORCE_NOT_NULL( + config, + paddle::platform::errors::InvalidArgument( + "The pointer of analysis configuration shouldn't be nullptr")); + return config->config.dlnne_enabled(); +} + void PD_SwitchIrDebug(PD_AnalysisConfig* config, bool x) { PADDLE_ENFORCE_NOT_NULL( config, diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt new file mode 100644 index 00000000000000..521d24329d4641 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +set(C_API_SRCS pd_config.cc pd_predictor.cc pd_tensor.cc pd_utils.cc) + +cc_library(paddle_inference_c SRCS ${C_API_SRCS} DEPS paddle_inference) + +if(NOT ON_INFER) + return() +endif() + +# Create inference capi shared library +cc_library(paddle_inference_c_shared SHARED SRCS ${C_API_SRCS} DEPS paddle_inference) +set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) +if(WIN32) + target_link_libraries(paddle_inference_c_shared shlwapi.lib) +endif() diff --git a/paddle/fluid/inference/capi_exp/lod_demo.cc b/paddle/fluid/inference/capi_exp/lod_demo.cc new file mode 100644 index 00000000000000..2b049e992e71dd --- /dev/null +++ b/paddle/fluid/inference/capi_exp/lod_demo.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file lod_demo.cc +/// +/// \brief a demo for user to learn how to inference by c api. +/// it rectify from +/// paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc. 
+/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" + +int main(int argc, char *argv[]) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + size_t output_num = PD_PredictorGetOutputNum(predictor); + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + 
PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + PD_PredictorRun(predictor); + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} diff --git a/paddle/fluid/inference/capi_exp/pd_common.h b/paddle/fluid/inference/capi_exp/pd_common.h new file mode 100644 index 00000000000000..4b70ed7fbad297 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_common.h @@ -0,0 +1,75 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#if defined(_WIN32) +#ifdef PADDLE_DLL_INFERENCE +#define PADDLE_CAPI_EXPORT __declspec(dllexport) +#else +#define PADDLE_CAPI_EXPORT __declspec(dllimport) +#endif // PADDLE_DLL_INFERENCE +#else +#define PADDLE_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // _WIN32 + +/// +/// __pd_give means that a new object is returned. The user should make sure +/// that the returned pointer is used exactly once as a value for an __pd_take +/// argument. In between, it can be used as a value for as many __pd_keep +/// arguments as the user likes. +/// +#ifndef __pd_give +#define __pd_give +#endif +/// +/// __pd_take means that the object the argument points to is taken over by the +/// function and may no longer be used by the user as an argument to any other +/// function. The pointer value must be one returned by a function returning an +/// __pd_give pointer. +/// +#ifndef __pd_take +#define __pd_take +#endif +/// +/// __pd_keep means that the function will only use the object temporarily. The +/// object which the argument points to is not taken over by the function. After +/// the function has finished, the user can still use it as an argument to other +/// functions. +/// +#ifndef __pd_keep +#define __pd_keep +#endif + +typedef int8_t PD_Bool; +#define TRUE 1 +#define FALSE 0 + +#define PD_ENUM(type) \ + typedef int32_t type; \ + enum + +PD_ENUM(PD_PrecisionType){PD_PRECISION_FLOAT32 = 0, PD_PRECISION_INT8, + PD_PRECISION_HALF}; + +PD_ENUM(PD_PlaceType){PD_PLACE_UNK = -1, PD_PLACE_CPU, PD_PLACE_GPU, + PD_PLACE_XPU}; + +PD_ENUM(PD_DataType){ + PD_DATA_UNK = -1, PD_DATA_FLOAT32, PD_DATA_INT32, + PD_DATA_INT64, PD_DATA_UINT8, +}; diff --git a/paddle/fluid/inference/capi_exp/pd_config.cc b/paddle/fluid/inference/capi_exp/pd_config.cc new file mode 100644 index 00000000000000..c45454e86bdaac --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/capi_exp/pd_config.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_NULL_POINTER_PARM(param) \ + PADDLE_ENFORCE_NOT_NULL( \ + param, paddle::platform::errors::InvalidArgument( \ + "The pointer of " #param " shouldn't be nullptr")) + +#define CHECK_AND_CONVERT_PD_CONFIG \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_config, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle config shouldn't be nullptr")); \ + Config* config = reinterpret_cast(pd_config) + +using paddle_infer::Config; + +static Config::Precision ConvertToCxxPrecisionType(PD_PrecisionType precision) { + switch (precision) { + case PD_PRECISION_FLOAT32: + return Config::Precision::kFloat32; + case PD_PRECISION_INT8: + return Config::Precision::kInt8; + case PD_PRECISION_HALF: + return Config::Precision::kHalf; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle precision type %d.", precision)); + return Config::Precision::kFloat32; + } +} + +extern "C" { +__pd_give PD_Config* PD_ConfigCreate() { + return reinterpret_cast(new Config()); +} + +void PD_ConfigDestroy(__pd_take PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + delete reinterpret_cast(config); +} + +void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* 
params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetModel(prog_file_path, params_file_path); +} +void PD_ConfigSetProgFile(__pd_keep PD_Config* pd_config, + const char* prog_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(prog_file_path); + config->SetProgFile(prog_file_path); +} +void PD_ConfigSetParamsFile(__pd_keep PD_Config* pd_config, + const char* params_file_path) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(params_file_path); + config->SetParamsFile(params_file_path); +} +void PD_ConfigSetOptimCacheDir(__pd_keep PD_Config* pd_config, + const char* opt_cache_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(opt_cache_dir); + config->SetOptimCacheDir(opt_cache_dir); +} + +void PD_ConfigSetModelDir(__pd_keep PD_Config* pd_config, + const char* model_dir) { + CHECK_AND_CONVERT_PD_CONFIG; + CHECK_NULL_POINTER_PARM(model_dir); + config->SetModel(model_dir); +} +const char* PD_ConfigGetModelDir(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->model_dir().c_str(); +} +const char* PD_ConfigGetProgFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->prog_file().c_str(); +} +const char* PD_ConfigGetParamsFile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->params_file().c_str(); +} + +void PD_ConfigDisableFCPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableFCPadding(); +} +PD_Bool PD_ConfigUseFcPadding(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_fc_padding(); +} + +void PD_ConfigEnableUseGpu(__pd_keep PD_Config* pd_config, + uint64_t memory_pool_init_size_mb, + int32_t device_id) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableUseGpu(memory_pool_init_size_mb, device_id); +} +void PD_ConfigDisableGpu(__pd_keep PD_Config* pd_config) { + 
CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGpu(); +} +PD_Bool PD_ConfigUseGpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_gpu(); +} + +void PD_ConfigEnableXpu(__pd_keep PD_Config* pd_config, + int32_t l3_workspace_size) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableXpu(l3_workspace_size); +} +PD_Bool PD_ConfigUseXpu(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->use_xpu(); +} + +int32_t PD_ConfigGpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->gpu_device_id(); +} +int32_t PD_ConfigXpuDeviceId(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->xpu_device_id(); +} +int32_t PD_ConfigMemoryPoolInitSizeMb(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->memory_pool_init_size_mb(); +} +float PD_ConfigFractionOfGpuMemoryForPool(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->fraction_of_gpu_memory_for_pool(); +} +void PD_ConfigEnableCudnn(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableCUDNN(); +} +PD_Bool PD_ConfigCudnnEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->cudnn_enabled(); +} + +void PD_ConfigSwitchIrOptim(__pd_keep PD_Config* pd_config, PD_Bool x) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SwitchIrOptim(x); +} +PD_Bool PD_ConfigIrOptim(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->ir_optim(); +} + +void PD_ConfigEnableTensorRtEngine(__pd_keep PD_Config* pd_config, + int32_t workspace_size, + int32_t max_batch_size, + int32_t min_subgraph_size, + PD_PrecisionType precision, + PD_Bool use_static, PD_Bool use_calib_mode) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableTensorRtEngine( + workspace_size, max_batch_size, min_subgraph_size, + ConvertToCxxPrecisionType(precision), use_static, use_calib_mode); +} +PD_Bool 
PD_ConfigTensorRtEngineEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->tensorrt_engine_enabled();
+}
+
+void PD_ConfigSetTrtDynamicShapeInfo(__pd_keep PD_Config* pd_config,
+                                     size_t tensor_num,
+                                     const char** tensor_name,
+                                     size_t* shapes_num, int32_t** min_shape,
+                                     int32_t** max_shape, int32_t** optim_shape,
+                                     PD_Bool disable_trt_plugin_fp16) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::map<std::string, std::vector<int>> min_input_shapes;
+  std::map<std::string, std::vector<int>> max_input_shapes;
+  std::map<std::string, std::vector<int>> optim_input_shapes;
+  for (size_t tensor_index = 0; tensor_index < tensor_num; ++tensor_index) {
+    std::string name(tensor_name[tensor_index]);
+    std::vector<int> min_input_shape, max_input_shape, optim_input_shape;
+    for (size_t shape_index = 0; shape_index < shapes_num[tensor_index];
+         ++shape_index) {
+      min_input_shape.emplace_back(min_shape[tensor_index][shape_index]);
+      max_input_shape.emplace_back(max_shape[tensor_index][shape_index]);
+      optim_input_shape.emplace_back(optim_shape[tensor_index][shape_index]);
+    }
+    min_input_shapes[name] = std::move(min_input_shape);
+    max_input_shapes[name] = std::move(max_input_shape);
+    optim_input_shapes[name] = std::move(optim_input_shape);
+  }
+  config->SetTRTDynamicShapeInfo(min_input_shapes, max_input_shapes,
+                                 optim_input_shapes, disable_trt_plugin_fp16);
+}
+
+void PD_ConfigDisableTensorRtOPs(__pd_keep PD_Config* pd_config, size_t ops_num,
+                                 const char** ops_name) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::vector<std::string> ops_list;
+  for (size_t index = 0; index < ops_num; ++index) {
+    ops_list.emplace_back(ops_name[index]);
+  }
+  config->Exp_DisableTensorRtOPs(ops_list);
+}
+
+void PD_ConfigEnableTensorRtOSS(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableTensorRtOSS();
+}
+PD_Bool PD_ConfigTensorRtOssEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->tensorrt_oss_enabled();
+}
+
+void PD_ConfigEnableTensorRtDla(__pd_keep PD_Config* pd_config,
+                                int32_t dla_core) {
+ 
CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableTensorRtDLA(dla_core);
+}
+PD_Bool PD_ConfigTensorRtDlaEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->tensorrt_dla_enabled();
+}
+
+void PD_ConfigEnableLiteEngine(__pd_keep PD_Config* pd_config,
+                               PD_PrecisionType precision, PD_Bool zero_copy,
+                               size_t passes_filter_num,
+                               const char** passes_filter,
+                               size_t ops_filter_num, const char** ops_filter) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::vector<std::string> passes_filters, ops_filters;
+  for (size_t index = 0; index < passes_filter_num; ++index) {
+    passes_filters.emplace_back(passes_filter[index]);
+  }
+  for (size_t index = 0; index < ops_filter_num; ++index) {
+    ops_filters.emplace_back(ops_filter[index]);
+  }
+  config->EnableLiteEngine(ConvertToCxxPrecisionType(precision), zero_copy,
+                           passes_filters, ops_filters);
+}
+PD_Bool PD_ConfigLiteEngineEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->lite_engine_enabled();
+}
+
+void PD_ConfigSwitchIrDebug(__pd_keep PD_Config* pd_config, PD_Bool x) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SwitchIrDebug(x);
+}
+void PD_ConfigEnableMKLDNN(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMKLDNN();
+}
+void PD_ConfigSetMkldnnCacheCapacity(__pd_keep PD_Config* pd_config,
+                                     int32_t capacity) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetMkldnnCacheCapacity(capacity);
+}
+PD_Bool PD_ConfigMkldnnEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_enabled();
+}
+void PD_ConfigSetCpuMathLibraryNumThreads(
+    __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetCpuMathLibraryNumThreads(cpu_math_library_num_threads);
+}
+int32_t PD_ConfigGetCpuMathLibraryNumThreads(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->cpu_math_library_num_threads();
+}
+
+void PD_ConfigSetMkldnnOp(__pd_keep
PD_Config* pd_config, size_t ops_num,
+                          const char** op_list) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::unordered_set<std::string> op_names;
+  for (size_t index = 0; index < ops_num; ++index) {
+    op_names.emplace(op_list[index]);
+  }
+  config->SetMKLDNNOp(std::move(op_names));
+}
+void PD_ConfigEnableMkldnnQuantizer(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMkldnnQuantizer();
+}
+void PD_ConfigEnableMkldnnBfloat16(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMkldnnBfloat16();
+}
+PD_Bool PD_ConfigMkldnnBfloat16Enabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_bfloat16_enabled();
+}
+void PD_ConfigSetBfloat16Op(__pd_keep PD_Config* pd_config, size_t ops_num,
+                            const char** op_list) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  std::unordered_set<std::string> op_names;
+  for (size_t index = 0; index < ops_num; ++index) {
+    op_names.emplace(op_list[index]);
+  }
+  config->SetBfloat16Op(std::move(op_names));
+}
+PD_Bool PD_ConfigThreadLocalStreamEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->thread_local_stream_enabled();
+}
+PD_Bool PD_ConfigMkldnnQuantizerEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->mkldnn_quantizer_enabled();
+}
+void PD_ConfigSetModelBuffer(__pd_keep PD_Config* pd_config,
+                             const char* prog_buffer, size_t prog_buffer_size,
+                             const char* params_buffer,
+                             size_t params_buffer_size) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->SetModelBuffer(prog_buffer, prog_buffer_size, params_buffer,
+                         params_buffer_size);
+}
+PD_Bool PD_ConfigModelFromMemory(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  return config->model_from_memory();
+}
+void PD_ConfigEnableMemoryOptim(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+  config->EnableMemoryOptim();
+}
+PD_Bool PD_ConfigMemoryOptimEnabled(__pd_keep PD_Config* pd_config) {
+  CHECK_AND_CONVERT_PD_CONFIG;
+ 
return config->enable_memory_optim(); +} +void PD_ConfigEnableProfile(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableProfile(); +} +PD_Bool PD_ConfigProfileEnabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->profile_enabled(); +} +void PD_ConfigDisableGlogInfo(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->DisableGlogInfo(); +} +PD_Bool PD_ConfigGlogInfoDisabled(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->glog_info_disabled(); +} +void PD_ConfigSetInvalid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->SetInValid(); +} +PD_Bool PD_ConfigIsValid(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + return config->is_valid(); +} +void PD_ConfigEnableGpuMultiStream(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->EnableGpuMultiStream(); +} +void PD_ConfigPartiallyRelease(__pd_keep PD_Config* pd_config) { + CHECK_AND_CONVERT_PD_CONFIG; + config->PartiallyRelease(); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_config.h b/paddle/fluid/inference/capi_exp/pd_config.h new file mode 100644 index 00000000000000..e44983e24484ea --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_config.h @@ -0,0 +1,571 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +/// +/// \file pd_config.h +/// +/// \brief interface for paddle config +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Config PD_Config; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a paddle config +/// +/// \return new config. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Config* PD_ConfigCreate(); +/// +/// \brief Destroy the paddle config +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDestroy(__pd_take PD_Config* pd_config); +/// +/// \brief Set the combined model with two specific pathes for program and +/// parameters. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path of the combined model. +/// \param[in] params_file_path params file path of the combined model. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModel(__pd_keep PD_Config* pd_config, + const char* prog_file_path, + const char* params_file_path); +/// +/// \brief Set the model file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_file_path model file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetProgFile( + __pd_keep PD_Config* pd_config, const char* prog_file_path); +/// +/// \brief Set the params file path of a combined model. +/// +/// \param[in] pd_onfig config +/// \param[in] params_file_path params file path. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetParamsFile( + __pd_keep PD_Config* pd_config, const char* params_file_path); +/// +/// \brief Set the path of optimization cache directory. +/// \param[in] pd_onfig config +/// \param[in] opt_cache_dir the path of optimization cache directory. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetOptimCacheDir( + __pd_keep PD_Config* pd_config, const char* opt_cache_dir); +/// +/// \brief Set the no-combined model dir path. +/// \param[in] pd_onfig config +/// \param[in] model_dir model dir path. 
+/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelDir( + __pd_keep PD_Config* pd_config, const char* model_dir); +/// +/// \brief Get the model directory path. +/// +/// \param[in] pd_onfig config +/// \return The model directory path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetModelDir( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the program file path. +/// +/// \param[in] pd_onfig config +/// \return The program file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetProgFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the params file path. +/// +/// \param[in] pd_onfig config +/// \return The params file path. +/// +PADDLE_CAPI_EXPORT extern const char* PD_ConfigGetParamsFile( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn off FC Padding. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableFCPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether fc padding is used. +/// +/// \param[in] pd_onfig config +/// \return Whether fc padding is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseFcPadding( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on GPU. +/// +/// \param[in] pd_onfig config +/// \param[in] memory_pool_init_size_mb initial size of the GPU memory pool in +/// MB. +/// \param[in] device_id device_id the GPU card to use. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableUseGpu( + __pd_keep PD_Config* pd_config, uint64_t memory_pool_init_size_mb, + int32_t device_id); +/// +/// \brief Turn off GPU. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the GPU is turned on. +/// +/// \brief Turn off GPU. +/// \return Whether the GPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseGpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on XPU. 
+/// +/// \param[in] pd_onfig config +/// \param[in] l3_workspace_size l3 workspace size. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableXpu( + __pd_keep PD_Config* pd_config, int32_t l3_workspace_size); +/// +/// \brief A boolean state telling whether the XPU is turned on. +/// +/// \param[in] pd_onfig config +/// \return Whether the XPU is turned on. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigUseXpu( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the GPU device id. +/// +/// \param[in] pd_onfig config +/// \return The GPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the XPU device id. +/// +/// \param[in] pd_onfig config +/// \return The XPU device id. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigXpuDeviceId( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the initial size in MB of the GPU memory pool. +/// +/// \param[in] pd_onfig config +/// \return The initial size in MB of the GPU memory pool. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigMemoryPoolInitSizeMb( + __pd_keep PD_Config* pd_config); +/// +/// \brief Get the proportion of the initial memory pool size compared to the +/// device. +/// +/// \param[in] pd_onfig config +/// \return The proportion of the initial memory pool size. +/// +PADDLE_CAPI_EXPORT extern float PD_ConfigFractionOfGpuMemoryForPool( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on CUDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableCudnn( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use CUDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use CUDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigCudnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to perform IR graph optimization. +/// If turned off, the AnalysisConfig will act just like a NativeConfig. 
+/// +/// \param[in] pd_onfig config +/// \param[in] x Whether the ir graph optimization is actived. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrOptim( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief A boolean state telling whether the ir graph optimization is +/// actived. +/// +/// \param[in] pd_onfig config +/// \return Whether to use ir graph optimization. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIrOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the TensorRT engine. +/// The TensorRT engine will accelerate some subgraphes in the original Fluid +/// computation graph. In some models such as resnet50, GoogleNet and so on, +/// it gains significant performance acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] workspace_size The memory size(in byte) used for TensorRT +/// workspace. +/// \param[in] max_batch_size The maximum batch size of this prediction task, +/// better set as small as possible for less performance loss. +/// \param[in] min_subgrpah_size The minimum TensorRT subgraph size needed, if a +/// subgraph is smaller than this, it will not be transferred to TensorRT +/// engine. +/// \param[in] precision The precision used in TensorRT. +/// \param[in] use_static Serialize optimization information to disk for +/// reusing. +/// \param[in] use_calib_mode Use TRT int8 calibration(post training +/// quantization). +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtEngine( + __pd_keep PD_Config* pd_config, int32_t workspace_size, + int32_t max_batch_size, int32_t min_subgraph_size, + PD_PrecisionType precision, PD_Bool use_static, PD_Bool use_calib_mode); +/// +/// \brief A boolean state telling whether the TensorRT engine is used. +/// +/// \param[in] pd_onfig config +/// \return Whether the TensorRT engine is used. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set min, max, opt shape for TensorRT Dynamic shape mode. +/// +/// \param[in] pd_onfig config +/// \param[in] tensor_num The number of the subgraph input. +/// \param[in] tensor_name The name of every subgraph input. +/// \param[in] shapes_num The shape size of every subgraph input. +/// \param[in] min_shape The min input shape of every subgraph input. +/// \param[in] max_shape The max input shape of every subgraph input. +/// \param[in] optim_shape The opt input shape of every subgraph input. +/// \param[in] disable_trt_plugin_fp16 Setting this parameter to true means that +/// TRT plugin will not run fp16. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetTrtDynamicShapeInfo( + __pd_keep PD_Config* pd_config, size_t tensor_num, const char** tensor_name, + size_t* shapes_num, int32_t** min_shape, int32_t** max_shape, + int32_t** optim_shape, PD_Bool disable_trt_plugin_fp16); +/// +/// \brief Prevent ops running in Paddle-TRT +/// NOTE: just experimental, not an official stable API, easy to be broken. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num ops number +/// \param[in] ops_name ops name +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableTensorRtOPs( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** ops_name); +/// +/// \brief Replace some TensorRT plugins to TensorRT OSS( +/// https://github.com/NVIDIA/TensorRT), with which some models's inference +/// may be more high-performance. Libnvinfer_plugin.so greater than +/// V7.2.1 is needed. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtOSS( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the TensorRT OSS. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT OSS. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtOssEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Enable TensorRT DLA +/// +/// \param[in] pd_onfig config +/// \param[in] dla_core ID of DLACore, which should be 0, 1, +/// ..., IBuilder.getNbDLACores() - 1 +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableTensorRtDla( + __pd_keep PD_Config* pd_config, int32_t dla_core); +/// +/// \brief A boolean state telling whether to use the TensorRT DLA. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the TensorRT DLA. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigTensorRtDlaEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on the usage of Lite sub-graph engine. +/// +/// \param[in] pd_onfig config +/// \param[in] precision Precion used in Lite sub-graph engine. +/// \param[in] zero_copy whether use zero copy. +/// \param[in] passes_filter_num The number of passes used in Lite sub-graph +/// engine. +/// \param[in] passes_filter The name of passes used in Lite sub-graph engine. +/// \param[in] ops_filter_num The number of operators not supported by Lite. +/// \param[in] ops_filter The name of operators not supported by Lite. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableLiteEngine( + __pd_keep PD_Config* pd_config, PD_PrecisionType precision, + PD_Bool zero_copy, size_t passes_filter_num, const char** passes_filter, + size_t ops_filter_num, const char** ops_filter); +/// +/// \brief A boolean state indicating whether the Lite sub-graph engine is +/// used. +/// +/// \param[in] pd_onfig config +/// \return Whether the Lite sub-graph engine is used. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigLiteEngineEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Control whether to debug IR graph analysis phase. +/// This will generate DOT files for visualizing the computation graph after +/// each analysis pass applied. 
+/// +/// \param[in] pd_onfig config +/// \param[in] x whether to debug IR graph analysis phase. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSwitchIrDebug( + __pd_keep PD_Config* pd_config, PD_Bool x); +/// +/// \brief Turn on MKLDNN. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMKLDNN( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the cache capacity of different input shapes for MKLDNN. +/// Default value 0 means not caching any shape. +/// Please see MKL-DNN Data Caching Design Document: +/// https://github.com/PaddlePaddle/FluidDoc/blob/develop/doc/fluid/design/mkldnn/caching/caching.md +/// +/// \param[in] pd_onfig config +/// \param[in] capacity The cache capacity. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnCacheCapacity( + __pd_keep PD_Config* pd_config, int32_t capacity); +/// +/// \brief A boolean state telling whether to use the MKLDNN. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the number of cpu math library threads. +/// +/// \param[in] pd_onfig config +/// \param cpu_math_library_num_threads The number of cpu math library +/// threads. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config, int32_t cpu_math_library_num_threads); +/// +/// \brief An int state telling how many threads are used in the CPU math +/// library. +/// +/// \param[in] pd_onfig config +/// \return The number of threads used in the CPU math library. +/// +PADDLE_CAPI_EXPORT extern int32_t PD_ConfigGetCpuMathLibraryNumThreads( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the operator type list to use MKLDNN acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. 
+/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetMkldnnOp( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Turn on MKLDNN quantization. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnQuantizer( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the MKLDNN quantization is enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the MKLDNN quantization is enabled. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnQuantizerEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on MKLDNN bfloat16. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMkldnnBfloat16( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether to use the MKLDNN Bfloat16. +/// +/// \param[in] pd_onfig config +/// \return Whether to use the MKLDNN Bfloat16. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMkldnnBfloat16Enabled( + __pd_keep PD_Config* pd_config); +/// \brief Specify the operator type list to use Bfloat16 acceleration. +/// +/// \param[in] pd_onfig config +/// \param[in] ops_num The number of operator type list. +/// \param[in] op_list The name of operator type list. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetBfloat16Op( + __pd_keep PD_Config* pd_config, size_t ops_num, const char** op_list); +/// +/// \brief Enable the GPU multi-computing stream feature. +/// NOTE: The current behavior of this interface is to bind the computation +/// stream to the thread, and this behavior may be changed in the future. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableGpuMultiStream( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the thread local CUDA stream is +/// enabled. +/// +/// \param[in] pd_onfig config +/// \return Whether the thread local CUDA stream is enabled. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigThreadLocalStreamEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Specify the memory buffer of program and parameter. +/// Used when model and params are loaded directly from memory. +/// +/// \param[in] pd_onfig config +/// \param[in] prog_buffer The memory buffer of program. +/// \param[in] prog_buffer_size The size of the model data. +/// \param[in] params_buffer The memory buffer of the combined parameters file. +/// \param[in] params_buffer_size The size of the combined parameters data. +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetModelBuffer( + __pd_keep PD_Config* pd_config, const char* prog_buffer, + size_t prog_buffer_size, const char* params_buffer, + size_t params_buffer_size); +/// +/// \brief A boolean state telling whether the model is set from the CPU +/// memory. +/// +/// \param[in] pd_onfig config +/// \return Whether model and params are loaded directly from memory. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigModelFromMemory( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on memory optimize +/// NOTE still in development. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableMemoryOptim( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the memory optimization is +/// activated. +/// +/// \param[in] pd_onfig config +/// \return Whether the memory optimization is activated. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigMemoryOptimEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Turn on profiling report. +/// If not turned on, no profiling report will be generated. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigEnableProfile( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the profiler is activated. +/// +/// \param[in] pd_onfig config +/// \return bool Whether the profiler is activated. 
+/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigProfileEnabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Mute all logs in Paddle inference. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigDisableGlogInfo( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether logs in Paddle inference are muted. +/// +/// \param[in] pd_onfig config +/// \return Whether logs in Paddle inference are muted. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigGlogInfoDisabled( + __pd_keep PD_Config* pd_config); +/// +/// \brief Set the Config to be invalid. +/// This is to ensure that an Config can only be used in one +/// Predictor. +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigSetInvalid( + __pd_keep PD_Config* pd_config); +/// +/// \brief A boolean state telling whether the Config is valid. +/// +/// \param[in] pd_onfig config +/// \return Whether the Config is valid. +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_ConfigIsValid( + __pd_keep PD_Config* pd_config); +/// +/// \brief Partially release the memory +/// +/// \param[in] pd_onfig config +/// +PADDLE_CAPI_EXPORT extern void PD_ConfigPartiallyRelease( + __pd_keep PD_Config* pd_config); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_inference_api.h b/paddle/fluid/inference/capi_exp/pd_inference_api.h new file mode 100644 index 00000000000000..5f21dca1a7bf6a --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_inference_api.h @@ -0,0 +1,22 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "pd_common.h" // NOLINT +#include "pd_config.h" // NOLINT +#include "pd_predictor.h" // NOLINT +#include "pd_tensor.h" // NOLINT +#include "pd_types.h" // NOLINT +#include "pd_utils.h" // NOLINT diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.cc b/paddle/fluid/inference/capi_exp/pd_predictor.cc new file mode 100644 index 00000000000000..f5287a5152957f --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/capi_exp/pd_predictor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_PREDICTOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_predictor, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle predictor shouldn't be nullptr")); \ + auto& predictor = pd_predictor->predictor + +extern "C" { +__pd_give PD_Predictor* PD_PredictorCreate(__pd_take PD_Config* pd_config) { + PADDLE_ENFORCE_NOT_NULL( + pd_config, paddle::platform::errors::InvalidArgument( + "The pointer of paddle predictor shouldn't be nullptr")); + PD_Predictor* pd_predictor = new PD_Predictor(); + paddle_infer::Config* config = + reinterpret_cast(pd_config); + pd_predictor->predictor = paddle_infer::CreatePredictor(*config); + delete config; + return pd_predictor; +} + +__pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Predictor* new_predictor = new PD_Predictor(); + new_predictor->predictor = predictor->Clone(); + return new_predictor; +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetInputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetInputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +__pd_give PD_OneDimArrayCstr* PD_PredictorGetOutputNames( + __pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + std::vector names = predictor->GetOutputNames(); + return paddle_infer::CvtVecToOneDimArrayCstr(names); +} + +size_t PD_PredictorGetInputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetInputNames().size(); 
+} + +size_t PD_PredictorGetOutputNum(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->GetOutputNames().size(); +} +__pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetInputHandle(name); + return pd_tensor; +} + +__pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name) { + CHECK_AND_CONVERT_PD_PREDICTOR; + PD_Tensor* pd_tensor = new PD_Tensor(); + pd_tensor->tensor = predictor->GetOutputHandle(name); + return pd_tensor; +} + +PD_Bool PD_PredictorRun(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->Run(); +} + +void PD_PredictorClearIntermediateTensor(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + predictor->ClearIntermediateTensor(); +} + +uint64_t PD_PredictorTryShrinkMemory(__pd_keep PD_Predictor* pd_predictor) { + CHECK_AND_CONVERT_PD_PREDICTOR; + return predictor->TryShrinkMemory(); +} + +void PD_PredictorDestroy(__pd_take PD_Predictor* pd_predictor) { + delete pd_predictor; +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_predictor.h b/paddle/fluid/inference/capi_exp/pd_predictor.h new file mode 100644 index 00000000000000..d4542d0b6d394d --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_predictor.h @@ -0,0 +1,148 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_predictor.h +/// +/// \brief interface for paddle predictor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Predictor PD_Predictor; +typedef struct PD_Config PD_Config; +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayCstr PD_OneDimArrayCstr; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Create a new Predictor +/// +/// \param[in] pd_config config +/// \return new predictor. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorCreate( + __pd_take PD_Config* pd_config); +/// +/// \brief Clone a new Predictor +/// +/// \param[in] pd_predictor predictor +/// \return new predictor.
+/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Predictor* PD_PredictorClone( + __pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the input names +/// +/// \param[in] pd_predictor predictor +/// \return input names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetInputNames(__pd_keep PD_Predictor* pd_predictor); +/// +/// \brief Get the output names +/// +/// \param[in] pd_predictor predictor +/// \return output names +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayCstr* +PD_PredictorGetOutputNames(__pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the input number +/// +/// \param[in] pd_predictor predictor +/// \return input number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetInputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the output number +/// +/// \param[in] pd_predictor predictor +/// \return output number +/// +PADDLE_CAPI_EXPORT extern size_t PD_PredictorGetOutputNum( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Get the Input Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name input name +/// \return input tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetInputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Get the Output Tensor object +/// +/// \param[in] pd_predictor predictor +/// \param[in] name output name +/// \return output tensor +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_Tensor* PD_PredictorGetOutputHandle( + __pd_keep PD_Predictor* pd_predictor, const char* name); + +/// +/// \brief Run the prediction engine +/// +/// \param[in] pd_predictor predictor +/// \return Whether the function executed successfully +/// +PADDLE_CAPI_EXPORT extern PD_Bool PD_PredictorRun( + __pd_keep PD_Predictor* pd_predictor); + +/// \brief Clear the intermediate tensors of the predictor +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void 
PD_PredictorClearIntermediateTensor( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Release all tmp tensor to compress the size of the memory pool. +/// The memory pool is considered to be composed of a list of chunks, if +/// the chunk is not occupied, it can be released. +/// +/// \param[in] pd_predictor predictor +/// \return Number of bytes released. It may be smaller than the actual +/// released memory, because part of the memory is not managed by the +/// MemoryPool. +/// +PADDLE_CAPI_EXPORT extern uint64_t PD_PredictorTryShrinkMemory( + __pd_keep PD_Predictor* pd_predictor); + +/// +/// \brief Destroy a predictor object +/// +/// \param[in] pd_predictor predictor +/// +PADDLE_CAPI_EXPORT extern void PD_PredictorDestroy( + __pd_take PD_Predictor* pd_predictor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.cc b/paddle/fluid/inference/capi_exp/pd_tensor.cc new file mode 100644 index 00000000000000..9c661dea6f2bb2 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/inference/capi_exp/pd_tensor.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/types_internal.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define CHECK_AND_CONVERT_PD_TENSOR \ + PADDLE_ENFORCE_NOT_NULL( \ + pd_tensor, paddle::platform::errors::InvalidArgument( \ + "The pointer of paddle tensor shouldn't be nullptr")); \ + auto& tensor = pd_tensor->tensor + +extern "C" { + +void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor) { delete pd_tensor; } +void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, size_t shape_size, + int32_t* shape) { + CHECK_AND_CONVERT_PD_TENSOR; + std::vector shapes(shape_size); + for (size_t index = 0; index < shape_size; ++index) { + shapes[index] = shape[index]; + } + tensor->Reshape(shapes); +} + +#define REPEAT_ALL_DATA_TYPE(func) \ + func(float, Float) func(int64_t, Int64) func(int32_t, Int32) \ + func(uint8_t, Uint8) func(int8_t, Int8) + +#define PD_TENSOR_MUTABLE_DATA_IMPL(type, Type) \ + type* PD_TensorMutableData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType place) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + return tensor->mutable_data(paddle_infer::CvtToCxxPlaceType(place)); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_MUTABLE_DATA_IMPL) +#undef PD_TENSOR_MUTABLE_DATA_IMPL + +#define PD_TENSOR_DATA_IMPL(type, Type) \ + type* PD_TensorData##Type(__pd_keep PD_Tensor* pd_tensor, \ + PD_PlaceType* place, int32_t* size) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + PADDLE_ENFORCE_NOT_NULL(place, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of place shouldn't be nullptr")); \ + PADDLE_ENFORCE_NOT_NULL(size, \ + paddle::platform::errors::InvalidArgument( \ + "The pointer of size shouldn't be nullptr")); \ + paddle_infer::PlaceType cxx_palce_type; \ + int cxx_size; \ + 
type* data = tensor->data(&cxx_palce_type, &cxx_size); \ + *place = paddle_infer::CvtFromCxxPlaceType(cxx_palce_type); \ + *size = static_cast(cxx_size); \ + return data; \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_DATA_IMPL) +#undef PD_TENSOR_DATA_IMPL + +#define PD_TENSOR_COPY_FROM_CPU_IMPL(type, Type) \ + void PD_TensorCopyFromCpu##Type(__pd_keep PD_Tensor* pd_tensor, \ + const type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyFromCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_FROM_CPU_IMPL) +#undef PD_TENSOR_COPY_FROM_CPU_IMPL + +#define PD_TENSOR_COPY_TO_CPU_IMPL(type, Type) \ + void PD_TensorCopyToCpu##Type(__pd_keep PD_Tensor* pd_tensor, type* data) { \ + CHECK_AND_CONVERT_PD_TENSOR; \ + tensor->CopyToCpu(data); \ + } +REPEAT_ALL_DATA_TYPE(PD_TENSOR_COPY_TO_CPU_IMPL) +#undef PD_TENSOR_COPY_TO_CPU_IMPL + +#undef REPEAT_ALL_DATA_TYPE + +__pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToOneDimArrayInt32(tensor->shape()); +} +void PD_TensorSetLod(__pd_keep PD_Tensor* pd_tensor, + __pd_keep PD_TwoDimArraySize* lod) { + CHECK_AND_CONVERT_PD_TENSOR; + tensor->SetLoD(paddle_infer::CvtTwoDimArrayToVecSize(lod)); +} +__pd_give PD_TwoDimArraySize* PD_TensorGetLod(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtVecToTwoDimArraySize(tensor->lod()); +} +const char* PD_TensorGetName(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return tensor->name().c_str(); +} +PD_DataType PD_TensorGetDataType(__pd_keep PD_Tensor* pd_tensor) { + CHECK_AND_CONVERT_PD_TENSOR; + return paddle_infer::CvtFromCxxDatatype(tensor->type()); +} + +} // extern "C" diff --git a/paddle/fluid/inference/capi_exp/pd_tensor.h b/paddle/fluid/inference/capi_exp/pd_tensor.h new file mode 100644 index 00000000000000..29ea4b5d62e43c --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_tensor.h @@ -0,0 +1,287 @@ +// Copyright (c) 2021 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_tensor.h +/// +/// \brief interface for paddle tensor +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include "pd_common.h" // NOLINT + +typedef struct PD_Tensor PD_Tensor; +typedef struct PD_OneDimArrayInt32 PD_OneDimArrayInt32; +typedef struct PD_TwoDimArraySize PD_TwoDimArraySize; + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the paddle tensor +/// +/// \param[in] pd_tensor tensor +/// +PADDLE_CAPI_EXPORT extern void PD_TensorDestroy(__pd_take PD_Tensor* pd_tensor); + +/// +/// \brief Reset the shape of the tensor. +/// Generally it's only used for the input tensor. +/// Reshape must be called before calling PD_TensorMutableData*() or +/// PD_TensorCopyFromCpu*() +/// +/// \param[in] pd_tensor tensor. +/// \param[in] shape_size The size of shape. +/// \param[in] shape The shape to set. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorReshape(__pd_keep PD_Tensor* pd_tensor, + size_t shape_size, + int32_t* shape); + +/// +/// \brief Get the memory pointer in CPU or GPU with 'float' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. 
+/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorMutableDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int64_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorMutableDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int32_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorMutableDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'uint8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorMutableDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer in CPU or GPU with 'int8_t' data type. +/// Please Reshape the tensor first before call this. +/// It's usually used to get input data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[in] place The place of the tensor. +/// \return Memory pointer of pd_tensor +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorMutableDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType place); +/// +/// \brief Get the memory pointer directly. 
+/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern float* PD_TensorDataFloat( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int64_t* PD_TensorDataInt64( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int32_t* PD_TensorDataInt32( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. +/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern uint8_t* PD_TensorDataUint8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Get the memory pointer directly. +/// It's usually used to get the output data pointer. +/// +/// \param[in] pd_tensor tensor. +/// \param[out] place To get the device type of the tensor. +/// \param[out] size To get the data size of the tensor. 
+/// \return The tensor data buffer pointer. +/// +PADDLE_CAPI_EXPORT extern int8_t* PD_TensorDataInt8( + __pd_keep PD_Tensor* pd_tensor, PD_PlaceType* place, int32_t* size); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuFloat( + __pd_keep PD_Tensor* pd_tensor, const float* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt64( + __pd_keep PD_Tensor* pd_tensor, const int64_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt32( + __pd_keep PD_Tensor* pd_tensor, const int32_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuUint8( + __pd_keep PD_Tensor* pd_tensor, const uint8_t* data); +/// +/// \brief Copy the host memory to tensor data. +/// It's usually used to set the input tensor data. +/// \param[in] pd_tensor tensor. +/// \param[in] data The pointer of the data, from which the tensor will copy. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyFromCpuInt8( + __pd_keep PD_Tensor* pd_tensor, const int8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. 
+/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuFloat( + __pd_keep PD_Tensor* pd_tensor, float* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt64( + __pd_keep PD_Tensor* pd_tensor, int64_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt32( + __pd_keep PD_Tensor* pd_tensor, int32_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuUint8( + __pd_keep PD_Tensor* pd_tensor, uint8_t* data); +/// +/// \brief Copy the tensor data to the host memory. +/// It's usually used to get the output tensor data. +/// \param[in] pd_tensor tensor. +/// \param[out] data The tensor will copy the data to the address. +/// +PADDLE_CAPI_EXPORT extern void PD_TensorCopyToCpuInt8( + __pd_keep PD_Tensor* pd_tensor, int8_t* data); +/// +/// \brief Get the tensor shape +/// \param[in] pd_tensor tensor. +/// \return The tensor shape. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_OneDimArrayInt32* PD_TensorGetShape( + __pd_keep PD_Tensor* pd_tensor); + +/// +/// \brief Set the tensor lod information +/// \param[in] pd_tensor tensor. +/// \param[in] lod lod information. 
+/// +PADDLE_CAPI_EXPORT extern void PD_TensorSetLod( + __pd_keep PD_Tensor* pd_tensor, __pd_keep PD_TwoDimArraySize* lod); +/// +/// \brief Get the tensor lod information +/// \param[in] pd_tensor tensor. +/// \return the lod information. +/// +PADDLE_CAPI_EXPORT extern __pd_give PD_TwoDimArraySize* PD_TensorGetLod( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor name +/// \param[in] pd_tensor tensor. +/// \return the tensor name. +/// +PADDLE_CAPI_EXPORT extern const char* PD_TensorGetName( + __pd_keep PD_Tensor* pd_tensor); +/// +/// \brief Get the tensor data type +/// \param[in] pd_tensor tensor. +/// \return the tensor data type. +/// +PADDLE_CAPI_EXPORT extern PD_DataType PD_TensorGetDataType( + __pd_keep PD_Tensor* pd_tensor); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/pd_types.h b/paddle/fluid/inference/capi_exp/pd_types.h new file mode 100644 index 00000000000000..a5da2913a9b207 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_types.h @@ -0,0 +1,40 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "pd_common.h" // NOLINT + +typedef struct PD_OneDimArrayInt32 { + size_t size; + int32_t* data; +} PD_OneDimArrayInt32; // std::vector + +typedef struct PD_OneDimArraySize { + size_t size; + size_t* data; +} PD_OneDimArraySize; // std::vector + +typedef struct PD_OneDimArrayCstr { + size_t size; + char** data; +} PD_OneDimArrayCstr; // std::vector + +typedef struct PD_TwoDimArraySize { + size_t size; + PD_OneDimArraySize** data; +} PD_TwoDimArraySize; // std::vector> diff --git a/paddle/fluid/inference/capi_exp/pd_utils.cc b/paddle/fluid/inference/capi_exp/pd_utils.cc new file mode 100644 index 00000000000000..2e762619f5567c --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_utils.h" +#include "paddle/fluid/inference/capi_exp/utils_internal.h" +#include "paddle/fluid/platform/enforce.h" + +#define DESTROY_ONE_DIM_ARRAY(type) \ + void PD_OneDimArray##type##Destroy(__pd_take PD_OneDimArray##type* array) { \ + if (array != NULL) { \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_OneDimArray##Type* CvtVecToOneDimArray##Type( \ + const std::vector& vec) { \ + PD_OneDimArray##Type* array = new PD_OneDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? NULL : new type[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = vec[index]; \ + } \ + return array; \ + } +#define CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector CvtOneDimArrayToVec##Type( \ + __pd_keep const PD_OneDimArray##Type* array) { \ + std::vector vec; \ + if (array != NULL) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = array->data[index]; \ + } \ + } \ + return vec; \ + } + +#define ONE_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_ONE_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_ONE_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_ONE_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(int32_t, Int32, int) +ONE_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef ONE_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_ONE_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_ONE_DIM_ARRAY +#undef DESTROY_ONE_DIM_ARRAY + +void PD_OneDimArrayCstrDestroy(__pd_take PD_OneDimArrayCstr* array) { + if (array != NULL) { + if (array->size != 0) { + for (size_t index = 0; index < array->size; ++index) { + delete[] array->data[index]; + } + } + delete[] array->data; + delete array; + 
} +} +namespace paddle_infer { + +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec) { + PD_OneDimArrayCstr* array = new PD_OneDimArrayCstr; + array->size = vec.size(); + array->data = vec.empty() ? NULL : new char*[vec.size()]; + for (size_t index = 0u; index < vec.size(); ++index) { + array->data[index] = new char[vec[index].size() + 1]; + memcpy(array->data[index], vec[index].c_str(), vec[index].size() + 1); + } + return array; +} + +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array) { + std::vector vec; + for (size_t index = 0; index < array->size; ++index) { + vec.emplace_back(array->data[index]); + } + return vec; +} + +} // namespace paddle_infer + +#define DESTROY_TWO_DIM_ARRAY(type) \ + void PD_TwoDimArray##type##Destroy(__pd_take PD_TwoDimArray##type* array) { \ + if (array != NULL) { \ + if (array->size != 0) { \ + for (size_t index = 0; index < array->size; ++index) { \ + PD_OneDimArray##type##Destroy(array->data[index]); \ + } \ + } \ + delete[] array->data; \ + delete array; \ + } \ + } +#define CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + __pd_give PD_TwoDimArray##Type* CvtVecToTwoDimArray##Type( \ + const std::vector>& vec) { \ + PD_TwoDimArray##Type* array = new PD_TwoDimArray##Type; \ + array->size = vec.size(); \ + array->data = vec.empty() ? 
NULL : new PD_OneDimArray##Type*[vec.size()]; \ + for (size_t index = 0; index < vec.size(); ++index) { \ + array->data[index] = CvtVecToOneDimArray##Type(vec[index]); \ + } \ + return array; \ + } +#define CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + std::vector> CvtTwoDimArrayToVec##Type( \ + __pd_keep const PD_TwoDimArray##Type* array) { \ + std::vector> vec; \ + if (array != NULL && array->size != 0) { \ + vec.resize(array->size); \ + for (size_t index = 0; index < array->size; ++index) { \ + vec[index] = CvtOneDimArrayToVec##Type((array->data)[index]); \ + } \ + } \ + return vec; \ + } +#define TWO_DIM_ARRAY_UTILS_FUNC_IMPL(type, Type, vec_type) \ + extern "C" { \ + DESTROY_TWO_DIM_ARRAY(Type); \ + } \ + namespace paddle_infer { \ + CONVERT_VEC_TO_TWO_DIM_ARRAY(type, Type, vec_type) \ + CONVERT_TWO_DIM_ARRAY_TO_VEC(type, Type, vec_type) \ + } + +TWO_DIM_ARRAY_UTILS_FUNC_IMPL(size_t, Size, size_t) + +#undef TWO_DIM_ARRAY_UTILS_FUNC_IMPL +#undef CONVERT_TWO_DIM_ARRAY_TO_VEC +#undef CONVERT_VEC_TO_TWO_DIM_ARRAY +#undef DESTROY_TWO_DIM_ARRAY + +namespace paddle_infer { + +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type) { + switch (place_type) { + case PD_PLACE_UNK: + return PlaceType::kUNK; + case PD_PLACE_CPU: + return PlaceType::kCPU; + case PD_PLACE_GPU: + return PlaceType::kGPU; + case PD_PLACE_XPU: + return PlaceType::kXPU; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle place type %d.", place_type)); + return PlaceType::kUNK; + } +} + +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type) { + switch (place_type) { + case PlaceType::kCPU: + return PD_PLACE_CPU; + case PlaceType::kGPU: + return PD_PLACE_GPU; + case PlaceType::kXPU: + return PD_PLACE_XPU; + default: + return PD_PLACE_UNK; + } +} + +DataType CvtToCxxDatatype(PD_DataType data_type) { + switch (data_type) { + case PD_DATA_FLOAT32: + return DataType::FLOAT32; + case PD_DATA_INT64: + return DataType::INT64; + case PD_DATA_INT32: + return 
DataType::INT32; + case PD_DATA_UINT8: + return DataType::UINT8; + default: + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Unsupport paddle data type %d.", data_type)); + return DataType::FLOAT32; + } +} + +PD_DataType CvtFromCxxDatatype(DataType data_type) { + switch (data_type) { + case DataType::FLOAT32: + return PD_DATA_FLOAT32; + case DataType::INT64: + return PD_DATA_INT64; + case DataType::INT32: + return PD_DATA_INT32; + case DataType::UINT8: + return PD_DATA_UINT8; + default: + return PD_DATA_UNK; + } +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/capi_exp/pd_utils.h b/paddle/fluid/inference/capi_exp/pd_utils.h new file mode 100644 index 00000000000000..68e519d4bb5e95 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/pd_utils.h @@ -0,0 +1,70 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file pd_utils.h +/// +/// \brief Some utility function to destroy paddle struct. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include + +#include "pd_types.h" // NOLINT + +#ifdef __cplusplus +extern "C" { +#endif + +/// +/// \brief Destroy the PD_OneDimArrayInt32 object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayInt32 object. 
+/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayInt32Destroy( + __pd_take PD_OneDimArrayInt32* array); + +/// +/// \brief Destroy the PD_OneDimArrayCstr object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArrayCstr object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArrayCstrDestroy( + __pd_take PD_OneDimArrayCstr* array); + +/// +/// \brief Destroy the PD_OneDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_OneDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_OneDimArraySizeDestroy( + __pd_take PD_OneDimArraySize* array); + +/// +/// \brief Destroy the PD_TwoDimArraySize object pointed to by the pointer. +/// +/// \param[in] array pointer to the PD_TwoDimArraySize object. +/// +PADDLE_CAPI_EXPORT extern void PD_TwoDimArraySizeDestroy( + __pd_take PD_TwoDimArraySize* array); + +#ifdef __cplusplus +} // extern "C" +#endif diff --git a/paddle/fluid/inference/capi_exp/types_internal.h b/paddle/fluid/inference/capi_exp/types_internal.h new file mode 100644 index 00000000000000..8a61b9a884c3bf --- /dev/null +++ b/paddle/fluid/inference/capi_exp/types_internal.h @@ -0,0 +1,29 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_common.h" + +typedef struct PD_Tensor { + std::unique_ptr tensor; +} PD_Tensor; + +typedef struct PD_Predictor { + std::shared_ptr predictor; +} PD_Predictor; diff --git a/paddle/fluid/inference/capi_exp/utils_internal.h b/paddle/fluid/inference/capi_exp/utils_internal.h new file mode 100644 index 00000000000000..fbae512ecd8557 --- /dev/null +++ b/paddle/fluid/inference/capi_exp/utils_internal.h @@ -0,0 +1,153 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/// +/// \file utils_internal.h +/// +/// \brief Some utility function used to convert object between C Struct and C++ +/// Class. +/// +/// \author paddle-infer@baidu.com +/// \date 2021-04-21 +/// \since 2.1 +/// + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/capi_exp/pd_types.h" + +namespace paddle_infer { + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArrayInt32' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayInt32* CvtVecToOneDimArrayInt32( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayInt32' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. 
+/// \return target object. +/// +std::vector CvtOneDimArrayToVecInt32( + __pd_keep const PD_OneDimArrayInt32* array); + +/// +/// \brief Convert the 'std::vector' object to a 'PD_OneDimArraySize' +/// object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArraySize* CvtVecToOneDimArraySize( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArraySize' object to a 'std::vector' +/// object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecSize( + __pd_keep const PD_OneDimArraySize* array); + +/// +/// \brief Convert the 'std::vector' object to a +/// 'PD_OneDimArrayCstr' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_OneDimArrayCstr* CvtVecToOneDimArrayCstr( + const std::vector& vec); + +/// +/// \brief Convert the 'PD_OneDimArrayCstr' object to a +/// 'std::vector' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector CvtOneDimArrayToVecCstr( + __pd_keep const PD_OneDimArrayCstr* array); + +/// +/// \brief Convert the 'std::vector>' object to a +/// 'PD_TwoDimArraySize' object. +/// +/// \param[in] vec source object. +/// \return target object. +/// +__pd_give PD_TwoDimArraySize* CvtVecToTwoDimArraySize( + const std::vector>& vec); + +/// +/// \brief Convert the 'PD_TwoDimArraySize' object to a +/// 'std::vector>' object. +/// +/// \param[in] array source object. +/// \return target object. +/// +std::vector> CvtTwoDimArrayToVecSize( + __pd_keep const PD_TwoDimArraySize* array); + +/// +/// \brief Convert the 'PD_PlaceType' object to a 'paddle_infer::PlaceType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PlaceType CvtToCxxPlaceType(PD_PlaceType place_type); + +/// +/// \brief Convert the 'paddle_infer::PlaceType' object to a 'PD_PlaceType' +/// object. +/// +/// \param[in] place_type source object. 
+/// \return target object. +/// +PD_PlaceType CvtFromCxxPlaceType(PlaceType place_type); + +/// +/// \brief Convert the 'PD_DataType' object to a 'paddle_infer::DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +DataType CvtToCxxDatatype(PD_DataType data_type); + +/// +/// \brief Convert the 'paddle_infer::DataType' object to a 'PD_DataType' +/// object. +/// +/// \param[in] place_type source object. +/// \return target object. +/// +PD_DataType CvtFromCxxDatatype(DataType data_type); + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 59a786e46c98bf..908e1ab990bb73 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -59,8 +59,14 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( #endif #ifdef LITE_SUBGRAPH_WITH_XPU + // Deprecated in Paddle-Lite release/v2.8 lite_cxx_config.set_xpu_workspace_l3_size_per_thread( cfg.xpu_l3_workspace_size); + lite_cxx_config.set_xpu_l3_cache_method(cfg.xpu_l3_workspace_size, + cfg.locked); + lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file); + lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, + cfg.adaptive_seqlen); #endif // create predictor diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 5ba487cc24d7d5..a64ef1eda828bf 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -42,6 +42,11 @@ struct EngineConfig { // for xpu size_t xpu_l3_workspace_size; + bool locked = false; + bool autotune = true; + std::string autotune_file = ""; + std::string precision = "int16"; + bool adaptive_seqlen = false; // for x86 or arm int cpu_math_library_num_threads{1}; diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index a6484a13557052..7ea41839cb939f 100644 --- 
a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -38,38 +38,6 @@ class BatchNormOpConverter : public OpConverter { VLOG(3) << "convert a fluid batch norm op to tensorrt batch_norm"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "Invalid input X's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Bias's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Mean").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Mean's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Mean").size())); // Mean is a weight - PADDLE_ENFORCE_EQ(op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Scale's size of batch_norm TRT " - "converter. Expected 1, received %d.", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Variance").size(), 1, - platform::errors::InvalidArgument( - "Invalid input Variance's size of batch_norm TRT converter. " - "Expected 1, received %d.", - op_desc.Input("Variance").size())); // Variance is a weight - PADDLE_ENFORCE_EQ(op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "Invalid output Y's size of batch_norm TRT " - "converter. 
Expected 1, received %d.", - op_desc.Output("Y").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); // Declare weights auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc index 5515cd35daedc7..61199724bcfe30 100644 --- a/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/conv2d_op.cc @@ -36,18 +36,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, VLOG(3) << "convert a fluid " << name << " op to tensorrt layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("Input").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 input, but got %d input.", - op_desc.Input("Input").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Filter").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 filter, but got %d filter.", - op_desc.Input("Filter").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Output").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Conv2d expect 1 output, but got %d output.", - op_desc.Output("Output").size())); auto* X = engine->GetITensor(op_desc.Input("Input").front()); std::string filter_var_name = op_desc.Input("Filter").front(); @@ -61,13 +49,6 @@ void ConvertConv2d(TensorRTEngine* engine, const framework::proto::OpDesc& op, if (enable_int8) { #if IS_TRT_VERSION_GE(5000) - if (op_desc.Type() != "conv2d_transpose") { - PADDLE_ENFORCE_EQ( - op_desc.HasAttr("Input_scale"), true, - platform::errors::InvalidArgument("Input scale not found. 
TRT int8" - " requires conv/deconv to have " - "input quantization scales.")); - } float in_scale = BOOST_GET_CONST(float, op_desc.GetAttr("Input_scale")) * 127; auto weight_scale = @@ -179,19 +160,11 @@ class Deconv2dOpConverter : public OpConverter { nvinfer1::DimsHW& ksize, TensorRTEngine::Weight& weight, TensorRTEngine::Weight& bias) -> nvinfer1::IDeconvolutionLayer* { auto* layer = - TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_input, + TRT_ENGINE_ADD_LAYER(engine_, Deconvolution, *inputs, n_output, ksize, weight.get(), bias.get()); return layer; }, [](nvinfer1::IDeconvolutionLayer* layer, nvinfer1::DimsHW& dilations) { - // In trt Deconv, dilation should be 1, ohter values are not - // supported. - bool condition = (dilations.d[0] == 1 && dilations.d[1] == 1); - PADDLE_ENFORCE_EQ(condition, true, - platform::errors::InvalidArgument( - "In Deconv, Dilations must be (1, 1) for " - "tensorRT, but given (%d, %d)", - dilations.d[0], dilations.d[1])); }, "conv2d_transpose"); } diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 74057addecd1f9..19d79510547ecc 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -25,6 +25,10 @@ static bool CheckDims(const nvinfer1::Dims& dims_x, return false; } for (int i = 0; i < dims_x.nbDims; i++) { + // conservative judgment + if (dims_x.d[i] == -1 || dims_y.d[i] == -1) { + return false; + } if (dims_x.d[i] != dims_y.d[i]) { return false; } @@ -43,25 +47,6 @@ class ElementwiseWeightOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); VLOG(3) << "Convert a fluid elementwise op to TensorRT IScaleLayer"; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - 
PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but reveceid Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y_v = scope.FindVar(op_desc.Input("Y").front()); PADDLE_ENFORCE_NOT_NULL( @@ -193,25 +178,6 @@ class ElementwiseTensorOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"X\").size() " - "should equal to 1, but received Input(\"X\").size() = %u.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ( - op_desc.Input("Y").size(), 1, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\").size() " - "should equal to 1, but received Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); // Y is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Out").size(), 1, - platform::errors::InvalidArgument( - "The input op's Output(\"Out\").size() " - "should equal to 1, but received Output(\"Out\").size() = %u.", - op_desc.Output("Out").size())); - auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Y = engine_->GetITensor(op_desc.Input("Y").front()); std::vector itensors; diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index f13f1724541239..66a682db07b911 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -34,13 +34,17 @@ class 
EmbEltwiseLayerNormOpConverter : public OpConverter { VLOG(4) << "convert fluid EmbEltwiseLayerNorm op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - auto id_names = op_desc.Input("Ids"); - auto emb_names = op_desc.Input("Embs"); + auto word_id_name = op_desc.Input("WordId").front(); + auto pos_id_name = op_desc.Input("PosId").front(); + auto sent_id_name = op_desc.Input("SentId").front(); + auto word_emb_name = op_desc.Input("WordEmbedding").front(); + auto pos_emb_name = op_desc.Input("PosEmbedding").front(); + auto sent_emb_name = op_desc.Input("SentEmbedding").front(); + std::vector id_names = {word_id_name, pos_id_name, + sent_id_name}; + std::vector emb_names = {word_emb_name, pos_emb_name, + sent_emb_name}; - PADDLE_ENFORCE_EQ(id_names.size(), emb_names.size(), - platform::errors::InvalidArgument( - "The id and emb size of fused EmbEltwiseLayerNormOp " - "should be same ")); int input_num = id_names.size(); // Declare inputs @@ -91,99 +95,96 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; bool enable_int8 = op_desc.HasAttr("enable_int8"); - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); - if (enable_int8) { - output_fp16 = 1; - } - PADDLE_ENFORCE_EQ( - output_fp16, 1, - platform::errors::InvalidArgument( - "Only Precision::KHalf(fp16) is supported when infering " - "ernie(bert) model with config.EnableTensorRtOSS(). 
" - "But Precision::KFloat32 is setted.")); - const std::vector fields{ - {"bert_embeddings_layernorm_beta", bias, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(bias_size)}, - {"bert_embeddings_layernorm_gamma", scale, - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(scale_size)}, - {"bert_embeddings_word_embeddings", input_embs[0], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[0])}, - {"bert_embeddings_token_type_embeddings", input_embs[2], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[2])}, - {"bert_embeddings_position_embeddings", input_embs[1], - nvinfer1::PluginFieldType::kFLOAT32, - static_cast(emb_sizes[1])}, - {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, - }; - - // remember to free - nvinfer1::PluginFieldCollection* plugin_ptr = - static_cast( - malloc(sizeof(*plugin_ptr) + - fields.size() * sizeof(nvinfer1::PluginField))); - plugin_ptr->nbFields = static_cast(fields.size()); - plugin_ptr->fields = fields.data(); - - std::vector plugin_inputs; - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(0)->getName())); // word_embedding, - // eval_placeholder_0 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(1)->getName())); // sent_embedding, - // eval_placeholder_1 - plugin_inputs.emplace_back(engine_->GetITensor( - engine_->network()->getInput(2)->getName())); // cu_seqlens, - // eval_placeholder_2 - auto max_seqlen_tensor = - engine_->GetITensor(engine_->network()->getInput(3)->getName()); - auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( - engine_, Shuffle, - *const_cast(max_seqlen_tensor)); - nvinfer1::Dims shape_dim; - shape_dim.nbDims = 1; - shape_dim.d[0] = -1; - shuffle_layer->setReshapeDimensions(shape_dim); - plugin_inputs.emplace_back( - shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 - - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomEmbLayerNormPluginDynamic", "2"); - - auto plugin_obj = 
creator->createPlugin( - "CustomEmbLayerNormPluginDynamic", plugin_ptr); - auto plugin_layer = engine_->network()->addPluginV2( - plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); - layer = plugin_layer; - free(plugin_ptr); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", - {output_name, std::string("qkv_plugin_mask")}, - test_mode); - } else { - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - plugin::DynamicPluginTensorRT* plugin = nullptr; - plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, with_fp16); - layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, - test_mode); + if (engine_->use_oss()) { + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); + if (enable_int8) { + output_fp16 = 1; } + PADDLE_ENFORCE_EQ( + input_num, 3, + platform::errors::InvalidArgument( + "When using oss and var-len, embedding_eltwise_layernorm op" + "should have 3 inputs only, but got %d.", + input_num)); + PADDLE_ENFORCE_EQ( + output_fp16, 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when infering " + "ernie(bert) model with config.EnableTensorRtOSS(). 
" + "But Precision::KFloat32 is setted.")); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back( + engine_->GetITensor(word_id_name)); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back( + engine_->GetITensor(sent_id_name)); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back( + engine_->GetITensor(pos_id_name)); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = + engine_->GetITensor(engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *max_seqlen_tensor); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back( + shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = 
engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, + test_mode); } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + plugin::DynamicPluginTensorRT* plugin = nullptr; + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps, with_fp16); + layer = engine_->AddDynamicPlugin(input_ids.data(), input_num, plugin); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + test_mode); } #else diff --git a/paddle/fluid/inference/tensorrt/convert/fc_op.cc b/paddle/fluid/inference/tensorrt/convert/fc_op.cc index 194d76c737c7f9..6167e68df2b673 100644 --- a/paddle/fluid/inference/tensorrt/convert/fc_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/fc_op.cc @@ -160,66 +160,67 @@ class FcOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { // not NCHW layout, but NLP layout with added 'x 1 x 1' auto x_dim = X->getDimensions(); - if (x_dim.nbDims == 3 || x_dim.nbDims == 2) { - auto output_name = op_desc.Output("Out").front(); - // add shuffle before fc - nvinfer1::Dims reshape_before_fc_dim; - reshape_before_fc_dim.nbDims = x_dim.nbDims + 2; - for (int i = 0; i < x_dim.nbDims; i++) { - reshape_before_fc_dim.d[i] = 0; - } - reshape_before_fc_dim.d[x_dim.nbDims] = 1; - 
reshape_before_fc_dim.d[x_dim.nbDims + 1] = 1; - auto* reshape_before_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); - reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); - reshape_before_fc_layer->setName( - ("shuffle_before_fc(Output: " + output_name + ")").c_str()); + if (engine_->use_oss() && engine_->with_ernie() && x_dim.nbDims == 4 && + x_dim.d[2] == 1 && x_dim.d[3] == 1 && x_num_col_dims == 2) { + // fc which is just after self attention + regist_fc(X, n_output, weight, bias); + return; + } + PADDLE_ENFORCE_LE( + x_dim.nbDims - x_num_col_dims, 3, + platform::errors::InvalidArgument( + "Params and input dims mismatch. Paddle-TRT FC " + "converter expects x_dim.nbDims - x_num_col_dims <= 3, but " + "x_dim.nbDims = %d, x_num_col_dims = %d.", + x_dim.nbDims, x_num_col_dims)); + auto output_name = op_desc.Output("Out").front(); + // add shuffle before fc + nvinfer1::Dims reshape_before_fc_dim; + // padding shape "x 1 x 1" + int padding_length = 3 - (x_dim.nbDims - x_num_col_dims); + reshape_before_fc_dim.nbDims = x_dim.nbDims + padding_length; + int cur_dim_index = reshape_before_fc_dim.nbDims - 1; + while (padding_length-- > 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 1; + } + while (cur_dim_index >= 0) { + reshape_before_fc_dim.d[cur_dim_index--] = 0; + } - // add fc layer - auto* fc_layer = TRT_ENGINE_ADD_LAYER( - engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), - n_output, weight.get(), bias.get()); - fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); + auto* reshape_before_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *X); + reshape_before_fc_layer->setReshapeDimensions(reshape_before_fc_dim); + reshape_before_fc_layer->setName( + ("shuffle_before_fc(Output: " + output_name + ")").c_str()); - // add shuffle after fc - nvinfer1::Dims reshape_after_fc_dim; - if (x_dim.nbDims == 3) { - if (x_num_col_dims == 2) { - reshape_after_fc_dim.nbDims = 3; - reshape_after_fc_dim.d[0] = 0; - 
reshape_after_fc_dim.d[1] = 0; - reshape_after_fc_dim.d[2] = 0; - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - auto dim = fc_layer->getOutput(0)->getDimensions(); - reshape_after_fc_dim.d[1] = dim.d[1] * dim.d[2]; - } - // x_dim.nbDims == 2 - } else { - reshape_after_fc_dim.nbDims = 2; - reshape_after_fc_dim.d[0] = 0; - reshape_after_fc_dim.d[1] = 0; - } - auto* reshape_after_fc_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); - reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + // add fc layer + auto* fc_layer = TRT_ENGINE_ADD_LAYER( + engine_, FullyConnected, *reshape_before_fc_layer->getOutput(0), + n_output, weight.get(), bias.get()); + fc_layer->setName(("fc_layer(Output: " + output_name + ")").c_str()); - if (activation_type == "relu") { - reshape_after_fc_layer->setName( - ("shuffle_after_fc(Output: " + output_name + ")").c_str()); - nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( - engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), - nvinfer1::ActivationType::kRELU); - RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", - {output_name}, test_mode); - } else { - RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", - {output_name}, test_mode); - } + // add shuffle after fc + nvinfer1::Dims reshape_after_fc_dim; + reshape_after_fc_dim.nbDims = x_num_col_dims + 1; + for (int i = 0; i < reshape_after_fc_dim.nbDims; i++) { + reshape_after_fc_dim.d[i] = 0; + } + + auto* reshape_after_fc_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *fc_layer->getOutput(0)); + reshape_after_fc_layer->setReshapeDimensions(reshape_after_fc_dim); + + if (activation_type == "relu") { + reshape_after_fc_layer->setName( + ("shuffle_after_fc(Output: " + output_name + ")").c_str()); + nvinfer1::IActivationLayer* relu_layer = TRT_ENGINE_ADD_LAYER( + engine_, Activation, *(reshape_after_fc_layer->getOutput(0)), + nvinfer1::ActivationType::kRELU); + 
RreplenishLayerAndOutput(relu_layer, "relu_after_fc_shuffle", + {output_name}, test_mode); } else { - regist_fc(X, n_output, weight, bias); + RreplenishLayerAndOutput(reshape_after_fc_layer, "shuffle_after_fc", + {output_name}, test_mode); } return; } diff --git a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc index ca5b6a8b52e797..0436499cd40756 100644 --- a/paddle/fluid/inference/tensorrt/convert/gelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/gelu_op.cc @@ -47,15 +47,7 @@ class GeluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1, - platform::errors::InvalidArgument( - "gelu op has only 1 output, but got %d", output_num)); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { diff --git a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc index 9dc40ceec48094..7ef79e547d09ab 100644 --- a/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/hard_swish_op.cc @@ -41,17 +41,7 @@ class HardSwishOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs int input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ( - input_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 input, but got %d", input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ( - output_num, 1, - platform::errors::InvalidArgument( - "HardSwish op has only 1 output, but got %d", 
output_num)); const float threshold = op_desc.HasAttr("threshold") diff --git a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc index c1f266bacfec57..0b97b5d87a3d50 100644 --- a/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layer_norm_op.cc @@ -25,25 +25,6 @@ class LayerNormOpConverter : public OpConverter { const framework::Scope& scope, bool test_mode) override { VLOG(4) << "convert a fluid layer_norm op to tensorrt layer_norm plugin"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ( - op_desc.Input("X").size(), 1, - platform::errors::InvalidArgument( - "input of layer_norm op converter should be 1, got %d", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Input("Bias").size(), 1, - platform::errors::InvalidArgument( - "Bias of layer_norm op converter should be 1, got %d", - op_desc.Input("Bias").size())); // Bias is a weight - PADDLE_ENFORCE_EQ( - op_desc.Input("Scale").size(), 1, - platform::errors::InvalidArgument( - "Scale of layer_norm op converter should be 1, got %d", - op_desc.Input("Scale").size())); // Scale is a weight - PADDLE_ENFORCE_EQ( - op_desc.Output("Y").size(), 1, - platform::errors::InvalidArgument( - "output of layer_norm op converter should be 1, got %d", - op_desc.Input("Y").size())); auto* X = engine_->GetITensor(op_desc.Input("X").front()); auto* Bias_v = scope.FindVar(op_desc.Input("Bias").front()); diff --git a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc index c2ffb3f3197c15..d6277b5208d5a1 100644 --- a/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/leaky_relu_op.cc @@ -36,21 +36,7 @@ class LeakyReluOpConverter : public OpConverter { VLOG(4) << "convert fluid leaky_relu op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); - // Declare inputs - size_t 
input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "inputs. Expected 1, but received %d", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid number of TRT leaky_relu op converter " - "outputs. Expected 1, but received %d", - output_num)); // Get attrs float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); nvinfer1::ILayer* output_layer = nullptr; diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc index e91a2ee13f4c2d..3940cc5dce1b00 100644 --- a/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_op.cc @@ -65,13 +65,6 @@ class NearestInterpolateOpConverter : public OpConverter { scale_w = scale; } else { // axis are different in static/dynamic mode - PADDLE_ENFORCE_GT( - out_h, 0, platform::errors::InvalidArgument( - "out_h must be greater than 0 if scale is not set.")); - PADDLE_ENFORCE_GT( - out_w, 0, platform::errors::InvalidArgument( - "out_w must be greater than 0 if scale is not set.")); - bool with_dynamic = engine_->with_dynamic_shape(); int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index 8de16df0a2f610..f72ae2c3ec2d7e 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -109,6 +109,12 @@ class OpConverter { it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); } + if (op_desc.Type() == "depthwise_conv2d_transpose") { + it = 
Registry::Global().Lookup("conv2d_transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } if (op_desc.Type() == "transpose2") { it = Registry::Global().Lookup("transpose"); PADDLE_ENFORCE_NOT_NULL( diff --git a/paddle/fluid/inference/tensorrt/convert/pad_op.cc b/paddle/fluid/inference/tensorrt/convert/pad_op.cc index 6bf50e4742dd28..d6711bbbd2cb52 100644 --- a/paddle/fluid/inference/tensorrt/convert/pad_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pad_op.cc @@ -43,8 +43,6 @@ class PadOpConverter : public OpConverter { const std::vector paddings = BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); - const float pad_value = - BOOST_GET_CONST(float, op_desc.GetAttr("pad_value")); nvinfer1::Dims input_shape = input->getDimensions(); int nbDims = input_shape.nbDims; @@ -62,9 +60,6 @@ class PadOpConverter : public OpConverter { "(nbDims + 1) * 2 == pad_size. But " "received nbDims:%d, pad_size:%d.", nbDims, pad_size)); - PADDLE_ENFORCE_EQ(pad_value, 0.0, - platform::errors::InvalidArgument( - "The pad layer of TRT only support zero.")); nvinfer1::DimsHW pre_pad(paddings[pad_size - 4], paddings[pad_size - 2]); nvinfer1::DimsHW post_pad(paddings[pad_size - 3], paddings[pad_size - 1]); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index c10072602d7c51..90d6392fd6404e 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -66,15 +66,6 @@ class Pool2dOpConverter : public OpConverter { VLOG(4) << "convert a fluid pool2d op to tensorrt pool2d layer without bias"; framework::OpDesc op_desc(op, nullptr); - PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1UL, - platform::errors::InvalidArgument( - "TRT Pool2d expect 1 input, but got %d input.", - op_desc.Input("X").size())); - PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1UL, - 
platform::errors::InvalidArgument( - "TRT Pool2d expect 1 Output, but got %d output.", - op_desc.Output("Out").size())); - auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); nvinfer1::Dims input_shape = input1->getDimensions(); int input_dims = input_shape.nbDims; @@ -110,10 +101,6 @@ class Pool2dOpConverter : public OpConverter { nv_pool_type = nvinfer1::PoolingType::kAVERAGE; reduce_operation = nvinfer1::ReduceOperation::kAVG; plugin_pool_type = plugin::PoolPlugin::PoolType::avg; - } else { - PADDLE_THROW(platform::errors::Fatal( - "Wrong pool op type, the trt do not support the %s pool type.", - pool_type)); } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); diff --git a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc index 74d77d8be44937..a8a36e1238168a 100644 --- a/paddle/fluid/inference/tensorrt/convert/prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/prelu_op.cc @@ -31,19 +31,7 @@ class PReluOpConverter : public OpConverter { framework::OpDesc op_desc(op, nullptr); // Declare inputs size_t input_num = op_desc.Input("X").size(); - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of prelu TRT converter. " - "Expected 1, received %d.", - input_num)); auto* input = engine_->GetITensor(op_desc.Input("X")[0]); - // Get output - size_t output_num = op_desc.Output("Out").size(); - PADDLE_ENFORCE_EQ(output_num, 1UL, - platform::errors::InvalidArgument( - "Invalid output Out's size of prelu TRT converter. 
" - "Expected 1, received %d.", - output_num)); // Get attrs std::string mode = BOOST_GET_CONST(std::string, op_desc.GetAttr("mode")); // diff --git a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc index 1329608aecd205..654fe7e0133796 100644 --- a/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/roi_align_op.cc @@ -62,12 +62,6 @@ class RoiAlignOpConverter : public OpConverter { std::vector inputs{input_tensor, rois_tensor}; nvinfer1::ILayer* layer = nullptr; - PADDLE_ENFORCE_EQ( - engine_->with_dynamic_shape(), true, - platform::errors::InvalidArgument( - "TRT roi align plugin only accept the dynamic shape, because that " - "the roi_align will change the batch size.")); - auto* roi_align_plugin = new plugin::RoiAlignPluginDynamic( data_type_, pooled_height, pooled_width, spatial_scale, sampling_ratio); auto roi_align_layer = engine_->network()->addPluginV2( diff --git a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc index bf1f82076a66ce..0fdc262f7e740b 100644 --- a/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/shuffle_channel_op.cc @@ -50,12 +50,6 @@ class ShuffleChannelOpConverter : public OpConverter { int w = input_dims.d[2]; int group = BOOST_GET_CONST(int, op_desc.GetAttr("group")); - if (engine_->with_dynamic_shape()) { - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, " - "the shuffle_channel op does not support dynamic shape yet")); - } - auto* layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input); nvinfer1::Dims4 reshape_dim(group, c / group, h, w); layer->setReshapeDimensions(reshape_dim); diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index b44bdcef7123c2..e621ac0514109d 100644 
--- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -52,57 +52,57 @@ class SkipLayerNormOpConverter : public OpConverter { bool enable_int8 = op_desc.HasAttr("enable_int8"); nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { - if (engine_->use_oss()) { - auto creator = GetPluginRegistry()->getPluginCreator( - "CustomSkipLayerNormPluginDynamic", "2"); - assert(creator != nullptr); - int type = static_cast((engine_->WithFp16() == 1) - ? nvinfer1::DataType::kHALF - : nvinfer1::DataType::kFLOAT); - int ld = input1->getDimensions().d[2]; // hidden dimension - assert(ld > 0); - - if (enable_int8) { - type = static_cast(nvinfer1::DataType::kHALF); - } - - const std::vector fields{ - {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, - {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, - {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, - {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, - }; - nvinfer1::PluginFieldCollection* pluginPtr = - static_cast( - malloc(sizeof(*pluginPtr) + - fields.size() * - sizeof(nvinfer1::PluginField))); // remember to free - pluginPtr->nbFields = static_cast(fields.size()); - pluginPtr->fields = fields.data(); - - auto pluginObj = creator->createPlugin( - "CustomSkipLayerNormPluginDynamic", pluginPtr); - auto plugin_layer = engine_->network()->addPluginV2( - inputs.data(), inputs.size(), *pluginObj); - - assert(plugin_layer != nullptr); - layer = plugin_layer; - } else { - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::SkipLayerNormPluginDynamic* plugin = - new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, with_fp16); - layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); + + if (engine_->use_oss()) { + auto creator = GetPluginRegistry()->getPluginCreator( + 
"CustomSkipLayerNormPluginDynamic", "2"); + PADDLE_ENFORCE_NE( + creator, nullptr, + platform::errors::InvalidArgument( + "fail to get creator of CustomSkipLayerNormPluginDynamic")); + int type = static_cast((engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + int ld = input1->getDimensions().d[2]; // hidden dimension + PADDLE_ENFORCE_GT(ld, 0, platform::errors::InvalidArgument( + "in CustomSkipLayerNormPluginDynamic hidden " + "dimension should > 0")); + if (enable_int8) { + type = static_cast(nvinfer1::DataType::kHALF); } + + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + }; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + PADDLE_ENFORCE_NE( + plugin_layer, nullptr, + platform::errors::InvalidArgument( + "fail to add CustomSkipLayerNormPluginDynamic layer")); + layer = plugin_layer; } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" - " to set the shape information to run the dynamic shape mode.")); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::SkipLayerNormPluginDynamic* plugin = + new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, + scale_size, eps, with_fp16); + layer = engine_->AddDynamicPlugin(inputs.data(), 2, plugin); } auto output_name = op_desc.Output("Out")[0]; diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index aee39b7cf0c14c..2ab024dff327fd 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -44,15 +44,6 @@ class SliceOpConverter : public OpConverter { std::vector ends = BOOST_GET_CONST(std::vector, op_desc.GetAttr("ends")); - PADDLE_ENFORCE_EQ( - starts.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of starts must be equal to the size of axes.")); - PADDLE_ENFORCE_EQ( - ends.size(), axes.size(), - platform::errors::InvalidArgument( - "The size of ends must be equal to the size of axes.")); - auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { // notice that input shape is [CHW] without batch axis when input has @@ -62,10 +53,6 @@ class SliceOpConverter : public OpConverter { } input_dims.d[0] = 1; // fake batchsize, not useful here for (size_t i = 0; i < axes.size(); i++) { - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE(axes[i], 0, platform::errors::InvalidArgument( - "Invalid slice axis. 
Slice on batch " - "axis is not supported in TensorRT")); if (starts[i] < 0) { starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); } diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 75b317e7bfd90e..47a6dd783a70cf 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -33,17 +33,7 @@ class SplitOpConverter : public OpConverter { size_t output_num = op_desc.Output("Out").size(); // Get Attrs - PADDLE_ENFORCE_EQ(input_num, 1UL, - platform::errors::InvalidArgument( - "Invalid input X's size of split TRT converter. " - "Expected 1, received %d.", - input_num)); int axis = BOOST_GET_CONST(int, op_desc.GetAttr("axis")); - // split on batch is not supported in TensorRT - PADDLE_ENFORCE_NE( - axis, 0, - platform::errors::InvalidArgument( - "Invalid split axis. Split on batch is not supported in TensorRT")); std::vector output_lengths = BOOST_GET_CONST(std::vector, op_desc.GetAttr("sections")); diff --git a/paddle/fluid/inference/tensorrt/convert/stack_op.cc b/paddle/fluid/inference/tensorrt/convert/stack_op.cc index a0292b21124633..6105e10799e552 100644 --- a/paddle/fluid/inference/tensorrt/convert/stack_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/stack_op.cc @@ -58,26 +58,19 @@ class StackOpConverter : public OpConverter { } nvinfer1::ILayer* layer = nullptr; - if (engine_->with_dynamic_shape()) { #if IS_TRT_VERSION_GE(6000) - bool with_fp16 = - engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); - plugin::StackPluginDynamic* plugin = - new plugin::StackPluginDynamic(axis, input_num, with_fp16); - layer = engine_->AddDynamicPlugin(inputs, input_num, plugin); - assert(layer != nullptr); + bool with_fp16 = engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::StackPluginDynamic* plugin = + new plugin::StackPluginDynamic(axis, input_num, with_fp16); + layer = engine_->AddDynamicPlugin(inputs, 
input_num, plugin); + PADDLE_ENFORCE_NOT_NULL( + layer, platform::errors::InvalidArgument( + "trt stack layer in converter could not be created.")); #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif - } else { - PADDLE_THROW(platform::errors::Fatal( - "You are running the Ernie(Bert) model in static" - "shape mode, which is not supported for the time being.\n" - "You can use the config.SetTRTDynamicShapeInfo(...) interface" - " to set the shape information to run the dynamic shape mode.")); - } auto output_name = op_desc.Output("Y").front(); RreplenishLayerAndOutput(layer, "stack", {output_name}, test_mode); free(inputs); diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 971f99e6919722..6158fd130bad8d 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -60,6 +60,9 @@ static nvinfer1::IRuntime* createInferRuntime(nvinfer1::ILogger* logger) { static nvinfer1::IPluginRegistry* GetPluginRegistry() { return static_cast(dy::getPluginRegistry()); } +static int GetInferLibVersion() { + return static_cast(dy::getInferLibVersion()); +} #endif // A logger for create TensorRT infer builder. 
@@ -67,9 +70,12 @@ class NaiveLogger : public nvinfer1::ILogger { public: void log(nvinfer1::ILogger::Severity severity, const char* msg) override { switch (severity) { - case Severity::kINFO: + case Severity::kVERBOSE: VLOG(3) << msg; break; + case Severity::kINFO: + VLOG(2) << msg; + break; case Severity::kWARNING: LOG(WARNING) << msg; break; diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 53225b79780773..48c7b7fdd0d79d 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -42,15 +42,13 @@ struct SimpleOpTypeSetTeller : public Teller { teller_set.insert("multihead_matmul"); teller_set.insert("skip_layernorm"); teller_set.insert("slice"); -#endif -#if IS_TRT_VERSION_GE(7130) - teller_set.insert("group_norm"); + int8_teller_set.insert("fused_embedding_eltwise_layernorm"); int8_teller_set.insert("multihead_matmul"); int8_teller_set.insert("skip_layernorm"); - int8_teller_set.insert("fused_embedding_eltwise_layernorm"); - int8_teller_set.insert("matmul"); - int8_teller_set.insert("stack"); int8_teller_set.insert("slice"); +#endif +#if IS_TRT_VERSION_GE(7130) + teller_set.insert("group_norm"); #endif } @@ -67,6 +65,8 @@ struct SimpleOpTypeSetTeller : public Teller { // use this set for no calib int8. 
std::unordered_set int8_teller_set{"mul", "conv2d", + "matmul", + "stack", "conv2d_fusion", "pool2d", "relu", @@ -102,6 +102,7 @@ struct SimpleOpTypeSetTeller : public Teller { "dropout", "prelu", "conv2d_transpose", + "depthwise_conv2d_transpose", "leaky_relu", "fc", "shuffle_channel", @@ -137,13 +138,95 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; for (auto& teller : tellers_) { - if (op_type == "pool2d" || op_type == "conv2d" || - op_type == "depthwise_conv2d" || op_type == "conv2d_transpose") { + if (op_type == "depthwise_conv2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) return false; } + + if (op_type == "pool2d") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + if (paddings.size() > 2) return false; + if (desc.Input("X").size() != 1) { + VLOG(3) << "TRT Pool2d expect 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "TRT Pool2d has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + if (!desc.HasAttr("pooling_type")) { + return false; + } else { + std::string pool_type = + BOOST_GET_CONST(std::string, desc.GetAttr("pooling_type")); + if (pool_type != "max" && pool_type != "avg") { + VLOG(3) << "Wrong pool op type, the trt do not support the " + << pool_type << " pool type."; + return false; + } + } + } + + if (op_type == "conv2d" || op_type == "conv2d_transpose" || + op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || + op_type == "depthwise_conv2d_transpose") { + std::vector paddings = + BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); + + // conv2d and conv2d_transpose need padding check + if (paddings.size() > 2 && op_type != "conv2d_fusion") return false; + + if (desc.Input("Input").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 input, but got " + << desc.Input("Input").size() << " input."; 
+ return false; + } + + if (desc.Input("Filter").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 filter, but got " + << desc.Input("Filter").size() << " filter."; + return false; + } + + if (desc.HasAttr("enable_int8")) { + if (op_type == "conv2d" || op_type == "conv2d_fusion") { + if (!desc.HasAttr("Input_scale")) { + VLOG(3) << "Input scale not found. TRT int8" + " requires conv/deconv to have " + "input quantization scales."; + return false; + } + } + } + + if (op_type == "conv2d_transpose" || + op_type == "depthwise_conv2d_transpose") { + if (!desc.HasAttr("dilations")) { + return false; + } else { + const std::vector dilations = + BOOST_GET_CONST(std::vector, desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In conv2d_transpose, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + + if (desc.Output("Output").size() != 1) { + VLOG(3) << "TRT Conv2d expect 1 output, but got " + << desc.Output("Output").size() << " output."; + return false; + } + } + if (op_type == "matmul") { auto* block = desc.Block(); for (auto& param_name : desc.Inputs()) { @@ -151,7 +234,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() < 3) { - VLOG(1) + VLOG(3) << "matmul op dims < 3 not supported in tensorrt, but got dims " << shape.size() << ", so jump it."; return false; @@ -189,7 +272,18 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (axis.size() >= nvinfer1::Dims::MAX_DIMS) return false; } } - if (op_type == "flatten2" || op_type == "flatten") { + if (op_type == "flatten2") { + // flatten doesn't support dynamic shape currently + if (!desc.HasAttr("axis")) { + return false; + } else { + if (with_dynamic_shape) return false; + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis != 1) 
return false; + } + } + + if (op_type == "flatten") { // flatten doesn't support dynamic shape currently if (!desc.HasAttr("axis")) { return false; @@ -229,7 +323,7 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* var_desc = block->FindVar(var_name); const auto shape = var_desc->GetShape(); if (shape.size() != 3) { - VLOG(1) << "multiclass_nms op dims != 3 not supported in tensorrt, " + VLOG(3) << "multiclass_nms op dims != 3 not supported in tensorrt, " "but got dims " << shape.size() << ", so jump it."; return false; @@ -252,18 +346,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (registry == nullptr) return false; } - if (op_type == "fc" || op_type == "mul") { - const int x_num_col_dims = - desc.HasAttr("x_num_col_dims") - ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) - : (desc.HasAttr("in_num_col_dims") - ? BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) - : 1); - if (x_num_col_dims != 1 && x_num_col_dims != 2) { - return false; - } - } - if (op_type == "nearest_interp") { std::vector attrs{"data_layout", "interp_method", "align_corners", "scale", @@ -279,6 +361,25 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto interp_method = BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); if (interp_method != "nearest") return false; + + if (!desc.HasAttr("scale") || !desc.HasAttr("out_h") || + !desc.HasAttr("out_w")) { + return false; + } else { + auto scale = BOOST_GET_CONST(float, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(scale > 0.f && (out_h <= 0 && out_w <= 0))) { + if (out_h <= 0) { + VLOG(3) << "out_h must be greater than 0 if scale is not set."; + return false; + } + if (out_w <= 0) { + VLOG(3) << "out_w must be greater than 0 if scale is not set."; + return false; + } + } + } } if (op_type == "roi_align") { @@ -303,6 
+404,235 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (spatial_scale <= 0.f) return false; } + if (op_type == "hard_swish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "HardSwish op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + + if (desc.Output("Out").size() != 1) { + VLOG(3) << "HardSwish op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "batch_norm") { + const std::vector bn_inputs = {"X", "Bias", "Mean", "Scale", + "Variance"}; + for (unsigned int i = 0; i < bn_inputs.size(); i++) { + if (desc.Input(bn_inputs[i]).size() != 1) { + VLOG(3) << "Invalid " << bn_inputs[i] + << "'s size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Input(bn_inputs[i]).size() << "."; + return false; + } + } + + if (desc.Output("Y").size() != 1) { + VLOG(3) << "Invalid output Y's size of batch_norm TRT " + "converter. Expected 1, received " + << desc.Output("Y").size() << "."; + return false; + } + } + + if (op_type == "split") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of split TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (!desc.HasAttr("axis")) { + return false; + } else { + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (axis == 0) { + VLOG(3) << "Invalid split axis. 
Split on batch is not supported in " + "TensorRT"; + return false; + } + } + } + + if (op_type == "slice") { + if (!desc.HasAttr("axes") || !desc.HasAttr("starts") || + !desc.HasAttr("ends")) { + return false; + } else { + std::vector axes = + BOOST_GET_CONST(std::vector, desc.GetAttr("axes")); + std::vector starts = + BOOST_GET_CONST(std::vector, desc.GetAttr("starts")); + std::vector ends = + BOOST_GET_CONST(std::vector, desc.GetAttr("ends")); + if (axes.size() != starts.size() || axes.size() != ends.size()) { + return false; + } + if (!with_dynamic_shape) { + for (size_t i = 0; i < axes.size(); i++) { + if (axes[i] == 0) { + VLOG(3) << "Invalid slice axis. Slice on batch axis is not " + "supported in TensorRT"; + return false; + } + } + } + } + } + + if (op_type == "elementwise_add" || op_type == "elementwise_mul") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "The input op's Input(\"X\").size() " + "should equal to 1, but received Input(\"X\").size() = " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Input("Y").size() != 1) { + VLOG(3) << "The input op's Input(\"Y\").size() " + "should equal to 1, but received Input(\"Y\").size() = " + << desc.Input("Y").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "The input op's Output(\"Out\").size() " + "should equal to 1, but reveceid Output(\"Out\").size() = " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "stack") { + if (!with_dynamic_shape) { + VLOG(3) + << "static shape mode is not supported for TRT stack.\n" + "You can use the config.SetTRTDynamicShapeInfo(...) 
interface" + " to set the shape information to run the dynamic shape " + "mode."; + return false; + } + } + + if (op_type == "fused_embedding_eltwise_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "fused_embedding_eltwise_layernorm should run on dynamic " + "shape mode."; + return false; + } + if (desc.Input("Ids").size() != desc.Input("Embs").size()) { + VLOG(3) << "The id and emb size of fused EmbEltwiseLayerNormOp " + "should be same "; + return false; + } + } + + if (op_type == "gelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "gelu op has only 1 input, but got " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "gelu op has only 1 output, but got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "layer_norm") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "input of layer_norm op converter should be 1, got " + << desc.Input("X").size(); + return false; + } + if (desc.Input("Bias").size() != 1) { + VLOG(3) << "Bias of layer_norm op converter should be 1, got " + << desc.Input("Bias").size(); + return false; + } + if (desc.Input("Scale").size() != 1) { + VLOG(3) << "Scale of layer_norm op converter should be 1, got " + << desc.Input("Scale").size(); + return false; + } + if (desc.Output("Y").size() != 1) { + VLOG(3) << "output of layer_norm op converter should be 1, got " + << desc.Output("Y").size(); + return false; + } + } + + if (op_type == "leaky_relu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid number of TRT leaky_relu op converter " + "inputs. 
Expected 1, but received " + << desc.Input("X").size(); + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "output of leaky_relu op converter should be 1, got " + << desc.Output("Out").size(); + return false; + } + } + + if (op_type == "pad") { + const float pad_value = BOOST_GET_CONST(float, desc.GetAttr("pad_value")); + if (pad_value != 0.0f) { + VLOG(3) << "The pad layer of TRT only support zero."; + return false; + } + } + + if (op_type == "prelu") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of prelu TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of prelu TRT converter. " + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + } + + if (op_type == "roi_align") { + if (!with_dynamic_shape) { + VLOG(3) << "TRT roi align plugin only accept the dynamic shape, " + "because that " + "the roi_align will change the batch size."; + return false; + } + } + + if (op_type == "shuffle_channel") { + if (with_dynamic_shape) { + VLOG(3) << "You are running the TRT Dynamic Shape mode, " + "the shuffle_channel op does not support dynamic shape yet"; + return false; + } + } + + if (op_type == "skip_layernorm") { + if (!with_dynamic_shape) { + VLOG(3) << "the skip_layernorm does not support static shape yet"; + return false; + } + } + + if (op_type == "multihead_matmul") { + if (!with_dynamic_shape) { + VLOG(3) << "the multihead_matmul does not support static shape yet"; + return false; + } + } + if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu index a5fc9e73c5f27f..214e1a81e7dc04 100644 --- a/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu +++ 
b/paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.cu @@ -225,6 +225,14 @@ nvinfer1::DataType QkvToContextPluginDynamic::getOutputDataType( return input_types[0]; } +template +__global__ void apply_scale(T *data, T scale, int n) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + int tid = blockIdx.x * blockDim.x + threadIdx.x; + data[tid] = data[tid] * scale; +#endif +} + int QkvToContextPluginDynamic::enqueue( const nvinfer1::PluginTensorDesc *input_desc, const nvinfer1::PluginTensorDesc *output_desc, const void *const *inputs, @@ -291,10 +299,17 @@ int QkvToContextPluginDynamic::enqueue( platform::DeviceContextPool::Instance().Get( platform::CUDAPlace(device_id))); + int n_q = seq_len * head_number_ * head_size_; + constexpr int threads = 128; + int blocks = (n_q + threads - 1) / threads; + + apply_scale<<>>(tptr, static_cast(scale_), + n_q); + const platform::CUDADeviceContext &dev_ctx = *device_ctx; operators::math::MultiHeadGPUComputeFunctor multihead_compute_func; multihead_compute_func(dev_ctx, batch, seq_len, head_number_, head_size_, - qkptr, input1_data, tptr, half(scale_), half(0.0)); + qkptr, input1_data, tptr, half(1.), half(0.0)); int grid = batch * head_number_ * seq_len; int block = head_size_; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 75628adbe8a859..f74cd671d6dca0 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -522,10 +522,10 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) - inference_analysis_test(test_analyzer_capi_gpu SRCS analyzer_capi_gpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS 
--infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) - inference_analysis_test(test_analyzer_capi_xpu SRCS analyzer_capi_xpu_tester.cc + inference_analysis_test(test_analyzer_capi_exp_xpu SRCS analyzer_capi_exp_xpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -604,14 +604,23 @@ inference_analysis_test(lite_resnet50_test SRCS lite_resnet50_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) -inference_analysis_test(test_analyzer_capi SRCS analyzer_capi_tester.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c - ARGS --infer_model=${RESNET50_MODEL_DIR}/model) +inference_analysis_test(test_analyzer_capi_exp SRCS analyzer_capi_exp_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${RESNET50_MODEL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_config SRCS analyzer_capi_exp_pd_config_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) + +inference_analysis_test(test_analyzer_capi_exp_pd_tensor SRCS analyzer_capi_exp_pd_tensor_tester.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c + ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) -inference_analysis_test(test_analyzer_capi_pd_tensor SRCS analyzer_capi_pd_tensor_tester.cc +if (NOT APPLE AND NOT WIN32) + inference_analysis_test(test_analyzer_capi_exp_pd_threads SRCS analyzer_capi_exp_pd_threads_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${MOBILENET_INSTALL_DIR}/model) - +endif() inference_analysis_test(test_analyzer_zerocopytensor_tensor SRCS analyzer_zerocopy_tensor_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${OCR_INSTALL_DIR}/model) @@ -621,17 +630,17 @@ inference_analysis_test(test_analyzer_paddletensor_tensor SRCS analyzer_paddle_t ARGS --infer_model=${OCR_INSTALL_DIR}/model 
--infer_data=${OCR_INSTALL_DIR}/data.txt --refer_result=${OCR_INSTALL_DIR}/result.txt) if(WITH_MKLDNN) - inference_analysis_test(test_analyzer_capi_int SRCS analyzer_capi_int_tester.cc + inference_analysis_test(test_analyzer_capi_exp_int SRCS analyzer_capi_exp_int_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${INT8_DATA_DIR}/resnet50/model) - endif() +endif() -inference_analysis_test(test_analyzer_capi_ner SRCS analyzer_capi_ner_tester.cc +inference_analysis_test(test_analyzer_capi_exp_ner SRCS analyzer_capi_exp_ner_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${CHINESE_NER_INSTALL_DIR}/model) if(WITH_GPU) - inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc + inference_analysis_test(paddle_infer_api_test SRCS paddle_infer_api_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc new file mode 100644 index 00000000000000..de9e2afd705f93 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_gpu_tester.cc @@ -0,0 +1,160 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, gpu_interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + const char* ops_name = "conv_2d"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + int gpu_device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(gpu_device_id, 0); + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + + const char* tensor_name = "image"; + size_t shapes_num[1] = {4}; + int32_t min_shape[4] = {1, 3, 36, 36}; + int32_t max_shape[4] = {1, 3, 224, 224}; + int32_t opt_shape[4] = {1, 3, 224, 224}; + int32_t* min_shape_ptr = min_shape; + int32_t* max_shape_ptr = max_shape; + int32_t* opt_shape_ptr = opt_shape; + PD_ConfigSetTrtDynamicShapeInfo(config, 1, &tensor_name, shapes_num, + &min_shape_ptr, &max_shape_ptr, + &opt_shape_ptr, FALSE); + PD_ConfigDisableTensorRtOPs(config, 1, &ops_name); + PD_ConfigEnableTensorRtOSS(config); + bool oss_enabled = PD_ConfigTensorRtOssEnabled(config); + EXPECT_TRUE(oss_enabled); + + 
PD_ConfigEnableTensorRtDla(config, 4); + bool dla_enabled = PD_ConfigTensorRtDlaEnabled(config); + EXPECT_TRUE(dla_enabled); + + PD_ConfigEnableGpuMultiStream(config); + bool thread_local_thread = PD_ConfigThreadLocalStreamEnabled(config); + EXPECT_TRUE(thread_local_thread); + + PD_ConfigDisableGpu(config); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int num_thread = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(num_thread, 10); + + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char* model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + + PD_ConfigEnableUseGpu(config, 100, 0); + bool use_gpu = PD_ConfigUseGpu(config); + EXPECT_TRUE(use_gpu); + int device_id = PD_ConfigGpuDeviceId(config); + EXPECT_EQ(device_id, 0); + int init_size = PD_ConfigMemoryPoolInitSizeMb(config); + EXPECT_EQ(init_size, 100); + + float frac = PD_ConfigFractionOfGpuMemoryForPool(config); + LOG(INFO) << frac; + + PD_ConfigEnableCudnn(config); + bool cudnn = PD_ConfigCudnnEnabled(config); + EXPECT_TRUE(cudnn); + + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_FLOAT32, + FALSE, FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = 
PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_int8) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_INT8, FALSE, + TRUE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_ConfigDestroy(config); +} + +TEST(PD_Config, trt_fp16) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigEnableUseGpu(config, 100, 0); + PD_ConfigEnableTensorRtEngine(config, 1 << 20, 1, 3, PD_PRECISION_HALF, FALSE, + FALSE); + bool trt_enable = PD_ConfigTensorRtEngineEnabled(config); + EXPECT_TRUE(trt_enable); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc new file mode 100644 index 00000000000000..d3a15cb285772d --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_int_tester.cc @@ -0,0 +1,89 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + LOG(INFO) << "The inputs' size is: " << input_names->size; + EXPECT_EQ(input_names->size, 2u); + + int32_t shape_0[4] = {1, 3, 224, 224}; + float data_0[1 * 3 * 224 * 224] = {0}; + PD_Tensor* input_0 = PD_PredictorGetInputHandle(predictor, "image"); + PD_TensorReshape(input_0, 4, shape_0); + PD_TensorCopyFromCpuFloat(input_0, data_0); + int32_t shape_1[2] = {1, 1}; + int64_t data_1[1] = {0}; + PD_Tensor* input_1 = PD_PredictorGetInputHandle(predictor, "label"); + PD_TensorReshape(input_1, 2, shape_1); + PD_TensorCopyFromCpuInt64(input_1, data_1); + + LOG(INFO) << "Run Inference in CAPI encapsulation. 
"; + EXPECT_TRUE(PD_PredictorRun(predictor)); + + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + LOG(INFO) << "output size is: " << output_names->size; + for (size_t index = 0; index < output_names->size; ++index) { + LOG(INFO) << "output[" << index + << "]'s name is: " << output_names->data[index]; + PD_Tensor* output = + PD_PredictorGetOutputHandle(predictor, output_names->data[index]); + PD_OneDimArrayInt32* shape = PD_TensorGetShape(output); + LOG(INFO) << "output[" << index << "]'s shape_size is: " << shape->size; + int32_t out_size = 1; + for (size_t i = 0; i < shape->size; ++i) { + LOG(INFO) << "output[" << index << "]'s shape is: " << shape->data[i]; + out_size = out_size * shape->data[i]; + } + float* out_data = new float[out_size]; + PD_TensorCopyToCpuFloat(output, out_data); + LOG(INFO) << "output[" << index << "]'s DATA is: " << out_data[0]; + delete[] out_data; + PD_OneDimArrayInt32Destroy(shape); + PD_TensorDestroy(output); + } + PD_PredictorClearIntermediateTensor(predictor); + PD_PredictorTryShrinkMemory(predictor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(input_1); + PD_TensorDestroy(input_0); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc new file mode 100644 index 00000000000000..4369cd78dfa374 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_ner_tester.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_PredictorRun, predictor_run) { + auto model_dir = FLAGS_infer_model; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/param").c_str()); + PD_ConfigDisableGpu(config); + + PD_Predictor *predictor = PD_PredictorCreate(config); + size_t input_num = PD_PredictorGetInputNum(predictor); + LOG(INFO) << "Input num: " << input_num; + size_t output_num = PD_PredictorGetOutputNum(predictor); + LOG(INFO) << "Output num: " << output_num; + + PD_OneDimArrayCstr *input_names = PD_PredictorGetInputNames(predictor); + EXPECT_EQ(input_names->size, 2u); + LOG(INFO) << "Predictor start run!"; + PD_Tensor *inputs[2]; + inputs[0] = PD_PredictorGetInputHandle(predictor, input_names->data[0]); + inputs[1] = PD_PredictorGetInputHandle(predictor, input_names->data[1]); + LOG(INFO) << "Predictor start run!"; + // inputs[0]: word, use lod memory in stack + int32_t shape_0[2] = {11, 1}; + int64_t data_0[11 * 1] = {12673, 9763, 905, 284, 45, 7474, 20, 17, 1, 4, 9}; + size_t lod_layer_0[2] = {0, 11}; + PD_OneDimArraySize layer_0; + layer_0.size = 2; + layer_0.data = lod_layer_0; + PD_OneDimArraySize *layer_0_ptr = &layer_0; + PD_TwoDimArraySize lod_0; + lod_0.size = 1; + lod_0.data = &layer_0_ptr; + PD_TensorReshape(inputs[0], 2, shape_0); + 
PD_TensorCopyFromCpuInt64(inputs[0], data_0); + PD_TensorSetLod(inputs[0], &lod_0); + + // inputs[1]: mention, use lod memory in heap + int32_t shape_1[2] = {11, 1}; + int64_t data_1[11 * 1] = {27, 0, 0, 33, 34, 33, 0, 0, 0, 1, 2}; + PD_TwoDimArraySize *lod_1_ptr = new PD_TwoDimArraySize(); + lod_1_ptr->size = 1; + lod_1_ptr->data = new PD_OneDimArraySize *[1]; + lod_1_ptr->data[0] = new PD_OneDimArraySize(); + lod_1_ptr->data[0]->size = 2; + lod_1_ptr->data[0]->data = new size_t[2]; + lod_1_ptr->data[0]->data[0] = 0; + lod_1_ptr->data[0]->data[1] = 11; + + PD_TensorReshape(inputs[1], 2, shape_1); + PD_TensorCopyFromCpuInt64(inputs[1], data_1); + PD_TensorSetLod(inputs[1], lod_1_ptr); + // retrieve the lod memory + delete[] lod_1_ptr->data[0]->data; + delete lod_1_ptr->data[0]; + delete[] lod_1_ptr->data; + delete lod_1_ptr; + lod_1_ptr = nullptr; + + LOG(INFO) << "Predictor start run!"; + bool success = PD_PredictorRun(predictor); + EXPECT_TRUE(success); + LOG(INFO) << "Predictor run success!"; + PD_OneDimArrayCstr *output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor *output = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_TwoDimArraySize *output_lod = PD_TensorGetLod(output); + + PD_TwoDimArraySizeDestroy(output_lod); + PD_TensorDestroy(output); + PD_OneDimArrayCstrDestroy(output_names); + + PD_TensorDestroy(inputs[0]); + PD_TensorDestroy(inputs[1]); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc new file mode 100644 index 00000000000000..18107704ae420f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_config_tester.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +TEST(PD_Config, interface) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + std::string prog_file = model_dir + "/__model__"; + std::string param_file = model_dir + "/__params__"; + std::string opt_cache_dir = FLAGS_infer_model + "/OptimCacheDir"; + + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModelDir(config, model_dir.c_str()); + std::string model_dir_ = PD_ConfigGetModelDir(config); + EXPECT_EQ(model_dir, model_dir_); + + PD_ConfigSetModel(config, prog_file.c_str(), param_file.c_str()); + PD_ConfigSetProgFile(config, prog_file.c_str()); + PD_ConfigSetParamsFile(config, param_file.c_str()); + PD_ConfigSetOptimCacheDir(config, opt_cache_dir.c_str()); + std::string prog_file_ = PD_ConfigGetProgFile(config); + std::string param_file_ = PD_ConfigGetParamsFile(config); + EXPECT_EQ(prog_file, prog_file_); + EXPECT_EQ(param_file, param_file_); + + PD_ConfigDisableFCPadding(config); + bool fc_padding = PD_ConfigUseFcPadding(config); + EXPECT_FALSE(fc_padding); + + PD_ConfigDisableGpu(config); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + +#ifndef PADDLE_WITH_LITE + PD_ConfigEnableLiteEngine(config, 
PD_PRECISION_FLOAT32, TRUE, 0, nullptr, 0, + nullptr); + bool lite_enabled = PD_ConfigLiteEngineEnabled(config); + EXPECT_TRUE(lite_enabled); +#endif + + PD_ConfigSwitchIrDebug(config, TRUE); +#ifdef PADDLE_WITH_MKLDNN + const char* ops_name = "conv_2d"; + PD_ConfigEnableMKLDNN(config); + PD_ConfigSetMkldnnOp(config, 1, &ops_name); + PD_ConfigSetMkldnnCacheCapacity(config, 100); + bool mkldnn_enabled = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enabled); + + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + int32_t cpu_threads = PD_ConfigGetCpuMathLibraryNumThreads(config); + EXPECT_EQ(cpu_threads, 10); + + PD_ConfigEnableMkldnnQuantizer(config); + bool mkldnn_qt_enabled = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(mkldnn_qt_enabled); + + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetBfloat16Op(config, 1, &ops_name); + bool mkldnn_bf16_enabled = PD_ConfigMkldnnBfloat16Enabled(config); + EXPECT_TRUE(mkldnn_bf16_enabled); +#endif + + PD_ConfigEnableMemoryOptim(config); + bool memory_enabled = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_enabled); + + PD_ConfigEnableProfile(config); + bool profile_enabled = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profile_enabled); + + PD_ConfigDisableGlogInfo(config); + bool glog_diabled = PD_ConfigGlogInfoDisabled(config); + EXPECT_TRUE(glog_diabled); + + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + + PD_ConfigPartiallyRelease(config); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc new file mode 100644 index 00000000000000..f4017fc5a7f340 --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_tensor_tester.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void PD_run() { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuFloat(tensor, input.data()); + PD_TensorDataFloat(tensor, &place, &size); + PD_TensorMutableDataFloat(tensor, place); + + PD_TwoDimArraySize lod; + lod.size = 0; + lod.data = NULL; + PD_TensorSetLod(tensor, &lod); + + PD_PredictorRun(predictor); + + std::vector out_data; + PD_OneDimArrayCstr* output_names = PD_PredictorGetOutputNames(predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + int32_t out_num = std::accumulate(output_shape->data, + 
output_shape->data + output_shape->size, 1, + std::multiplies()); + out_data.resize(out_num); + PD_TensorCopyToCpuFloat(output_tensor, out_data.data()); + LOG(INFO) << "Output tensor name is: " << PD_TensorGetName(output_tensor); + PD_DataType data_type = PD_TensorGetDataType(output_tensor); + EXPECT_EQ(data_type, PD_DATA_FLOAT32); + + PD_TwoDimArraySize* out_lod = PD_TensorGetLod(output_tensor); + + PD_TwoDimArraySizeDestroy(out_lod); + PD_OneDimArrayInt32Destroy(output_shape); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} +TEST(PD_Tensor, PD_run) { PD_run(); } + +TEST(PD_Tensor, int32) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt32(tensor, input.data()); + int32_t* data_ptr = PD_TensorDataInt32(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int32_t* mutable_data_ptr = PD_TensorMutableDataInt32(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT32); + PD_TensorCopyToCpuInt32(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, int64) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + 
"/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + std::vector input(1 * 3 * 300 * 300, 0); + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuInt64(tensor, input.data()); + int64_t* data_ptr = PD_TensorDataInt64(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + int64_t* mutable_data_ptr = PD_TensorMutableDataInt64(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_INT64); + PD_TensorCopyToCpuInt64(tensor, input.data()); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Tensor, uint8) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + "/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(predictor, input_names->data[0]); + int32_t shapes[4] = {1, 3, 300, 300}; + uint8_t input[1 * 3 * 300 * 300] = {0}; + int32_t size; + PD_PlaceType place; + PD_TensorReshape(tensor, 4, shapes); + PD_TensorCopyFromCpuUint8(tensor, input); + uint8_t* data_ptr = PD_TensorDataUint8(tensor, &place, &size); + EXPECT_EQ(place, PD_PLACE_CPU); + EXPECT_EQ(size, 1 * 3 * 300 * 300); + uint8_t* mutable_data_ptr = PD_TensorMutableDataUint8(tensor, place); + EXPECT_EQ(data_ptr, mutable_data_ptr); + + PD_DataType data_type = PD_TensorGetDataType(tensor); + EXPECT_EQ(data_type, PD_DATA_UINT8); + 
PD_TensorCopyToCpuUint8(tensor, input); + + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + PD_PredictorDestroy(predictor); +} + +std::string read_file(std::string filename) { + std::ifstream file(filename); + return std::string((std::istreambuf_iterator(file)), + std::istreambuf_iterator()); +} + +TEST(PD_Tensor, from_buffer) { + PD_Config* config = PD_ConfigCreate(); + std::string prog_file = FLAGS_infer_model + "/__model__"; + std::string params_file = FLAGS_infer_model + "/__params__"; + + std::string prog_str = read_file(prog_file); + std::string params_str = read_file(params_file); + + PD_ConfigSetModelBuffer(config, prog_str.c_str(), prog_str.size(), + params_str.c_str(), params_str.size()); + + bool model_from_memory = PD_ConfigModelFromMemory(config); + EXPECT_TRUE(model_from_memory); + PD_ConfigDestroy(config); +} + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc new file mode 100644 index 00000000000000..8951c446b1f83a --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_pd_threads_tester.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +typedef struct RunParameter { + PD_Predictor* predictor; + int32_t* shapes; + size_t shape_size; + float* input_data; + int32_t out_size; + float* out_data; + int32_t thread_index; +} RunParameter; + +void* run(void* thread_param) { + struct RunParameter* param = (struct RunParameter*)thread_param; + LOG(INFO) << "Thread " << param->thread_index << " start run!"; + PD_OneDimArrayCstr* input_names = PD_PredictorGetInputNames(param->predictor); + PD_Tensor* tensor = + PD_PredictorGetInputHandle(param->predictor, input_names->data[0]); + PD_TensorReshape(tensor, param->shape_size, param->shapes); + PD_TensorCopyFromCpuFloat(tensor, param->input_data); + PD_PredictorRun(param->predictor); + PD_OneDimArrayCstr* output_names = + PD_PredictorGetOutputNames(param->predictor); + PD_Tensor* output_tensor = + PD_PredictorGetOutputHandle(param->predictor, output_names->data[0]); + PD_OneDimArrayInt32* output_shape = PD_TensorGetShape(output_tensor); + param->out_size = 1; + for (size_t index = 0; index < output_shape->size; ++index) { + param->out_size = param->out_size * output_shape->data[index]; + } + PD_OneDimArrayInt32Destroy(output_shape); + param->out_data = + reinterpret_cast(malloc(param->out_size * sizeof(float))); + PD_TensorCopyToCpuFloat(output_tensor, param->out_data); + PD_TensorDestroy(output_tensor); + PD_OneDimArrayCstrDestroy(output_names); + PD_TensorDestroy(tensor); + PD_OneDimArrayCstrDestroy(input_names); + LOG(INFO) << "Thread " << param->thread_index << " end run!"; + return NULL; +} +void threads_run(int thread_num) { + auto model_dir = FLAGS_infer_model; + PD_Config* config = PD_ConfigCreate(); + PD_ConfigSetModel(config, (model_dir + "/__model__").c_str(), + (model_dir + 
"/__params__").c_str()); + PD_Predictor* predictor = PD_PredictorCreate(config); + + pthread_t* threads = + reinterpret_cast(malloc(thread_num * sizeof(pthread_t))); + RunParameter* params = reinterpret_cast( + malloc(thread_num * sizeof(RunParameter))); + int32_t shapes[4] = {1, 3, 300, 300}; + float* input = + reinterpret_cast(malloc(1 * 3 * 300 * 300 * sizeof(float))); + memset(input, 0, 1 * 3 * 300 * 300 * sizeof(float)); + for (int i = 0; i < thread_num; ++i) { + params[i].predictor = PD_PredictorClone(predictor); + params[i].shapes = shapes; + params[i].shape_size = 4; + params[i].input_data = input; + params[i].out_size = 0; + params[i].out_data = NULL; + params[i].thread_index = i; + pthread_create(&(threads[i]), NULL, run, (params + i)); + } + for (int i = 0; i < thread_num; ++i) { + pthread_join(threads[i], NULL); + } + ASSERT_GT(params[0].out_size, 0); + + for (int i = 1; i < thread_num; ++i) { + ASSERT_EQ(params[i].out_size, params[0].out_size); + for (int j = 0; j < params[i].out_size; ++j) { + ASSERT_EQ(params[i].out_data[j], params[0].out_data[j]); + } + } + for (int i = 0; i < thread_num; ++i) { + PD_PredictorDestroy(params[i].predictor); + free(params[i].out_data); + } + free(input); + free(params); + free(threads); + PD_PredictorDestroy(predictor); +} + +TEST(PD_Predictor, PD_multi_threads_run) { threads_run(10); } + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc new file mode 100644 index 00000000000000..11de1a5a6fab4f --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_tester.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include + +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +void predictor_run() { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + + PD_Predictor *predictor = PD_PredictorCreate(config); + PD_Tensor *tensor = PD_PredictorGetInputHandle(predictor, "data"); + + const int batch_size = 1; + const int channels = 3; + const int height = 318; + const int width = 318; + float *input = new float[batch_size * channels * height * width](); + + int32_t shape[4] = {batch_size, channels, height, width}; + PD_TensorReshape(tensor, 4, shape); + PD_TensorCopyFromCpuFloat(tensor, input); + EXPECT_TRUE(PD_PredictorRun(predictor)); + + delete[] input; + PD_TensorDestroy(tensor); + PD_PredictorDestroy(predictor); +} + +TEST(PD_PredictorRun, predictor_run) { predictor_run(); } + +#ifdef PADDLE_WITH_MKLDNN +TEST(PD_Config, profile_mkldnn) { + std::string model_dir = FLAGS_infer_model; + std::string prog_file = model_dir + "/model"; + std::string params_file = model_dir + "/params"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigDisableGpu(config); + 
PD_ConfigSetCpuMathLibraryNumThreads(config, 10); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigEnableMKLDNN(config); + bool mkldnn_enable = PD_ConfigMkldnnEnabled(config); + EXPECT_TRUE(mkldnn_enable); + PD_ConfigEnableMkldnnQuantizer(config); + bool quantizer_enable = PD_ConfigMkldnnQuantizerEnabled(config); + EXPECT_TRUE(quantizer_enable); + PD_ConfigEnableMkldnnBfloat16(config); + PD_ConfigSetMkldnnCacheCapacity(config, 0); + PD_ConfigSetModel(config, prog_file.c_str(), params_file.c_str()); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc new file mode 100644 index 00000000000000..f4fd04e85840de --- /dev/null +++ b/paddle/fluid/inference/tests/api/analyzer_capi_exp_xpu_tester.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include +#include "paddle/fluid/inference/capi_exp/pd_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { +namespace analysis { + +#ifdef PADDLE_WITH_XPU +TEST(PD_Config, use_xpu) { + std::string model_dir = FLAGS_infer_model + "/mobilenet"; + PD_Config *config = PD_ConfigCreate(); + PD_ConfigSwitchIrDebug(config, TRUE); + PD_ConfigSetModelDir(config, model_dir.c_str()); + PD_ConfigSetOptimCacheDir(config, + (FLAGS_infer_model + "/OptimCacheDir").c_str()); + const char *model_dir_ = PD_ConfigGetModelDir(config); + LOG(INFO) << model_dir_; + PD_ConfigEnableXpu(config, 0xfffc00); + bool use_xpu = PD_ConfigUseXpu(config); + EXPECT_TRUE(use_xpu); + int32_t device_id = PD_ConfigXpuDeviceId(config); + EXPECT_EQ(device_id, 0); + PD_ConfigSwitchIrOptim(config, TRUE); + bool ir_optim = PD_ConfigIrOptim(config); + EXPECT_TRUE(ir_optim); + PD_ConfigEnableMemoryOptim(config); + bool memory_optim_enable = PD_ConfigMemoryOptimEnabled(config); + EXPECT_TRUE(memory_optim_enable); + PD_ConfigEnableProfile(config); + bool profiler_enable = PD_ConfigProfileEnabled(config); + EXPECT_TRUE(profiler_enable); + PD_ConfigSetInvalid(config); + bool is_valid = PD_ConfigIsValid(config); + EXPECT_FALSE(is_valid); + PD_ConfigDestroy(config); +} +#endif + +} // namespace analysis +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_allocator.cc b/paddle/fluid/memory/allocation/cuda_allocator.cc index c1b12f5c0ecbb6..b1a45afa99d9a5 100644 --- a/paddle/fluid/memory/allocation/cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/cuda_allocator.cc @@ -54,6 +54,7 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, place_.device); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ 
-68,13 +69,14 @@ Allocation* CUDAAllocator::AllocateImpl(size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. If yes, please stop them, or start PaddlePaddle on another GPU.\n" "2. If no, please decrease the batch size of your model. %s\n\n", place_.device, string::HumanReadableSize(size), place_.device, - string::HumanReadableSize(avail), place_.device, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + place_.device, err_msg)); } } // namespace allocation diff --git a/paddle/fluid/memory/detail/system_allocator.cc b/paddle/fluid/memory/detail/system_allocator.cc index c733ba5c68c9bd..0d7065d8bfba0e 100644 --- a/paddle/fluid/memory/detail/system_allocator.cc +++ b/paddle/fluid/memory/detail/system_allocator.cc @@ -125,6 +125,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { size_t avail, total, actual_avail, actual_total; bool is_limited = platform::RecordedCudaMemGetInfo( &avail, &total, &actual_avail, &actual_total, gpu_id_); + size_t allocated = total - avail; std::string err_msg; if (is_limited) { @@ -139,7 +140,7 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted( "\n\nOut of memory error on GPU %d. " - "Cannot allocate %s memory on GPU %d, " + "Cannot allocate %s memory on GPU %d, %s memory has been allocated and " "available memory is only %s.\n\n" "Please check whether there is any other process using GPU %d.\n" "1. 
If yes, please stop them, or start PaddlePaddle on another GPU.\n" @@ -150,8 +151,8 @@ void* GPUAllocator::Alloc(size_t* index, size_t size) { " The command is " "`export FLAGS_fraction_of_gpu_memory_to_use=xxx`.%s\n\n", gpu_id_, string::HumanReadableSize(size), gpu_id_, - string::HumanReadableSize(avail), gpu_id_, - FLAGS_fraction_of_gpu_memory_to_use, err_msg)); + string::HumanReadableSize(allocated), string::HumanReadableSize(avail), + gpu_id_, FLAGS_fraction_of_gpu_memory_to_use, err_msg)); } } diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 1eb0535831bb19..730d49e8acd930 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -207,12 +207,6 @@ void Copy(platform::NPUPlace dst_place, platform::SetNPUDeviceId(dst_place.device); - // NOTE(ascendrc): NPU memcpy async from host to device is a "real" async, - // which is different from CUDA. In Paddle, when async is called, "sync" - // is run actually, which means Paddle doesn't fully supported async. - // TODO(ascendrc): Support NPU memcpy async for better performance. - stream = nullptr; - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; @@ -220,6 +214,12 @@ void Copy(platform::NPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyAsync:CPU->NPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE, stream); } else { + // On NPU, async operation after sync operation is ok, while sync operation + // after async is not ok, since the async operation may not done. + // So, its needed to do wait before sync operation. 
+ platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpySync:CPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_HOST_TO_DEVICE); } @@ -235,12 +235,6 @@ void Copy(platform::CPUPlace dst_place, platform::SetNPUDeviceId(src_place.device); - // NOTE(ascendrc): NPU memcpy async from device to host is a "real" async, - // which is different from CUDA. In Paddle, when async is called, "sync" - // is run actually, which means Paddle doesn't fully supported async. - // TODO(ascendrc): Support NPU memcpy async for better performance. - stream = nullptr; - VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " << dst_place << " by thream(" << stream << ")"; @@ -248,6 +242,9 @@ void Copy(platform::CPUPlace dst_place, platform::RecordEvent record_event("NpuMemcpyAsync:NPU->CPU"); platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST, stream); } else { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + static_cast(pool.Get(src_place))->Wait(); + platform::RecordEvent record_event("GpuMemcpySync:NPU->CPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_HOST); } @@ -270,6 +267,10 @@ void Copy(platform::NPUPlace dst_place, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent record_event("NpuMemcpySync(same_npu):NPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } @@ -284,6 +285,10 @@ void Copy(platform::NPUPlace dst_place, platform::NPUMemcpyAsync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE, stream); } else { + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + static_cast(pool.Get(dst_place))->Wait(); + platform::RecordEvent 
record_event("NpuMemcpyPeerSync:NPU->NPU"); platform::NPUMemcpySync(dst, src, num, ACL_MEMCPY_DEVICE_TO_DEVICE); } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index cecc70cc6dda8e..6e11c64afc4bd8 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -42,6 +42,10 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() +if (WITH_DLNNE) + add_subdirectory(dlnne) +endif() + if (WITH_LITE) add_subdirectory(lite) endif() @@ -167,7 +171,7 @@ endif() if (WITH_ASCEND_CL) cc_test(range_op_npu_test SRCS range_op_npu_test.cc DEPS op_registry range_op scope device_context enforce executor) - cc_test(lookup_table_v2_op_npu_test SRCS lookup_table_v2_op_npu_test.cc DEPS op_registry lookup_table_v2_op scope device_context enforce executor compare_op) + cc_test(expand_op_npu_test SRCS expand_op_npu_test.cc DEPS op_registry expand_op eigen_cc_function scope device_context enforce executor compare_op) endif() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") @@ -195,3 +199,7 @@ endif() if(WITH_ASCEND_CL) cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor) endif() + +if (WITH_GPU OR WITH_ASCEND_CL) +cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry copy_cross_scope_op scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 1cac9ed9f1dd08..055909ba6f486f 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -162,6 +162,12 @@ Sigmoid Activation Operator )DOC"; +UNUSED constexpr char SiluDoc[] = R"DOC( +Silu Activation Operator + +$$out = x * \\frac{1}{1 + e^{-x}}$$ +)DOC"; + UNUSED constexpr char LogSigmoidDoc[] = R"DOC( Logsigmoid Activation Operator @@ -697,6 +703,7 @@ It is recommended to use the defaults for this activation. 
}; REGISTER_ACTIVATION_OP_MAKER(Sigmoid, SigmoidDoc); +REGISTER_ACTIVATION_OP_MAKER(Silu, SiluDoc); REGISTER_ACTIVATION_OP_MAKER(LogSigmoid, LogSigmoidDoc); REGISTER_ACTIVATION_OP_MAKER(Exp, ExpDoc); REGISTER_ACTIVATION_OP_MAKER(Relu, ReluDoc); diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 781a97c1ffcc17..836c5fa06f6dfe 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -10,382 +10,719 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/platform/cuda_device_function.h" -#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using float16 = paddle::platform::float16; +template +struct CudaReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // relu(x) = max(x, 0) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? args[0] : zero; + } +}; template -struct CudaVecType { - using type = T; - static constexpr int vecsize = 1; +struct CudaReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + + // dx = dout * (out > 0) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? 
args[0] : zero; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -struct CudaVecType { - using type = __half2; - static constexpr int vecsize = 2; +template +struct CudaLeakyReluFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // leakyrelu(x) = x > 0 ? x : alpha * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] > zero ? args[0] : static_cast(alpha) * args[0]; + } }; -template <> -struct CudaVecType { - using type = float4; - static constexpr int vecsize = 4; +template +struct CudaLeakyReluGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout * (x > 0 ? 1 : alpha) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[1] > zero ? 
args[0] : static_cast(alpha) * args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; template -class BaseGPUFunctor { - public: - using ELEMENT_TYPE = T; +struct CudaSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // sigmoid(x) = 1 / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(one / (one + exp(-x))); + } +}; - using AttrPair = std::vector>; +template +struct CudaSigmoidGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * out * (1 - out) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1] * (one - args[1]); + } - AttrPair GetAttrs() { return AttrPair(); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -/* ========================================================================== */ +template +struct CudaSiluFunctor : public BaseActivationFunctor { + // MPType means Compute Type + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // silu(x) = x / (1 + exp(-x)) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(x / (one + exp(-x))); + } +}; -/* =========================== relu forward ============================ */ template -class ReluGPUFunctor : public BaseGPUFunctor { - private: - T zero_; +struct CudaSiluGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout * (1 + exp(-x) + x * exp(-x) / (1 + exp(-x))^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) 
const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp = one / (one + exp(-x)); + return static_cast(dout * (temp * (one + x * (one - temp)))); + } - public: - ReluGPUFunctor() { zero_ = static_cast(0.0f); } - - // for relu forward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in) { - // relu forward : out = max(x, 0) - return in > zero_ ? in : zero_; - } - - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T in) { - // relu forward : out = max(x, 0) - return in > zero_ ? in : zero_; - } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type in) { - // relu forward : out = max(in, 0) - return make_float4((in.x > zero_) * (in.x), (in.y > zero_) * (in.y), - (in.z > zero_) * (in.z), (in.w > zero_) * (in.w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGPUFunctor::Compute(const CudaVecType::type in) { -// relu forward : out = max(in, 0) -#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(in, kzero), in); -#else - const float2 xx = __half22float2(in); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(xx.x), - (xx.y > 0.0f) * static_cast(xx.y)); -#endif -} -/* ========================================================================== */ + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; -/* =========================== relu backward ============================ - */ +template +struct CudaLogSigmoidFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // logsigmoid(x) = log(1 / (1 + exp(-x))) + // For numerical stability, + // logsigmoid(x) = + // - (max(-x, 0) + log(exp(-max(-x, 0)) + exp(-x - max(-x, 0)))) + // Inputs: args[0], the input x + 
__device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + MPType temp = x > zero ? zero : -x; + return static_cast(-temp - log(exp(-temp) + exp(-x - temp))); + } +}; template -class ReluGradGPUFunctor : public BaseGPUFunctor { - private: - T zero_; +struct CudaLogSigmoidGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + + // dx = dout * exp(-x) / (1 + exp(-x)) + // For numerical stability: + // dx = dout * exp(-x - max(-x, 0)) / (exp(-max(-x, 0)) + exp(-x - max(-x, + // 0))) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + MPType temp1 = x > zero ? zero : -x; + MPType temp2 = exp(-x - temp1); + return static_cast(dout * (temp2 / (exp(-temp1) + temp2))); + } - public: - ReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAtanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // atan(x) = atan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(atan(x)); + } +}; + +template +struct CudaAtanGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout / (1 + x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / (one + args[1] * args[1]); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSoftShrinkFunctor : public BaseActivationFunctor { + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // softshrink(x) = x - lambda, if 
x > lambda; + // x + lambda, if x < -lambda; + // 0, otherwise. + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[0]; + T l = static_cast(lambda); + T temp1 = static_cast(x > l); + T temp2 = static_cast(x < -l); + return temp1 * (x - l) + temp2 * (x + l); + } +}; + +template +struct CudaSoftShrinkGradFunctor : public BaseActivationFunctor { + T zero = static_cast(0.0f); + float lambda; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"lambda", &lambda}}; + } + + // dx = dout, if x > lambda or x < -lambda else 0 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + T x = args[1]; + T l = static_cast(lambda); + return (x >= -l && x <= l) ? zero : args[0]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaCeilFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // ceil(x) = ceil(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(ceil(x)); + } +}; + +template +struct CudaFloorFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // floor(x) = floor(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(floor(x)); + } +}; + +template +struct CudaRoundFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // round(x) = round(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(round(x)); + } +}; + +// grad functor for ceil, floor and round +template +struct CudaZeroGradFunctor : public BaseActivationFunctor { + __device__ 
__forceinline__ T operator()(const T* args) const { + return static_cast(0.0f); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kNoDeps; } +}; + +template +struct CudaCosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cos(x) = cos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cos(x)); + } +}; + +template +struct CudaCosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * (-sin(x)) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout * sin(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sin(x) = sin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sin(x)); + } +}; + +template +struct CudaSinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cos(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cos(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaTanFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tan(x) = tan(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + 
return static_cast(tan(x)); + } +}; + +template +struct CudaTanGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout / cos(x)^2 + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout / (cos(x) * cos(x))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAsinFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // asin(x) = asin(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(asin(x)); + } +}; + +template +struct CudaAsinGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaAcosFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // acos(x) = acos(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(acos(x)); + } +}; + +template +struct CudaAcosGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType one = static_cast(1.0f); + + // dx = -dout / sqrt(1 - x^2) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + 
MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(-dout / sqrt(one - x * x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; - // for relu backward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type out, - const typename CudaVecType::type dout) { - return out > zero_ ? dout : zero_; +template +struct CudaCoshFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // cosh(x) = cosh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(cosh(x)); + } +}; + +template +struct CudaCoshGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * sinh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * sinh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSinhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sinh(x) = sinh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sinh(x)); + } +}; + +template +struct CudaSinhGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // dx = dout * cosh(x) + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType dout = static_cast(args[0]); + MPType x = static_cast(args[1]); + return static_cast(dout * cosh(x)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + 
+template +struct CudaTanhFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // tanh(x) = tanh(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(tanh(x)); } +}; - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T out, const T dout) { - // relu backward : dx = out > 0 ? dout : 0 - return out > zero_ ? dout : zero_; +template +struct CudaTanhGradFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // dx = dout * (1 - out^2) + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T dout = static_cast(args[0]); + T out = static_cast(args[1]); + return dout * (one - out * out); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type out, - const CudaVecType::type dout) { - // relu backward : dx = out > 0 ? dout : 0; - return make_float4((out.x > zero_) * (dout.x), (out.y > zero_) * (dout.y), - (out.z > zero_) * (dout.z), (out.w > zero_) * (dout.w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type -ReluGradGPUFunctor::Compute(const CudaVecType::type out, - const CudaVecType::type dout) { -// relu backward : dx = out > 0 ? 
dout : 0; -#ifdef __HIPCC__ || CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - const half2 kzero = __float2half2_rn(0.0f); - return __hmul2(__hgt2(out, kzero), dout); -#else - const float2 xx = __half22float2(out); - const float2 yy = __half22float2(dout); - return __floats2half2_rn((xx.x > 0.0f) * static_cast(yy.x), - (xx.y > 0.0f) * static_cast(yy.y)); -#endif -} +template +struct CudaReciprocalFunctor : public BaseActivationFunctor { + T one = static_cast(1.0f); + + // reciprocal(x) = 1 / x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return one / args[0]; + } +}; -/* ========================================================================== */ -/* ======================== leaky relu forward ======================== - */ template -class LeakyReluGPUFunctor : public BaseGPUFunctor { - private: - T zero_; - float alpha_; +struct CudaReciprocalGradFunctor : public BaseActivationFunctor { + // dx = -dout * out^2 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return -args[0] * args[1] * args[1]; + } - public: - LeakyReluGPUFunctor() { zero_ = static_cast(0.0f); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha_}}; - } - // leakyrelu forward : out = x > 0 ? x : x * alpha - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in) { - return in > zero_ ? in : static_cast(alpha_) * in; - } - - __device__ __forceinline__ T ComputeRemainder(const T in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - return in > zero_ ? in : static_cast(alpha_) * in; - } -}; - -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - return make_float4((in.x > zero_) ? 
(in.x) : (in.x) * alpha_, - (in.y > zero_) ? (in.y) : (in.y) * alpha_, - (in.z > zero_) ? (in.z) : (in.z) * alpha_, - (in.w > zero_) ? (in.w) : (in.w) * alpha_); -} - -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGPUFunctor::Compute(const CudaVecType::type in) { - // leakyrelu forward : out = x > 0 ? x : x * alpha - const float2 xx = __half22float2(in); - return __floats2half2_rn((xx.x > 0.0f) ? xx.x : xx.x * alpha_, - (xx.y > 0.0f) ? xx.y : xx.y * alpha_); -} -/* ========================================================================== */ +template +struct CudaExpFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // exp(x) = exp(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(exp(x)); + } +}; -/* =========================== leaky relu backward ======================= - */ template -class LeakyReluGradGPUFunctor : public BaseGPUFunctor { - private: - T zero_; - float alpha_; +struct CudaExpGradFunctor : public BaseActivationFunctor { + // dx = dout * out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[1]; + } - public: - LeakyReluGradGPUFunctor() { zero_ = static_cast(0.0f); } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; - typename BaseActivationFunctor::AttrPair GetAttrs() { - return {{"alpha", &alpha_}}; +template +struct CudaLogFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // log(x) = log(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(log(x)); + } +}; + +template +struct CudaLogGradFunctor : public BaseActivationFunctor { + // dx = dout / x + // Inputs: args[0], the input dout + // args[1], the 
input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] / args[1]; } - // for leaky relu backward when T is double - __device__ __forceinline__ typename CudaVecType::type Compute( - const typename CudaVecType::type in, - const typename CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return in > zero_ ? dout : static_cast(alpha_) * dout; + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + +template +struct CudaSquareFunctor : public BaseActivationFunctor { + // square(x) = x * x + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * args[0]; } +}; - // when num % vecsize != 0 this func will be used - __device__ __forceinline__ T ComputeRemainder(const T in, const T dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return in > zero_ ? dout : static_cast(alpha_) * dout; +template +struct CudaSquareGradFunctor : public BaseActivationFunctor { + T two = static_cast(2.0f); + + // dx = dout * 2 * x + // Inputs: args[0], the input dout + // args[1], the input x + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] * two * args[1]; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; -template <> -__device__ __forceinline__ CudaVecType::type -LeakyReluGradGPUFunctor::Compute(const CudaVecType::type in, - const CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? dout : alpha * dout - return make_float4((in.x > zero_) ? (dout.x) : alpha_ * (dout.x), - (in.y > zero_) ? (dout.y) : alpha_ * (dout.y), - (in.z > zero_) ? (dout.z) : alpha_ * (dout.z), - (in.w > zero_) ? (dout.w) : alpha_ * (dout.w)); -} - -template <> -__device__ __forceinline__ CudaVecType::type LeakyReluGradGPUFunctor< - float16>::Compute(const CudaVecType::type in, - const CudaVecType::type dout) { - // leakyrelu backward : dx = x > 0 ? 
dout : alpha * dout - const float2 xx = __half22float2(in); - const float2 yy = __half22float2(dout); - return __floats2half2_rn((xx.x > 0.0f) ? yy.x : alpha_ * yy.x, - (xx.y > 0.0f) ? yy.y : alpha_ * yy.y); -} +template +struct CudaSqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // sqrt(x) = sqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(sqrt(x)); + } +}; -/* ========================================================================== */ +template +struct CudaSqrtGradFunctor : public BaseActivationFunctor { + T one_half = static_cast(0.5f); + + // dx = dout * 0.5 / out + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + return one_half * args[0] / args[1]; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; -template -__global__ void ActivationGradKernelVec(const T* forward_data, const T* dout, - T* dx, int num, Functor functor) { - using VecType = typename CudaVecType::type; - constexpr int vecsize = CudaVecType::vecsize; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in_forward = reinterpret_cast(forward_data); - const VecType* in_dout = reinterpret_cast(dout); - VecType* out = reinterpret_cast(dx); - VecType forward_vec, dout_vec; - T in_data, dout_data; - for (int i = idx; i < loop; i += stride) { -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - forward_vec = __ldg(in_forward + i); - dout_vec = __ldg(in_dout + i); -#else - forward_vec = in_forward[i]; - dout_vec = in_dout[i]; -#endif - out[i] = functor.Compute(forward_vec, dout_vec); - } - - while (idx == loop && tail) { - in_data = forward_data[num - tail]; - dout_data = dout[num - tail]; - dx[num - tail] = 
functor.ComputeRemainder(in_data, dout_data); - --tail; - } -} - -template -__global__ void ActivationkernelVec(const T* src, T* dst, int num, - Functor functor) { - constexpr int vecsize = CudaVecType::vecsize; - using VecType = typename CudaVecType::type; - int idx = threadIdx.x + blockIdx.x * blockDim.x; - int stride = blockDim.x * gridDim.x; - int loop = num / vecsize; - int tail = num % vecsize; - const VecType* in = reinterpret_cast(src); - VecType* out = reinterpret_cast(dst); - VecType x_vec; - for (int i = idx; i < loop; i += stride) { -#ifdef __HIPCC__ || __CUDA_ARCH__ >= 350 - x_vec = __ldg(in + i); -#else - x_vec = in[i]; -#endif - out[i] = functor.Compute(x_vec); - } - - while (idx == loop && tail) { - dst[num - tail] = functor.ComputeRemainder(src[num - tail]); - --tail; - } -} +template +struct CudaRsqrtFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + + // rsqrt(x) = rsqrt(x) + // Inputs: args[0], the input x + __device__ __forceinline__ T operator()(const T* args) const { + MPType x = static_cast(args[0]); + return static_cast(rsqrt(x)); + } +}; + +template +struct CudaRsqrtGradFunctor : public BaseActivationFunctor { + T minus_one_half = static_cast(-0.5f); + + // dx = dout * -0.5 / out^3 + // Inputs: args[0], the input dout + // args[1], the input out + __device__ __forceinline__ T operator()(const T* args) const { + T out = args[1]; + return minus_one_half * args[0] * out * out * out; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; template -class ActivationGPUKernel +class ActivationCudaKernel : public framework::OpKernel { public: using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { - const framework::Tensor* in_x = nullptr; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor* x = nullptr; framework::Tensor* out = nullptr; - ExtractActivationTensor(context, 
&in_x, &out); - auto& dev_ctx = context.template device_context(); - - int num = in_x->numel(); - const T* input_data = in_x->data(); - T* output_data = out->mutable_data(dev_ctx.GetPlace(), - static_cast(num * sizeof(T))); - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - Functor functor; + ExtractActivationTensor(ctx, &x, &out); + out->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + std::vector ins = {x}; + std::vector outs = {out}; + auto functor = Functor(); auto attrs = functor.GetAttrs(); for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); + *attr.second = ctx.Attr(attr.first); } - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((num / vecsize + block - 1) / block, 1); - auto stream = context.cuda_device_context().stream(); - ActivationkernelVec<<>>( - input_data, output_data, num, functor); + LaunchElementwiseCudaKernel(dev_ctx, ins, &outs, + functor); } }; template -class ActivationGradGPUKernel +class ActivationGradCudaKernel : public framework::OpKernel { public: using T = typename Functor::ELEMENT_TYPE; - void Compute(const framework::ExecutionContext& context) const override { + void Compute(const framework::ExecutionContext& ctx) const override { const framework::Tensor *x, *out, *d_out; framework::Tensor* d_x = nullptr; x = out = d_out = nullptr; - ExtractActivationGradTensor(context, &x, &out, &d_out, + ExtractActivationGradTensor(ctx, &x, &out, &d_out, &d_x); - int numel = d_out->numel(); - auto& dev_ctx = context.template device_context(); - auto* dx_data = d_x->mutable_data( - dev_ctx.GetPlace(), static_cast(numel * sizeof(T))); - auto* dout_data = d_out->data(); + d_x->mutable_data(ctx.GetPlace()); + auto& dev_ctx = ctx.template device_context(); + auto functor = Functor(); + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + + std::vector ins = {d_out}; + std::vector outs = {d_x}; - auto* forward_data = 
dout_data; if (static_cast(Functor::FwdDeps()) == static_cast(kDepOut)) { // Only need forward output Out - forward_data = out->data(); + ins.push_back(out); + LaunchElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } else if (static_cast(Functor::FwdDeps()) == static_cast(kDepX)) { // Only need forward input X - forward_data = x->data(); + ins.push_back(x); + LaunchElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); + } else { + LaunchElementwiseCudaKernel(dev_ctx, ins, + &outs, functor); } - - int block = 512; -#ifdef __HIPCC__ - block = 256; -#endif - - Functor functor; - auto attrs = functor.GetAttrs(); - for (auto& attr : attrs) { - *attr.second = context.Attr(attr.first); - } - constexpr int vecsize = CudaVecType::vecsize; - int grid = max((numel / vecsize + block - 1) / block, 1); - auto stream = context.cuda_device_context().stream(); - ActivationGradKernelVec<<>>( - forward_data, dout_data, dx_data, numel, functor); } }; @@ -395,12 +732,13 @@ class ActivationGradGPUKernel namespace ops = paddle::operators; namespace plat = paddle::platform; -#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ - grad_functor) \ +#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ + grad_functor) \ REGISTER_OP_CUDA_KERNEL( \ - act_type, \ - ops::ActivationKernel>, \ - ops::ActivationKernel>, \ + act_type, ops::ActivationKernel>, \ + ops::ActivationKernel>, \ ops::ActivationKernel>); \ REGISTER_OP_CUDA_KERNEL( \ @@ -410,28 +748,28 @@ namespace plat = paddle::platform; ops::grad_functor>, \ ops::ActivationGradKernel>); -FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_CUDA_KERNEL); -#define REGISTER_ACTIVATION_GPU_KERNEL(act_type, op_name, functor, \ - grad_functor) \ +#define REGISTER_ACTIVATION_CUDA_KERNEL(act_type, op_name, functor, \ + grad_functor) \ REGISTER_OP_CUDA_KERNEL( \ - act_type, ops::ActivationGPUKernel>, \ - ops::ActivationGPUKernel>, \ - ops::ActivationGPUKernel>); \ + act_type, ops::ActivationCudaKernel>, \ + 
ops::ActivationCudaKernel>, \ + ops::ActivationCudaKernel>); \ REGISTER_OP_CUDA_KERNEL( \ - act_type##_grad, ops::ActivationGradGPUKernel>, \ - ops::ActivationGradGPUKernel>, \ - ops::ActivationGradGPUKernel>); + act_type##_grad, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>, \ + ops::ActivationGradCudaKernel>); /* ======================== leaky relu register ============================ */ -REGISTER_ACTIVATION_GPU_KERNEL(leaky_relu, LeakyRelu, LeakyReluGPUFunctor, - LeakyReluGradGPUFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(leaky_relu, LeakyRelu, CudaLeakyReluFunctor, + CudaLeakyReluGradFunctor); REGISTER_OP_CUDA_KERNEL( leaky_relu_grad_grad, @@ -444,7 +782,7 @@ REGISTER_OP_CUDA_KERNEL( /* ========================================================================== */ /* ======================== elu register ============================ */ -REGISTER_ACTIVATION_CUDA_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(elu, ELU, ELUFunctor, ELUGradFunctor); REGISTER_OP_CUDA_KERNEL( elu_grad_grad, ops::ELUDoubleGradKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>, - ops::ActivationKernel>); + square, ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( - square_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); + square_grad, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); REGISTER_OP_CUDA_KERNEL( square_grad_grad, @@ -564,27 +910,29 @@ REGISTER_OP_CUDA_KERNEL( /* ========================== exp register ============================ */ REGISTER_OP_CUDA_KERNEL( - exp, ops::ActivationKernel>, - ops::ActivationKernel>, + exp, 
ops::ActivationCudaKernel>, + ops::ActivationCudaKernel>, ops::ActivationKernel>, ops::ActivationKernel>, - ops::ActivationKernel>); + ops::ActivationCudaKernel>); REGISTER_OP_CUDA_KERNEL( - exp_grad, ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>, - ops::ActivationGradKernel>); + exp_grad, ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>, + ops::ActivationGradCudaKernel>); /* ========================================================================== */ /* ========================== Log register ==================================*/ -REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, LogFunctor, LogGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor); REGISTER_OP_CUDA_KERNEL( log_grad_grad, ops::LogDoubleGradKernel>); /* ========================================================================== */ + +REGISTER_ACTIVATION_CUDA_KERNEL(sigmoid, Sigmoid, CudaSigmoidFunctor, + CudaSigmoidGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(silu, Silu, CudaSiluFunctor, + CudaSiluGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(logsigmoid, LogSigmoid, CudaLogSigmoidFunctor, + CudaLogSigmoidGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(atan, Atan, CudaAtanFunctor, + CudaAtanGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(softshrink, SoftShrink, CudaSoftShrinkFunctor, + CudaSoftShrinkGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(ceil, Ceil, CudaCeilFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(floor, Floor, CudaFloorFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(cos, Cos, CudaCosFunctor, CudaCosGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(tan, Tan, CudaTanFunctor, CudaTanGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(acos, Acos, CudaAcosFunctor, + CudaAcosGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(sin, Sin, CudaSinFunctor, CudaSinGradFunctor); 
+REGISTER_ACTIVATION_CUDA_KERNEL(asin, Asin, CudaAsinFunctor, + CudaAsinGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(sinh, Sinh, CudaSinhFunctor, + CudaSinhGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(cosh, Cosh, CudaCoshFunctor, + CudaCoshGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(round, Round, CudaRoundFunctor, + CudaZeroGradFunctor); +REGISTER_ACTIVATION_CUDA_KERNEL(reciprocal, Reciprocal, CudaReciprocalFunctor, + CudaReciprocalGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(log2, Log2, Log2Functor, Log2GradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(log10, Log10, Log10Functor, Log10GradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(brelu, BRelu, BReluFunctor, BReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(soft_relu, SoftRelu, SoftReluFunctor, + SoftReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(stanh, STanh, STanhFunctor, STanhGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(softplus, Softplus, SoftplusFunctor, + SoftplusGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(softsign, Softsign, SoftsignFunctor, + SoftsignGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(relu6, Relu6, Relu6Functor, Relu6GradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(tanh_shrink, TanhShrink, TanhShrinkFunctor, + TanhShrinkGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(hard_shrink, HardShrink, HardShrinkFunctor, + HardShrinkGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(hard_sigmoid, HardSigmoid, HardSigmoidFunctor, + HardSigmoidGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(swish, Swish, SwishFunctor, SwishGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(thresholded_relu, ThresholdedRelu, + ThresholdedReluFunctor, + ThresholdedReluGradFunctor); +REGISTER_ACTIVATION_GPU_KERNEL(hard_swish, HardSwish, HardSwishFunctor, + HardSwishGradFunctor); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index fb9f956f17c0b1..ccd5bf528ba58c 100644 --- a/paddle/fluid/operators/activation_op.h 
+++ b/paddle/fluid/operators/activation_op.h @@ -258,6 +258,31 @@ struct SigmoidGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +// silu(x) = x / (1 + exp(-x)) +template +struct SiluFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + auto temp = static_cast(1) / (static_cast(1) + (-x).exp()); + out.device(d) = x * temp; + } +}; + +// silu'(x) = (1 / (1 + e^{-x})) * (1 + out * e^{-x})) +template +struct SiluGradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + auto temp1 = static_cast(1) + (-x).exp(); // 1+e^(-x) + auto temp2 = x * (-x).exp(); // x*e^(-x) + dx.device(d) = dout * ((static_cast(1) / temp1) * + (static_cast(1) + (temp2 / temp1))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // Originally: logsigmoid(x) = -log (1 + exp(-x)) // For numerical stability, we can use the log-sum-exp trick: // https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ @@ -430,7 +455,7 @@ struct HardShrinkFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - out.device(d) = x * (temp1 + temp2).template cast(); + out.device(d) = x * (temp1 || temp2).template cast(); } }; @@ -447,7 +472,7 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor { void operator()(Device d, X x, Out out, dOut dout, dX dx) const { auto temp1 = x < static_cast(threshold * -1.f); auto temp2 = x > static_cast(threshold); - dx.device(d) = dout * (temp1 + temp2).template cast(); + dx.device(d) = dout * (temp1 || temp2).template cast(); } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } @@ -2129,6 +2154,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { #define FOR_EACH_ACTIVATION_OP(__macro) \ __macro(sigmoid, Sigmoid, 
SigmoidFunctor, SigmoidGradFunctor); \ + __macro(silu, Silu, SiluFunctor, SiluGradFunctor); \ __macro(logsigmoid, LogSigmoid, LogSigmoidFunctor, LogSigmoidGradFunctor); \ __macro(atan, Atan, AtanFunctor, AtanGradFunctor); \ __macro(softshrink, SoftShrink, SoftShrinkFunctor, SoftShrinkGradFunctor); \ diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 923b581af287d1..f368c658230555 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -77,8 +77,7 @@ class PowGradNPUKernel : public framework::OpKernel { // 2.1 Get a factor tensor with shape [1]. Tensor factor_tensor(framework::proto::VarType::FP32); factor_tensor.mutable_data({1}, place); - TensorFromVector(std::vector{factor}, ctx.device_context(), - &factor_tensor); + FillNpuTensorWithConstant(&factor_tensor, factor); // 2.2 Get the factor which has the shape with x and the same value with // factor. diff --git a/paddle/fluid/operators/amp/alloc_float_status_op.cc b/paddle/fluid/operators/amp/alloc_float_status_op.cc new file mode 100644 index 00000000000000..181dd6eabe22d7 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class AllocFloatStatusOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("FloatStatus"), "Output", "FloatStatus", + "alloc_float_status"); + ctx->SetOutputDim("FloatStatus", {8}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.GetPlace()); + } +}; + +class AllocFloatStatusMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("FloatStatus", + "(Tensor) of shape {8} that holds the float status."); + AddComment(R"DOC( + Produces a float Tensor that holds the float status +)DOC"); + } +}; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Operator alloc_float_status is not supported on CPU")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPU = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR( + alloc_float_status, ops::AllocFloatStatusOp, ops::AllocFloatStatusMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); + +REGISTER_OP_CPU_KERNEL(alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc new file mode 100644 index 00000000000000..fe5b08af52a624 --- /dev/null +++ b/paddle/fluid/operators/amp/alloc_float_status_op_npu.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AllocFloatStatusKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* float_status = ctx.Output("FloatStatus"); + float_status->mutable_data(ctx.GetPlace()); + + auto runner = NpuOpRunner("NPUAllocFloatStatus", {}, {*float_status}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + alloc_float_status, + ops::AllocFloatStatusKernel); diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc index 9d78936ad5f7f2..c7520dbd34f6a9 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cc @@ -60,6 +60,12 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("Scale", "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale " "operator."); +#ifdef PADDLE_WITH_ASCEND_CL + AddInput("FloatStatus", + "(Tensor) 1-dim tensor of shape [8], allocated by " + "alloc_float_status op") + .AsDispensable(); +#endif 
AddOutput("Out", "(Tensors) The scaled output tensor of " "check_finite_and_unscale operator.") diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu index 2c3a9c366e4fd0..c699486a9140a3 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op.cu @@ -39,33 +39,36 @@ __global__ void CheckFiniteAndUnscale(const T** xs, const MT* scale, __syncthreads(); const int64_t num = s_starts[size]; - int pre_xs_index = 0; - bool t_found_inf = false; - const MT t_scale = *scale; + int xs_index = 0; + bool local_found_inf = false; + const MT local_scale = *scale; for (int64_t idx = tid; idx < num; idx += gridDim.x * blockDim.x) { - // get the xs's index of thread - int xs_index = pre_xs_index; - while (idx < s_starts[xs_index]) xs_index++; - // avoid some tensor's numel is zero - while (idx >= s_starts[xs_index]) xs_index++; - pre_xs_index = xs_index - 1; + // get the "out" index of "id" + // For example: + // idx = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= idx < 20 ==> + // the idx element locate in the 3rd tensor (notice the 2nd tensor size is + // 0) + int next_xs_index = xs_index; + while (idx >= s_starts[next_xs_index]) next_xs_index++; + xs_index = next_xs_index - 1; // get in data and out data - const T* in = xs[pre_xs_index]; - T* out = outs[pre_xs_index]; - int64_t in_idx = idx - s_starts[pre_xs_index]; + const T* in = xs[xs_index]; + T* out = outs[xs_index]; + int64_t in_idx = idx - s_starts[xs_index]; // Unscale - MT val = static_cast(in[in_idx]) * t_scale; + MT val = static_cast(in[in_idx]) * local_scale; T narrow_val = static_cast(val); out[in_idx] = narrow_val; // CheckFinite if (!isfinite(narrow_val)) { - t_found_inf = true; + local_found_inf = true; } } - if (t_found_inf) { + if (local_found_inf) { *found_inf = true; } } @@ -94,28 +97,30 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel 
{ scale_data, inverse_scale_v, found_inf_data); size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); // calculate each tensor's start index and copy to device auto h_starts_tensor = - memory::Alloc(platform::CPUPlace(), (xs_size + 1) * sizeof(int64_t)); + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); int64_t* h_starts = reinterpret_cast(h_starts_tensor->ptr()); auto d_starts_tensor = memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); int64_t* d_starts = reinterpret_cast(d_starts_tensor->ptr()); + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // xs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] h_starts[0] = 0; for (int i = 1; i <= xs_size; i++) { - // the start index value of each tensor is - // the sum of previous tensor's size h_starts[i] = h_starts[i - 1] + xs[i - 1]->numel(); } int64_t total_num = h_starts[xs_size]; memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), - d_starts, platform::CPUPlace(), h_starts, - (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()); + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); // copy each tensor's data address to device - auto h_mem = memory::Alloc(platform::CPUPlace(), 2 * xs_size * sizeof(T*)); + auto h_mem = memory::Alloc(cpu_place, 2 * xs_size * sizeof(T*)); const T** h_xs = reinterpret_cast(h_mem->ptr()); T** h_outs = reinterpret_cast(h_mem->ptr()) + xs_size; @@ -128,16 +133,18 @@ class CheckFiniteAndUnscaleGpuKernel : public framework::OpKernel { h_outs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); } memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), d_xs, - platform::CPUPlace(), h_xs, 2 * xs_size * sizeof(T*), - dev_ctx.stream()); + cpu_place, h_xs, 2 * xs_size * sizeof(T*), dev_ctx.stream()); // Launch Kernel - int block = 1024; - int block_num = block * 20; // each thread deal with 20 number - int grid = (total_num + block_num - 1) / 
block_num; + int threads_per_block = std::min(static_cast(1024), total_num); + int elements_per_block = + threads_per_block * 20; // each thread deal with 20 number + int blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; VLOG(3) << "launch kernel"; - CheckFiniteAndUnscale<<< - grid, block, (xs_size + 1) * sizeof(int64_t), dev_ctx.stream()>>>( + CheckFiniteAndUnscale< + T, MPDType><<>>( d_xs, inverse_scale_v, xs_size, d_starts, found_inf_data, d_outs); VLOG(3) << "finish kernel"; } diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc index 46f9f7ff089448..8fd45326e4ec61 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu.cc @@ -24,12 +24,19 @@ namespace operators { using Tensor = framework::Tensor; +// NOTE(zhiqiu): The CheckFiniteAndUnscaleNPUKernel is different from CUDA. +// On NPU, we do not really check the data of input tensors, +// but use NPUGetFloatStatus to check whether the nan/inf occurs on device, +// and clear it after this op. +// Which may leads to wrong result if the input tensors is not calculated +// on NPU device, but got from other way, for example, feeding. 
template class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const { const auto xs = ctx.MultiInput("X"); const auto* scale = ctx.Input("Scale"); + const auto* float_status = ctx.Input("FloatStatus"); auto outs = ctx.MultiOutput("Out"); auto* found_inf = ctx.Output("FoundInfinite"); @@ -44,10 +51,7 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { // step1: inverse scale(RealDiv) Tensor const_tensor; const_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(1.0)}, ctx.device_context(), - &const_tensor); - - ctx.template device_context().Wait(); + FillNpuTensorWithConstant(&const_tensor, static_cast(1.0)); // Inverse(1.0/scale) Tensor* tmp_inverse_out = const_cast(scale); @@ -59,53 +63,60 @@ class CheckFiniteAndUnscaleNPUKernel : public framework::OpKernel { runner_inverse.Run(stream); tmp_inverse_out = &inverse_out; - size_t x_size = xs.size(); - for (size_t i = 0; i < x_size; ++i) { - found_inf_data = true; + // NOTE(zhiqiu): + Tensor tmp; + tmp.mutable_data({8}, ctx.GetPlace()); + + // NOTE(zhiqiu): NPUGetFloatStatus updates data on input in-place. + // tmp is only placeholder. 
+ auto runner_float_status = + NpuOpRunner("NPUGetFloatStatus", {*float_status}, {tmp}, + {{"message", std::string("check_nan_and_inf")}}); + runner_float_status.Run(stream); + + Tensor sum; + sum.mutable_data({1}, ctx.GetPlace()); + auto runner_reduce_sum = + NpuOpRunner("ReduceSumD", {*float_status}, {sum}, + {{"axes", std::vector{0}}, {"keep_dims", true}}); + runner_reduce_sum.Run(stream); + + std::vector sum_vec; + TensorToVector( + sum, ctx.template device_context(), + &sum_vec); + found_inf_data = (sum_vec[0] > 1); + + VLOG(4) << "found_inf_data:" << found_inf_data; + + for (size_t i = 0; i < xs.size(); ++i) { const auto* x = xs[i]; auto* out = outs[i]; out->mutable_data(ctx.GetPlace()); - - // step2: CheckNumerics - // CheckNumerics runs on the Ascend AI CPU, which delivers poor - // performance. - Tensor check_xout(x->type()); - check_xout.Resize(x->dims()); - check_xout.mutable_data(ctx.GetPlace()); - try { - auto runner_checknumerics = - NpuOpRunner("CheckNumerics", {*x}, {check_xout}, - {{"message", std::string("check_nan_and_inf")}}); - runner_checknumerics.Run(stream); - } catch (platform::EnforceNotMet& exception) { - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; - } catch (...) 
{ - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; - } - if (!found_inf_data) { // MatMul auto runner_matmul = NpuOpRunner("Mul", {*x, *tmp_inverse_out}, {*out}, {}); runner_matmul.Run(stream); - } else { - // ZerosLike - auto runner_zeroslike = NpuOpRunner("ZerosLike", {*x}, {*out}, {}); - runner_zeroslike.Run(stream); - } // end if - } // end for + } + } // set found_inf to true - if (found_inf_data) { - Tensor found_inf_tensor; - found_inf_tensor.Resize({1}); - bool* is_found_inf = - found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); - *is_found_inf = true; - framework::TensorCopySync(found_inf_tensor, ctx.GetPlace(), found_inf); - } + VLOG(4) << "found overflow:" << found_inf_data; + Tensor found_inf_tensor; + found_inf_tensor.Resize({1}); + bool* is_found_inf = + found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); + *is_found_inf = found_inf_data; + + framework::TensorCopy( + found_inf_tensor, ctx.GetPlace(), + ctx.template device_context(), found_inf); + ctx.template device_context().Wait(); + + auto runner_clear_status = + NpuOpRunner("NPUClearFloatStatus", {*float_status}, {tmp}); + runner_clear_status.Run(stream); } }; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index 99e81a4757d0e0..a80b83f0cbe51f 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -110,22 +110,22 @@ void Compare(f::Scope *scope, const p::DeviceContext &ctx) { // out found_inf Tensor found_inf_tensor; found_inf_tensor.Resize({1}); - bool *is_finite_data = + bool *found_inf_data = found_inf_tensor.mutable_data(paddle::platform::CPUPlace()); f::TensorCopy(*found_inf, place, &found_inf_tensor); - EXPECT_FALSE(*is_finite_data); + EXPECT_TRUE(*found_inf_data); ctx.Wait(); } TEST(check_finite_and_unscale, NPU_fp32) { 
f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(check_finite_and_unscale, NPU_fp16) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op.cu b/paddle/fluid/operators/amp/update_loss_scaling_op.cu index b48b0e78892933..de1f83c1ee50d0 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op.cu +++ b/paddle/fluid/operators/amp/update_loss_scaling_op.cu @@ -34,13 +34,39 @@ __global__ void GpuUpdateLossScaling( } template -__global__ void FillIf(T* data, const int64_t num, const T value, - const bool* has_inf) { - if (*has_inf) { - int tid = threadIdx.x + blockIdx.x * blockDim.x; - for (int i = tid; i < num; i += blockDim.x * gridDim.x) { - data[i] = value; - } +__global__ void FusedFillIf(T** outs, const size_t xs_size, + const int64_t* starts, const T value, + const bool* has_inf) { + if (!(*has_inf)) return; + + const int tid = threadIdx.x + blockIdx.x * blockDim.x; + + // copy starts array from global memory to shared memory + extern __shared__ int64_t s_starts[]; + for (int i = threadIdx.x; i <= xs_size; i += blockDim.x) { + s_starts[i] = starts[i]; + } + __syncthreads(); + + const int64_t total_num = s_starts[xs_size]; + int out_index = 0; + + for (int64_t id = tid; id < total_num; id += blockDim.x * gridDim.x) { + // get the "out" index of "id" + // For example: + // id = 15, starts = [0, 10, 10, 20, 30] + // because 10 <= id < 20 ==> + // the id element locate in the 3rd tensor (notice the 2nd tensor size is 0) + int next_out_index = out_index; + while (id >= s_starts[next_out_index]) next_out_index++; + out_index = next_out_index - 1; + + // get data pointer and index + T* out_data = outs[out_index]; + int64_t idx = id - 
s_starts[out_index]; + + // set value + out_data[idx] = value; } } @@ -68,15 +94,52 @@ class LazyZeros { const bool* found_inf_data, const std::vector& xs, const std::vector& outs) const { - for (size_t i = 0; i < xs.size(); ++i) { - auto* out = outs[i]; - T* out_data = out->mutable_data(dev_ctx.GetPlace()); - int64_t num = out->numel(); - int block = 1024; - int grid = (block - 1 + num) / block; - FillIf<<>>( - out_data, num, static_cast(0), found_inf_data); + size_t xs_size = xs.size(); + const auto& cpu_place = platform::CPUPlace(); + // alloc each tensor's start index and copy to device + auto h_in_starts_mem = + memory::Alloc(cpu_place, (xs_size + 1) * sizeof(int64_t)); + int64_t* h_starts = reinterpret_cast(h_in_starts_mem->ptr()); + + auto d_in_starts_mem = + memory::Alloc(dev_ctx, (xs_size + 1) * sizeof(int64_t)); + int64_t* d_starts = reinterpret_cast(d_in_starts_mem->ptr()); + + // the start index value of each tensor is + // the sum of previous tensor's size. For example: + // outs = [10, 0, 10, 10] ==> starts = [0, 10, 10, 20, 30] + h_starts[0] = 0; + for (int i = 0; i < xs_size; i++) { + h_starts[i + 1] = h_starts[i] + outs[i]->numel(); } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_starts, cpu_place, h_starts, (xs_size + 1) * sizeof(int64_t), + dev_ctx.stream()); + + // copy each tensor of "outs" data address array to device + auto h_out_addrs_mem = memory::Alloc(cpu_place, xs_size * sizeof(T*)); + T** h_out_addrs = reinterpret_cast(h_out_addrs_mem->ptr()); + + auto d_out_addrs_mem = memory::Alloc(dev_ctx, xs_size * sizeof(T*)); + T** d_out_addrs = reinterpret_cast(d_out_addrs_mem->ptr()); + + for (size_t i = 0; i < xs_size; ++i) { + h_out_addrs[i] = outs[i]->mutable_data(dev_ctx.GetPlace()); + } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_out_addrs, cpu_place, h_out_addrs, xs_size * sizeof(T*), + dev_ctx.stream()); + + // launch cuda kernel + int64_t total_num = h_starts[xs_size]; + 
int64_t threads_per_block = std::min(static_cast(1024), total_num); + int64_t elements_per_block = + threads_per_block * 50; // each thread deal with 50 data + int64_t blocks_per_grid = + (total_num + elements_per_block - 1) / elements_per_block; + FusedFillIf<<>>( + d_out_addrs, xs_size, d_starts, static_cast(0), found_inf_data); } }; diff --git a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc index dd6dbfd5c0b653..45b28bf61e5d68 100644 --- a/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc +++ b/paddle/fluid/operators/amp/update_loss_scaling_op_npu.cc @@ -41,7 +41,7 @@ void Update(const platform::NPUDeviceContext& ctx, // bad_out_data = bad_in_data + 1 Tensor factor_tensor(bad_out_tensor->type()); factor_tensor.mutable_data({1}, place); - TensorFromVector(std::vector{1}, ctx, &factor_tensor); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); auto runner_p2 = NpuOpRunner("Add", {*bad_in_tensor, factor_tensor}, {*bad_out_tensor}, {}); runner_p2.Run(stream); @@ -84,7 +84,7 @@ void Update(const platform::NPUDeviceContext& ctx, // good_out_data = good_in_data + 1 Tensor factor_tensor(good_out_tensor->type()); factor_tensor.mutable_data({1}, place); - TensorFromVector(std::vector{1}, ctx, &factor_tensor); + FillNpuTensorWithConstant(&factor_tensor, static_cast(1)); auto runner_p2 = NpuOpRunner("Add", {*good_in_tensor, factor_tensor}, {*good_out_tensor}, {}); runner_p2.Run(stream); diff --git a/paddle/fluid/operators/assign_op.cc b/paddle/fluid/operators/assign_op.cc index add533bafcb0a7..433cabcfee0104 100644 --- a/paddle/fluid/operators/assign_op.cc +++ b/paddle/fluid/operators/assign_op.cc @@ -162,6 +162,7 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(assign, float, ops::AssignKernel, double, ops::AssignKernel, int, ops::AssignKernel, int64_t, ops::AssignKernel, bool, ops::AssignKernel, plat::float16, + ops::AssignKernel, plat::bfloat16, ops::AssignKernel); #if defined(PADDLE_WITH_CUDA) || 
defined(PADDLE_WITH_HIP) diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 5cf1303a229a90..792d01a5efe430 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -75,6 +75,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, TEST(assign, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "assign"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "assign"); } diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index fc31885824b55f..edad20435b41c9 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -575,7 +575,7 @@ class BatchNormGradKernel // SavedVariance have been reverted in forward operator const auto *saved_inv_variance = ctx.Input("SavedVariance"); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); const DataLayout data_layout = @@ -585,6 +585,8 @@ class BatchNormGradKernel auto *d_scale = ctx.Output(framework::GradVarName("Scale")); auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + use_global_stats = is_test || use_global_stats; + // batch_norm with inplace as false will take X as grad input, which // is same as cuDNN batch_norm backward calculation, batch_norm // with inplace as true only take Y as input and X should be calculate @@ -605,13 +607,6 @@ class BatchNormGradKernel "X@GRAD and Y@GRAD inplaced in non-inplace mode")); } - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - // Get the size for each dimension. // NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 444c24b826b1b8..6fc78732b1063a 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,6 +41,83 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; +template +static __global__ void BNForwardInference( + const T *x, const BatchNormParamType *mean, + const BatchNormParamType *variance, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + int num = N * C * HxW; + for (int i = gid; i < num; i += stride) { + const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; + BatchNormParamType x_sub_mean = + static_cast>(x[i]) - mean[c]; + BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); + y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( + const T *x, const BatchNormParamType *scale, + const BatchNormParamType *bias, const int C, const int N, const int HxW, + const double epsilon, double exponentialAverageFactor, T *y, + BatchNormParamType *mean, BatchNormParamType *variance, + BatchNormParamType *save_mean, + BatchNormParamType *save_inv_variance) { + int outer_size = C; + int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType variance_val; + __shared__ BatchNormParamType inv_var_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + template class BatchNormKernel : public framework::OpKernel { @@ -80,8 +157,12 @@ class BatchNormKernel auto dtype = platform::CudnnDataType::type; #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = test_mode || @@ -111,14 +192,15 @@ class BatchNormKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -138,7 +220,8 @@ class BatchNormKernel epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -161,14 +244,15 @@ class BatchNormKernel } #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, test_mode ? 
miopenBNSpatial : mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -226,28 +310,53 @@ class BatchNormKernel C, est_var->dims()[0], est_var->dims())); #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardInference( - handle, miopenBNSpatial, - const_cast( - static_cast(CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - const_cast(static_cast( - est_mean->template data>())), - const_cast(static_cast( - est_var->template data>())), - epsilon)); + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale->template data>(), + bias->template data>(), C, N, H 
* W * D, + epsilon, transformed_y.template data()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardInference( @@ -365,34 +474,66 @@ class BatchNormKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationForwardTraining( - handle, mode_, const_cast(static_cast( - CudnnDataType::kOne())), - const_cast( - static_cast(CudnnDataType::kZero())), - data_desc_, - static_cast(transformed_x.template data()), - data_desc_, - static_cast( - transformed_y.template mutable_data(ctx.GetPlace())), - bn_param_desc_, - const_cast(static_cast( - scale->template data>())), - const_cast(static_cast( - bias->template data>())), - this_factor, - static_cast( - mean_out->template mutable_data>( - ctx.GetPlace())), - static_cast(variance_out->template mutable_data< - BatchNormParamType>(ctx.GetPlace())), - epsilon, - static_cast( - saved_mean->template mutable_data>( - ctx.GetPlace())), - static_cast(saved_variance->template mutable_data< - BatchNormParamType>(ctx.GetPlace())))); + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int 
max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale->template data>(), + bias->template data>(), C, N, H * W * D, + epsilon, this_factor, transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationForwardTraining( @@ -423,11 +564,12 @@ class BatchNormKernel ctx, &transformed_y, y); } #ifdef 
PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( @@ -439,7 +581,7 @@ class BatchNormKernel }; template -static __global__ void KeBNBackwardScaleBias( +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( const T *dy, const T *x, const BatchNormParamType *mean, const BatchNormParamType *variance, const double epsilon, const int N, const int C, const int HxW, BatchNormParamType *dscale, @@ -526,13 +668,97 @@ class InplaceHelper { }; template -static __global__ void BNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *mean, - const T *x, - const BatchNormParamType *variance, - const int C, const int N, const int HxW, - T *dx) { +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, const T *x, const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, const int C, const int N, + const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType 
mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == framework::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, const BatchNormParamType *scale, + const BatchNormParamType *mean, const T *x, + const BatchNormParamType *variance, const int C, const int N, + const int HxW, T *dx) { const int outer_size = C; const int inner_size = N * HxW; typedef cub::BlockReduce, BlockDim> BlockReduce; @@ -567,7 +793,6 @@ static __global__ void BNBackwardData(const T *dy, dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; } __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { const int index = layout == framework::DataLayout::kNCHW ? (j / HxW * C + i) * HxW + j % HxW @@ -592,7 +817,7 @@ class BatchNormGradKernel platform::errors::InvalidArgument("It must use CUDAPlace.")); double epsilon = static_cast(ctx.Attr("epsilon")); const std::string data_layout_str = ctx.Attr("data_layout"); - const bool use_global_stats = ctx.Attr("use_global_stats"); + bool use_global_stats = ctx.Attr("use_global_stats"); const DataLayout data_layout = framework::StringToDataLayout(data_layout_str); @@ -625,12 +850,7 @@ class BatchNormGradKernel } const bool is_test = ctx.Attr("is_test"); - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); + use_global_stats = is_test || use_global_stats; const auto &x_dims = x->dims(); @@ -668,8 +888,12 @@ class BatchNormGradKernel auto dtype = platform::CudnnDataType::type; const auto *reserve_space = ctx.Input("ReserveSpace"); #ifdef PADDLE_WITH_HIP - // HIP do not support compute format of NHWC - auto compute_format = DataLayout::kNCHW; + auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC + : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; #else const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && @@ -714,7 +938,11 @@ class BatchNormGradKernel auto &dev_ctx = ctx.template device_context(); const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid1 = (num + block - 1) / block; @@ -734,14 +962,15 @@ class BatchNormGradKernel // ------------------- cudnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - miopenTensorDescriptor_t data_desc_; - miopenTensorDescriptor_t bn_param_desc_; - miopenBatchNormMode_t mode_; - - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// 
platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); #else cudnnTensorDescriptor_t data_desc_; cudnnTensorDescriptor_t bn_param_desc_; @@ -759,7 +988,8 @@ class BatchNormGradKernel } epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); #ifdef PADDLE_WITH_HIP - mode_ = miopenBNSpatial; +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; #elif CUDNN_VERSION_MIN(7, 0, 1) if (FLAGS_cudnn_batchnorm_spatial_persistent) { mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; @@ -771,13 +1001,14 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 0, 1) #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), - const_cast(strides.data()))); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? 
x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); #else PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( data_desc_, CudnnDataType::type, @@ -871,20 +1102,49 @@ class BatchNormGradKernel #endif // CUDNN_VERSION_MIN(7, 4, 1) if (!called) { #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); + if (compute_format == DataLayout::kNCHW) { + BNBackward< + T, block, + DataLayout::kNCHW><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward< + T, block, + DataLayout::kNHWC><<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale->template data>(), saved_mean_data, + saved_var_data, C, N, H * W * D, epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), 
+// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); #else PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnBatchNormalizationBackward( @@ -931,11 +1191,12 @@ class BatchNormGradKernel } #ifdef PADDLE_WITH_HIP - // clean when exit. - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_CUDA_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); #else // clean when exit. PADDLE_ENFORCE_CUDA_SUCCESS( diff --git a/paddle/fluid/operators/cast_op_npu.cc b/paddle/fluid/operators/cast_op_npu.cc index 20b33c4e4e05a6..0de0f5e4505795 100644 --- a/paddle/fluid/operators/cast_op_npu.cc +++ b/paddle/fluid/operators/cast_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -41,11 +40,20 @@ class CastNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); int dtype = ctx.Attr("out_dtype"); - auto* out = ctx.Output("Out"); - auto place = ctx.GetPlace(); + if (x->type() == dtype) { + // NOTE(zhiqiu): NPU cast op may result in wrong value, so + // add special case here. + VLOG(4) << "cast to same dtype:" << dtype; + out->mutable_data(place, x->type()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), out); + return; + } + auto iter = DTYPE_2_ACL_DTYPE.find( static_cast(dtype)); int aclDtype = iter->second; @@ -76,7 +84,7 @@ class CastNPUKernel : public framework::OpKernel { } }; } // namespace operators -} // namespace paddleaclDtype +} // namespace paddle namespace ops = paddle::operators; @@ -84,9 +92,9 @@ REGISTER_OP_NPU_KERNEL( cast, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, + ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel, ops::CastNPUKernel); -#endif diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc index eb27df8a36757c..7176a0466bb831 100644 --- a/paddle/fluid/operators/clip_op.cc +++ b/paddle/fluid/operators/clip_op.cc @@ -145,10 +145,14 @@ REGISTER_OPERATOR(clip_grad, ops::ClipOpGrad, ops::ClipGradInplaceInferer, ops::ClipDoubleGradOpMaker); REGISTER_OP_CPU_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CPU_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); REGISTER_OP_VERSION(clip) .AddCheckpoint( diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu index d31b81c13c5cf6..fd61e4ea61d4ff 100644 --- a/paddle/fluid/operators/clip_op.cu +++ b/paddle/fluid/operators/clip_op.cu @@ -17,8 +17,12 @@ 
limitations under the License. */ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( clip, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CUDA_KERNEL( clip_grad, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel, + ops::ClipGradKernel); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 977a208d20e783..3f210219608fb7 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -11,7 +11,7 @@ foreach(src ${OPS}) set_source_files_properties(${src} PROPERTIES COMPILE_FLAGS ${COLLECTIVE_COMPILE_FLAGS}) endforeach() -register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) +register_operators(EXCLUDES c_gen_bkcl_id_op gen_bkcl_id_op c_gen_nccl_id_op gen_nccl_id_op c_gen_hccl_id_op gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) if(WITH_NCCL OR WITH_RCCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} nccl_common collective_helper) @@ -19,12 +19,6 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() -if(WITH_ASCEND) - op_library(gen_nccl_id_op) - op_library(c_gen_nccl_id_op) -endif() - - if(WITH_GLOO) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) endif() @@ -35,5 +29,38 @@ if(WITH_XPU_BKCL) op_library(gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) endif() +if(WITH_ASCEND_CL) + cc_library(gen_hccl_id_op_helper SRCS gen_hccl_id_op_helper.cc DEPS dynload_warpctc dynamic_loader scope) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper gen_hccl_id_op_helper) + op_library(c_gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) + op_library(gen_hccl_id_op DEPS ${COLLECTIVE_DEPS}) +endif() + set(OPERATOR_DEPS ${OPERATOR_DEPS} ${COLLECTIVE_DEPS} PARENT_SCOPE) set(GLOB_COLLECTIVE_DEPS ${COLLECTIVE_DEPS} CACHE INTERNAL "collective dependency") + +if(WITH_ASCEND_CL) + 
set(COMMON_TEST_DEPS_FOR_HCOM c_comm_init_hccl_op c_gen_hccl_id_op gen_hccl_id_op_helper + gen_hccl_id_op op_registry ascend_hccl flags + dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_broadcast_op_npu_test SRCS c_broadcast_op_npu_test.cc + DEPS c_broadcast_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_sum_op_npu_test SRCS c_allreduce_sum_op_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reducescatter_op_npu_test SRCS c_reducescatter_op_npu_test.cc + DEPS c_reducescatter_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allgather_op_npu_test SRCS c_allgather_op_npu_test.cc + DEPS c_allgather_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_reduce_sum_op_npu_test SRCS c_reduce_sum_op_npu_test.cc + DEPS c_reduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_allreduce_max_op_npu_test SRCS c_allreduce_max_op_npu_test.cc + DEPS c_allreduce_max_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(send_v2_op_npu_test SRCS send_v2_op_npu_test.cc + DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc + DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc + DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) + cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc + DEPS op_registry elementwise_add_op c_sync_calc_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) +endif() diff --git a/paddle/fluid/operators/collective/alltoall_op.cc 
b/paddle/fluid/operators/collective/alltoall_op.cc new file mode 100644 index 00000000000000..1c57b9f9967633 --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +namespace paddle { +namespace operators { + +class AllToAllOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "AllToAll"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "AllToAll"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class AllToAllOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor send."); + AddOutput("Out", "(Tensor) the result of alltoall."); + AddAttr("ring_id", "(int 
default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +AllToAll Operator +Scatter tensors from all participators to all participators. +)DOC"); + } +}; + +template +class AllToAllOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("alltoall"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +DECLARE_INPLACE_OP_INFERER(AllToAllInplaceInferer, {"X", "Out"}); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(alltoall, ops::AllToAllOp, ops::AllToAllOpMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllOpGradMaker, + ops::AllToAllInplaceInferer) + +REGISTER_OP_CPU_KERNEL(alltoall, ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel, + ops::AllToAllOpCPUKernel); diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc new file mode 100644 index 00000000000000..1bcb47fc686cfe --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -0,0 +1,95 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/alltoall_op.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllToAllOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_NCCL) +#if NCCL_VERSION_CODE >= 2703 + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + int send_numel = x->numel(); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for alltoall op must be non-negative.", ring_id)); + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + cudaStream_t stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + framework::DDim x_dims = x->dims(); + framework::DDim out_dims(x_dims); + PADDLE_ENFORCE_EQ( + x_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The first dimension size (%d) of the input tensor must be " + "divisible by the number of ranks (%d).", + x_dims[0], nranks)); + auto send_buf = x->data(); + auto recv_buf = out->mutable_data(out_dims, place); + size_t offset = 0; + send_numel /= nranks; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < nranks; ++i) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend( + send_buf + offset, send_numel, dtype, i, comm->comm(), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + recv_buf + offset, 
send_numel, dtype, i, comm->comm(), stream)); + offset += send_numel; + } + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclGroupEnd()); +#else + PADDLE_THROW( + platform::errors::Unavailable("NCCL version >= 2.7.3 is needed.")); +#endif +#else + PADDLE_THROW( + platform::errors::Unavailable("PaddlePaddle should compile with GPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(alltoall, ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel, + ops::AllToAllOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/alltoall_op.h b/paddle/fluid/operators/collective/alltoall_op.h new file mode 100644 index 00000000000000..61eec44093794c --- /dev/null +++ b/paddle/fluid/operators/collective/alltoall_op.h @@ -0,0 +1,42 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +#if defined(PADDLE_WITH_GLOO) +#include "paddle/fluid/framework/fleet/gloo_wrapper.h" +#endif + +namespace paddle { +namespace operators { + +template +class AllToAllOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support alltoall for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_allgather_op.cc b/paddle/fluid/operators/collective/c_allgather_op.cc index 4111a19c5ebc8c..c4e779698cccaf 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cc @@ -42,6 +42,10 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allgather result"); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all gather.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu.cc b/paddle/fluid/operators/collective/c_allgather_op_npu.cc new file mode 100644 index 00000000000000..e7f05549d9efea --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu.cc @@ -0,0 +1,83 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allgather_op.h" + +#include + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CAllGatherOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + framework::DDim out_dims = in->dims(); + out_dims[0] *= nranks; + out->mutable_data(out_dims, place); + + uint64_t send_numel = in->numel(); + void *send_buff = reinterpret_cast(const_cast(in->data())); + void *recv_buff = reinterpret_cast(out->data()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + VLOG(3) << "begin hccl allgather, parameter is: " + << ", group is " << group << ", ring_id is " << ring_id + << ", nranks is " << nranks; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllGather( + send_buff, recv_buff, send_numel, dtype, comm->comm(), + 
reinterpret_cast(stream))); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_allgather, ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel, + ops::CAllGatherOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc new file mode 100644 index 00000000000000..4c7dfc4aad7d0e --- /dev/null +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allgather); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allgather, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + 
std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 1; + int num2 = 4; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + 
ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_allgather", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size() * 2); + for (uint32_t i = 0; i < out_vec.size() / 2; i++) { + EXPECT_EQ(out_vec[i], 1.0); + } + for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_allgather, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllGatherOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cc index 835b49e57bc092..8bdbdfac8ffd1d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cc @@ -37,14 +37,19 @@ class CAllReduceMaxOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Max"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMaxInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_max, ops::CAllReduceOp, - ops::CAllReduceMaxOpMaker); +REGISTER_OPERATOR( + c_allreduce_max, ops::CAllReduceOp, ops::CAllReduceMaxOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMaxInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_max, 
ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc new file mode 100644 index 00000000000000..4dece4a3721ff5 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_max, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc new file mode 100644 index 00000000000000..b7fd2739d51181 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -0,0 +1,188 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_max); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = 
atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + int num1 = 100; + int num2 = 100; + + 
for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id * 3); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_max", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 4.0); + } +} + +TEST(c_allreduce_max, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLAllReduceOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc new file mode 100644 index 00000000000000..b0aa51f7cfdfdc --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_max, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op.cc b/paddle/fluid/operators/collective/c_allreduce_min_op.cc index efc19659c83ec3..9d913b12b13767 100644 --- a/paddle/fluid/operators/collective/c_allreduce_min_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_min_op.cc @@ -37,14 +37,19 @@ class CAllReduceMinOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Min"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceMinInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_min, ops::CAllReduceOp, - ops::CAllReduceMinOpMaker); +REGISTER_OPERATOR( + c_allreduce_min, ops::CAllReduceOp, ops::CAllReduceMinOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceMinInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_min, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc new file mode 100644 index 00000000000000..48e1d2eeb58c52 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc @@ 
-0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_min, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc new file mode 100644 index 00000000000000..2f16a89c217dac --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_min_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_min, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 2f56f43d793fa9..3a74f551e7a30e 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -19,17 +19,31 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/memory/memory.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -105,6 +119,136 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CAllReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = 
platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + out->mutable_data(in->dims(), ctx.GetPlace()); + void* recvbuff = reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl allreduce, parameter is: " + << "input num: " << numel << "dtype: " << dtype + << "hccl_red_type: " << hccl_red_type << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CAllReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + 
out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_all_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CAllReduceOpCUDAKernel : public framework::OpKernel { public: @@ -170,10 +314,20 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the allreduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for all reduce.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") .SetDefault(false); + AddAttr( + "use_model_parallel", + "(bool default false) use this op with model parallel mode. 
In model " + "parallel mode, the backward is c_identity which returns itself for " + "c_allreduce_sum.") + .SetDefault(false); AddComment(string::Sprintf(R"DOC( CAllReduce %s Operator diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc index 5ab07ef026bac5..3ad078e1c8ff0f 100644 --- a/paddle/fluid/operators/collective/c_allreduce_prod_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op.cc @@ -37,14 +37,19 @@ class CAllReduceProdOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Prod"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceProdInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_WITHOUT_GRADIENT(c_allreduce_prod, ops::CAllReduceOp, - ops::CAllReduceProdOpMaker); +REGISTER_OPERATOR( + c_allreduce_prod, ops::CAllReduceOp, ops::CAllReduceProdOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker, + ops::AllreduceProdInplaceInferer) REGISTER_OP_CPU_KERNEL(c_allreduce_prod, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc new file mode 100644 index 00000000000000..f3d14afe0a1bc7 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_prod, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc new file mode 100644 index 00000000000000..92ba00428065bc --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_prod, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc index 68061e6ae6bea0..18c317506c06e1 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cc @@ -37,7 +37,12 @@ class CAllReduceSumOpGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr retv) const override { - retv->SetType("c_allreduce_sum"); + bool use_mp = BOOST_GET_CONST(bool, this->GetAttr("use_model_parallel")); + if (use_mp) { + retv->SetType("c_identity"); + } else { + retv->SetType("c_allreduce_sum"); + } retv->SetInput("X", this->OutputGrad("Out")); retv->SetOutput("Out", this->InputGrad("X")); retv->SetAttrMap(this->Attrs()); @@ -49,6 +54,8 @@ class CAllReduceSumOpMaker : public CAllReduceOpMaker { std::string GetName() const override { return "Sum"; } }; +DECLARE_INPLACE_OP_INFERER(AllreduceSumInplaceInferer, {"X", "Out"}); + } // namespace operators } // namespace paddle @@ -58,7 +65,7 @@ namespace plat = paddle::platform; REGISTER_OPERATOR(c_allreduce_sum, ops::CAllReduceOp, ops::CAllReduceSumOpGradMaker, ops::CAllReduceSumOpGradMaker, - ops::CAllReduceSumOpMaker); + ops::CAllReduceSumOpMaker, ops::AllreduceSumInplaceInferer); REGISTER_OP_CPU_KERNEL(c_allreduce_sum, ops::CAllReduceOpCPUKernel, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc new file mode 100644 index 00000000000000..b66e2e1968908c --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* 
Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + c_allreduce_sum, ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel, + ops::CAllReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc new file mode 100644 index 00000000000000..f1bf9683e35593 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, + int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + 
attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 3.0); + } +} + +TEST(c_allreduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + // only support one device, if more than one device, use first default + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 1; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLAllReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc new file mode 100644 index 00000000000000..e4ec538cd23230 --- /dev/null +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_allreduce_sum, + ops::CAllReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cc b/paddle/fluid/operators/collective/c_broadcast_op.cc index 928fa8549ffb92..271d543eb2364d 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cc @@ -42,6 +42,10 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(0); AddAttr("root", "(int default 0) root id for broadcasting.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc new file mode 100644 index 00000000000000..a60ba86572822c --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CBroadcastOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int root = ctx.Attr("root"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + + VLOG(3) << "begin hccl broadcast, parameter is: " + << "root " << root << ", group is " << group + << ", comm: " << comm->comm() << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + + VLOG(3) << "rank " << comm->rank() << " invoke Bcast. 
recieved " + << framework::product(out->dims()); + + dev_ctx->Wait(); + + if (out != x) { + framework::TensorCopy(*static_cast(x), place, + *platform::DeviceContextPool::Instance().Get(place), + static_cast(out)); + } + dev_ctx->Wait(); + + out->Resize(x->dims()); + out->set_lod(x->lod()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_broadcast, ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel, + ops::CBroadcastOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc new file mode 100644 index 00000000000000..9e39613f3fbe3a --- /dev/null +++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc @@ -0,0 +1,181 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int rank_id = atoi(getenv("RANK_ID")); + + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + 
attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_broadcast, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc new file mode 100644 index 00000000000000..7817f19bacb187 --- /dev/null +++ b/paddle/fluid/operators/collective/c_comm_init_hccl_op.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#endif + +namespace paddle { +namespace operators { + +class CCommInitOpAscend : public framework::OperatorBase { + public: + CCommInitOpAscend(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& place) const override { + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "CCommInitOpAscend can run on npu place only.")); + + auto var = scope.FindVar(Input("X")); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input con not be empty.")); +#if defined(PADDLE_WITH_ASCEND_CL) + HcclRootInfo* hccl_id = var->GetMutable(); + + int rank_ids = Attr("rank_ids"); + int rank_id = Attr("rank"); + int rid = Attr("ring_id"); + int device_id = BOOST_GET_CONST(platform::NPUPlace, place).device; + if (Attr("device_id") >= 0) { + device_id = Attr("device_id"); + } + platform::HCCLCommContext::Instance().CreateHCCLComm( + hccl_id, rank_ids, rank_id, device_id, rid); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +class CCommInitOpAscendMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "Raw variable contains a NCCL UniqueId instaces."); + AddComment(R"DOC( +CCommInit operator + +Initialize collective communicatoin context within this trainer +)DOC"); + AddAttr("rank_ids", + "(int) The number of ranks of distributed trainers"); + AddAttr("rank", + "(int) The rank of the trainer in distributed training."); + 
AddAttr("device_id", + "(int) The deivce_id on which to initialize the communicator." + "Now, you only have to set this attr manually for pipeline " + "training. Otherwise, make it as default.") + .SetDefault(-1); + AddAttr("ring_id", "(int default 0) user specified ring id") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_comm_init_hccl, ops::CCommInitOpAscend, + ops::CCommInitOpAscendMaker); diff --git a/paddle/fluid/operators/collective/c_concat_op.cc b/paddle/fluid/operators/collective/c_concat_op.cc new file mode 100644 index 00000000000000..551fde21162582 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_concat_op.h" + +namespace paddle { +namespace operators { + +class CConcatOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_concat"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_concat"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_concat " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_concat must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_concat must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_concat must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] * nranks; + if (dim[dim.size() - 1] < 0) dim[dim.size() - 1] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CConcatOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_split"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CConcatOpMaker : public 
framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be concated."); + AddOutput("Out", "(Tensor) the result of concat."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CConcat Operator +AllGather the tensors on different trainers and concat them along the last dimension. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_concat, ops::CConcatOp, + ops::CConcatOpGradMaker, + ops::CConcatOpGradMaker, + ops::CConcatOpMaker); + +REGISTER_OP_CPU_KERNEL(c_concat, ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel, + ops::CConcatOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc new file mode 100644 index 00000000000000..bfdc49c440aae7 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -0,0 +1,110 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include "paddle/fluid/operators/collective/c_concat_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CConcatOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + ncclDataType_t dtype = platform::ToNCCLDataType(x->type()); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + int rid = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_concat must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_concat must be " + "less than that of nranks (%d).", + rank, nranks)); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + PADDLE_ENFORCE_EQ( + nranks, comm->nranks(), + platform::errors::InvalidArgument("nranks: %s should equal to %s", + nranks, comm->nranks())); + + framework::Tensor temp_out; + framework::DDim temp_out_dims = x->dims(); + temp_out_dims[0] *= nranks; + temp_out.mutable_data(temp_out_dims, place); + int64_t send_numel = x->numel(); + const T* send_buff = x->data(); + T* recv_buff = temp_out.data(); + gpuStream_t stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + 
stream = static_cast(dev_ctx)->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllGather( + send_buff, recv_buff, send_numel, static_cast(dtype), + comm->comm(), stream)); + + std::vector inputs; + int axis = x->dims().size() - 1; + auto out_dims = x->dims(); + out_dims[out_dims.size() - 1] *= nranks; + int rows_per_tensor = x->dims()[0]; + int offset = 0; + for (int i = 0; i < nranks; i++) { + framework::Tensor temp = temp_out.Slice(offset, offset + rows_per_tensor); + inputs.emplace_back(temp); + offset += rows_per_tensor; + } + + math::ConcatFunctor functor; + out->mutable_data(out_dims, place); + auto& dev_ctx2 = ctx.template device_context(); + functor(dev_ctx2, inputs, axis, out); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with GPU.")); +#endif + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_concat, ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel, + ops::CConcatOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_concat_op.h b/paddle/fluid/operators/collective/c_concat_op.h new file mode 100644 index 00000000000000..55a5799e37b6f5 --- /dev/null +++ b/paddle/fluid/operators/collective/c_concat_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CConcatOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_concat for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc new file mode 100644 index 00000000000000..593eaf923a9784 --- /dev/null +++ b/paddle/fluid/operators/collective/c_gen_hccl_id_op.cc @@ -0,0 +1,111 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#endif + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int rank = Attr("rank"); + framework::Scope& local_scope = scope.NewScope(); + + std::function func = [&](size_t i) -> std::string { + return Output("Out"); + }; + + if (rank == 0) { + std::vector endpoint_list = + Attr>("other_endpoints"); + SendBroadCastHCCLID(endpoint_list, 1, func, local_scope); + } else { + std::string endpoint = Attr("endpoint"); + RecvBroadCastHCCLID(endpoint, 1, func, local_scope); + } + scope.DeleteScope(&local_scope); + } +}; + +#else + +class CGenHCCLIdOp : public framework::OperatorBase { + public: + CGenHCCLIdOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + VLOG(3) << "ele"; + 
AddOutput("Out", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +CGenHCCLId operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr("endpoint", + "(string), e.g. 127.0.0.1:6175 " + "current listen endpoint"); + AddAttr>( + "other_endpoints", + "['trainer1_ip:port', 'trainer2_ip:port', ...] " + "list of other trainer endpoints") + .SetDefault({}); + AddAttr("rank", + "(int default 0) " + "The rank of the trainer in distributed training.") + .SetDefault(0); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(c_gen_hccl_id, ops::CGenHCCLIdOp, ops::CGenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc index 7da30f64d1ce39..470537582e9783 100644 --- a/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc +++ b/paddle/fluid/operators/collective/c_gen_nccl_id_op.cc @@ -66,6 +66,9 @@ class CGenNCCLIdOp : public framework::OperatorBase { return Output("Out"); }; + std::string endpoint = Attr("endpoint"); + int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); + std::vector nccl_ids; nccl_ids.resize(1); @@ -75,8 +78,6 @@ class CGenNCCLIdOp : public framework::OperatorBase { Attr>("other_endpoints"); platform::SendBroadCastCommID(endpoint_list, &nccl_ids); } else { - std::string endpoint = Attr("endpoint"); - int server_fd = platform::SocketServer::GetInstance(endpoint).socket(); platform::RecvBroadCastCommID(server_fd, endpoint, &nccl_ids); } diff --git a/paddle/fluid/operators/collective/c_identity_op.cc b/paddle/fluid/operators/collective/c_identity_op.cc new file mode 100644 index 00000000000000..646c27b90e17ea --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cc @@ -0,0 +1,92 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +class CIdentityOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_identity"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_identity"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity must be non-negative.", ring_id)); + framework::DDim dim = ctx->GetInputDim("X"); + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class CIdentityOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) identity tensor."); + AddOutput("Out", "(Tensor) identity tensor."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default true) eject CUDA operations to calculation stream.") + .SetDefault(true); + AddAttr("use_model_parallel", + "(bool default true) use this op with model 
parallel.") + .SetDefault(true); + AddComment(R"DOC( +Identity Operator which returns a copy of itself. +)DOC"); + } +}; + +template +class CIdentityOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allreduce_sum"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_identity, ops::CIdentityOp, + ops::CIdentityOpGradMaker, + ops::CIdentityOpGradMaker, + ops::CIdentityOpMaker); + +REGISTER_OP_CPU_KERNEL(c_identity, ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel, + ops::CIdentityOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc new file mode 100644 index 00000000000000..8ccf40e317aded --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_identity_op.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_identity op must be non-negative.", rid)); + out->mutable_data(ctx.GetPlace()); + + TensorCopy(*x, out->place(), out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_identity, ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel, + ops::CIdentityOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_identity_op.h b/paddle/fluid/operators/collective/c_identity_op.h new file mode 100644 index 00000000000000..ca817fb6bac0e1 --- /dev/null +++ b/paddle/fluid/operators/collective/c_identity_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CIdentityOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_identity for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc new file mode 100644 index 00000000000000..f35b4c2f707226 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_max, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc new file mode 100644 index 00000000000000..6d3af7bb5f258b --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_max_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_max, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc new file mode 100644 index 00000000000000..6ebb7e4c40e68e --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_min, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc new file mode 100644 index 00000000000000..791e58d8493cec --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_min_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_min, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_op.h b/paddle/fluid/operators/collective/c_reduce_op.h index 1bce01e13a2ad2..fa9fd079d8e48b 100644 --- a/paddle/fluid/operators/collective/c_reduce_op.h +++ b/paddle/fluid/operators/collective/c_reduce_op.h @@ -24,15 +24,28 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) #include "paddle/fluid/platform/collective_helper.h" +#endif + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/nccl_helper.h" #endif + +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + #if defined(PADDLE_WITH_GLOO) #include #include "paddle/fluid/framework/fleet/gloo_wrapper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -110,6 +123,148 @@ class CReduceOpCPUKernel : public framework::OpKernel { } }; +template +class CReduceOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + int64_t numel = in->numel(); + + void* sendbuff = reinterpret_cast(const_cast(in->data())); + void* recvbuff = 
reinterpret_cast(out->data()); + + int ring_id = ctx.Attr("ring_id"); + int root_id = ctx.Attr("root_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int rank_id = comm->rank(); + + HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM; + switch (red_type) { + case kRedSum: + hccl_red_type = HCCL_REDUCE_SUM; + break; + + case kRedMax: + hccl_red_type = HCCL_REDUCE_MAX; + break; + + case kRedMin: + hccl_red_type = HCCL_REDUCE_MIN; + break; + + case kRedProd: + hccl_red_type = HCCL_REDUCE_PROD; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + VLOG(3) << "begin hccl reduce, parameter is: " + << "input num: " << numel << "root_id: " << root_id + << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type + << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, numel, dtype, hccl_red_type, comm->comm(), + reinterpret_cast(stream))); + + if (rank_id != root_id) { + auto npu_place = BOOST_GET_CONST(platform::NPUPlace, place); + memory::Copy(npu_place, reinterpret_cast(out->data()), + npu_place, + reinterpret_cast(const_cast(in->data())), + numel * sizeof(T), stream); + } + + out->Resize(in->dims()); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +template +class CReduceOpXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_XPU_BKCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + BKCLDataType 
dtype = platform::ToBKCLDataType(in->type()); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + out->Resize(in->dims()); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + int root = ctx.Attr("root_id"); + auto comm = platform::BKCLCommContext::Instance().Get(rid, place); + + XPUStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx) + ->x_context() + ->xpu_stream; + } else { + stream = comm->stream(); + } + + BKCLOp bkcl_red_type = BKCL_ADD; + switch (red_type) { + case kRedSum: + bkcl_red_type = BKCL_ADD; + break; + + case kRedMax: + bkcl_red_type = BKCL_MAX; + break; + + case kRedMin: + bkcl_red_type = BKCL_MIN; + break; + + case kRedProd: + bkcl_red_type = BKCL_PRODUCT; + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid reduce type: %d", red_type)); + } + + PADDLE_ENFORCE_EQ(bkcl_reduce(comm->comm(), sendbuff, recvbuff, numel, + dtype, bkcl_red_type, root, stream), + BKCL_SUCCESS, platform::errors::PreconditionNotMet( + "BKCL all reduce failed")); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should be compiled with XPU.")); +#endif + } +}; + template class CReduceOpCUDAKernel : public framework::OpKernel { public: @@ -179,6 +334,10 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "(Tensor) the reduced result."); AddAttr("ring_id", "(int default 0) communication ring id.") .SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce.") + .SetDefault("tag"); +#endif AddAttr("root_id", "(int default 0) root id.").SetDefault(0); AddAttr( "use_calc_stream", diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc new file mode 100644 index 00000000000000..f0b7021e7997d9 --- /dev/null +++ 
b/paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_prod, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc new file mode 100644 index 00000000000000..e7e770e8ffdcaf --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_prod_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_prod, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc new file mode 100644 index 00000000000000..dd4dbbd5f36457 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc @@ -0,0 +1,31 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct ASCENDPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reduce_sum, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel, + ops::CReduceOpASCENDKernel) diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc new file mode 100644 index 00000000000000..3683c7722ba3bf --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_reduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reduce_sum); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reduce_sum, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(3) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = 
" << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + int rank_id = atoi(getenv("RANK_ID")); + int num1 = 3; + int num2 = 128; + + std::vector init; + for (int64_t i = 0; i < num1 * num2; ++i) { + 
init.push_back(1.0 + rank_id); + } + PrintDebugInfo("input data", init); + + auto place = ctx.GetPlace(); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + ctx.Wait(); + + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx_" + std::to_string(iter)); + attrs["ring_id"] = 0; + int root_id = 0; + attrs["root_id"] = root_id; + + auto op = f::OpRegistry::CreateOp("c_reduce_sum", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + if (rank_id == root_id) { + EXPECT_EQ(out_vec[i], 3.0); + } else { + EXPECT_EQ(out_vec[i], init[i]); + } + } +} + +TEST(c_reduce_sum, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + for (int i = 0; i < 2; i++) { + VLOG(2) << "iter num: " << i; + TestHCCLReduceOp(&scope, ctx, i); + } +} diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc new file mode 100644 index 00000000000000..a0ec4d2a99cd71 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reduce_sum_op_xpu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/c_reduce_op.h" + +namespace paddle { +namespace platform { +struct XPUPlace; +struct float16; +} // namespace platform +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(c_reduce_sum, + ops::CReduceOpXPUKernel) diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cc index ada1fd2b1270cc..7836f11dc9b1fb 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cc @@ -49,6 +49,10 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("nranks", "Total trainer count of the distributed training job") .SetDefault(1); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for reduce scatter.") + .SetDefault("tag"); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.h b/paddle/fluid/operators/collective/c_reducescatter_op.h index 366d8a3747cfb7..490b152bc2d302 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.h +++ b/paddle/fluid/operators/collective/c_reducescatter_op.h @@ -22,6 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc new file mode 100644 index 00000000000000..44096a82c34d61 --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu.cc @@ -0,0 +1,87 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CReduceScatterOpAscendKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int ring_id = ctx.Attr("ring_id"); + std::string group = + std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id); + auto place = ctx.GetPlace(); + auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place); + int nranks = comm->nranks(); + + auto out_dims = in->dims(); + PADDLE_ENFORCE_EQ(out_dims[0] % nranks, 0, + platform::errors::InvalidArgument( + "The input tensor X's " + "dim[0] (%d) should be divisible by nranks(%d)", + out_dims[0], nranks)); + + out_dims[0] = out_dims[0] / nranks; + out->mutable_data(out_dims, place); + + uint64_t recv_numel = in->numel() / nranks; + + void* inputPtr = reinterpret_cast(const_cast(in->data())); + void* outputPtr = reinterpret_cast(out->data()); + HcclDataType dtype = platform::ToHCCLDataType(in->type()); + + aclrtStream stream = nullptr; + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + VLOG(3) << "begin hccl reduce scatter, parameter is: " + << "recv_numel: " << recv_numel << "dtype: " << dtype + << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclReduceScatter( + inputPtr, outputPtr, recv_numel, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + 
+} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(c_reducescatter, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel, + ops::CReduceScatterOpAscendKernel); diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc new file mode 100644 index 00000000000000..f82f050a7206fe --- /dev/null +++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc @@ -0,0 +1,189 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/c_reducescatter_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_reducescatter); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_reducescatter, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + 
std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + + std::vector init; + int num1 = 4; + int num2 = 1; + + for (int64_t i = 0; i < num1 * num2; ++i) { + init.push_back(1.0); + } + PrintDebugInfo("input data", init); + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num1, num2}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num1, num2}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + 
attrs["tag"] = std::string("tagx"); + attrs["ring_id"] = 0; + attrs["nranks"] = 2; + + auto op = f::OpRegistry::CreateOp("c_reducescatter", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + int iter_num = 10; + for (int i = 0; i < iter_num; i++) { + op->Run(*scope, place); + ctx.Wait(); + } + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + + PrintDebugInfo("output data", out_vec); + EXPECT_EQ(out_vec.size(), init.size() / 2); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 2.0); + } +} + +TEST(c_reducescatter, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLReduceScatterOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_split_op.cc b/paddle/fluid/operators/collective/c_split_op.cc new file mode 100644 index 00000000000000..03046d571d0f05 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cc @@ -0,0 +1,112 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/c_split_op.h" + +namespace paddle { +namespace operators { + +class CSplitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "c_split"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "c_split"); + int nranks = ctx->Attrs().Get("nranks"); + int rank = ctx->Attrs().Get("rank"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE(nranks, 2, platform::errors::InvalidArgument( + "The number of ranks (%d) for c_split " + "must be greater than 1.", + nranks)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for c_split must be non-negative.", ring_id)); + PADDLE_ENFORCE_GE( + rank, 0, platform::errors::InvalidArgument( + "The rank (%d) for c_split must be non-negative.", rank)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::InvalidArgument( + "The value of rank (%d) for c_split must " + "be less than that of nranks.", + rank, nranks)); + + framework::DDim dim = ctx->GetInputDim("X"); + dim[dim.size() - 1] = dim[dim.size() - 1] / nranks; + if (dim[0] < 0) dim[0] = -1; + ctx->SetOutputDim("Out", dim); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +template +class CSplitOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr retv) const override { + retv->SetType("c_allgather"); + retv->SetInput("X", this->OutputGrad("Out")); + retv->SetOutput("Out", this->InputGrad("X")); + retv->SetAttrMap(this->Attrs()); + } +}; + +class CSplitOpMaker : public framework::OpProtoAndCheckerMaker 
{ + public: + void Make() { + AddInput("X", "(Tensor) tensor to be split."); + AddOutput("Out", "(Tensor) the result of split."); + AddAttr("rank", "(int default 0) rank id.").SetDefault(0); + AddAttr("nranks", "(int default 1) number of ranks.").SetDefault(1); + AddAttr("ring_id", "(int default 0) ring id.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddAttr("use_model_parallel", + "(bool default false) use this op with model parallel.") + .SetDefault(true); + AddComment(R"DOC( +CSplit Operator +Split the tensor evenly according to its rank. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR(c_split, ops::CSplitOp, + ops::CSplitOpGradMaker, + ops::CSplitOpGradMaker, + ops::CSplitOpMaker); + +REGISTER_OP_CPU_KERNEL(c_split, ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel, + ops::CSplitOpCPUKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.cu.cc b/paddle/fluid/operators/collective/c_split_op.cu.cc new file mode 100644 index 00000000000000..92a7f5e41b1d2d --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.cu.cc @@ -0,0 +1,80 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/operators/collective/c_split_op.h" +#include "paddle/fluid/operators/math/concat_and_split.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); + + int nranks = ctx.Attr("nranks"); + int rank = ctx.Attr("rank"); + auto place = ctx.GetPlace(); + + PADDLE_ENFORCE_GE(rank, 0, platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "greater than or equal to 0.", + rank)); + PADDLE_ENFORCE_GE(nranks, 2, + platform::errors::PreconditionNotMet( + "The value of nranks (%d) for c_split must be " + "greater than or equal to 2.", + nranks)); + PADDLE_ENFORCE_LT(rank, nranks, + platform::errors::PreconditionNotMet( + "The value of rank (%d) for c_split must be " + "less than that of nranks (%d).", + rank, nranks)); + + auto& dev_ctx = ctx.template device_context(); + std::vector shape_refer; + std::vector results; + size_t numel = x->numel(); + auto dims = x->dims(); + numel /= nranks; + int axis = dims.size() - 1; + dims[dims.size() - 1] /= nranks; + for (int i = 0; i < nranks; i++) { + framework::Tensor* out = new framework::Tensor(); + out->mutable_data(dims, place); + shape_refer.emplace_back(out); + results.emplace_back(out); + } + + math::SplitFunctor functor; + functor(dev_ctx, *x, shape_refer, axis, &results); + out->mutable_data(dims, place); + paddle::framework::TensorCopySync(*results[rank], out->place(), out); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(c_split, ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel, + ops::CSplitOpCUDAKernel); diff --git a/paddle/fluid/operators/collective/c_split_op.h 
b/paddle/fluid/operators/collective/c_split_op.h new file mode 100644 index 00000000000000..ea0c7fc45c66b8 --- /dev/null +++ b/paddle/fluid/operators/collective/c_split_op.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class CSplitOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support c_split for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 700d1173e2ff68..71ab25a7b0ff8a 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -46,7 +46,7 @@ Call calculation stream synchronization. 
}; template -class CSyncCalcStreamCudaKernel : public framework::OpKernel { +class CSyncCalcStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) @@ -61,6 +61,16 @@ class CSyncCalcStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(dev_ctx->stream())); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32) + auto place = ctx.GetPlace(); + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + + auto dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(dev_ctx->stream())); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); @@ -76,5 +86,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, ops::CSyncCalcStreamOpMaker); -REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, - ops::CSyncCalcStreamCudaKernel); +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc new file mode 100644 index 00000000000000..45613715b8260c --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -0,0 +1,107 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(elementwise_add); +USE_OP_DEVICE_KERNEL(elementwise_add, NPU); +USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU); + +template +void Compare(f::Scope* scope, const p::DeviceContext& ctx) { + // init + auto x = scope->Var("X"); + auto tensor_x = x->GetMutable(); + + auto y = scope->Var("Y"); + auto tensor_y = y->GetMutable(); + + std::vector init_x; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_x.push_back(static_cast(1.0)); + } + + std::vector init_y; + for (int64_t i = 0; i < 10 * 10; ++i) { + init_y.push_back(static_cast(2.0)); + } + + TensorFromVector(init_x, ctx, tensor_x); + tensor_x->Resize({10, 10}); + TensorFromVector(init_y, ctx, tensor_y); + tensor_y->Resize({10, 10}); + + f::AttributeMap attrs; + auto place = ctx.GetPlace(); + auto out = scope->Var("Out"); + auto tensor_out = out->GetMutable(); + + // sync data + auto sync_op0 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op0->Run(*scope, place); + + // run + + auto op = + f::OpRegistry::CreateOp("elementwise_add", {{"X", {"X"}}, {"Y", {"Y"}}}, + {{"Out", {"Out"}}}, attrs); + + 
op->Run(*scope, place); + + // sync op run + auto sync_op = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op->Run(*scope, place); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + // sync op copy + auto sync_op2 = f::OpRegistry::CreateOp("c_sync_calc_stream", {{"X", {"X"}}}, + {{"Out", {"Out"}}}, attrs); + sync_op2->Run(*scope, place); + + float expected = 3.0; + + EXPECT_EQ(out_vec.size(), init_x.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], static_cast(expected)); + } +} + +TEST(c_sync_calc_stream, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc index 95b9cd040fe94e..71fda2cd01c8d6 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op.cc @@ -19,6 +19,11 @@ limitations under the License. */ #include "paddle/fluid/platform/nccl_helper.h" #endif +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + namespace paddle { namespace operators { @@ -53,13 +58,11 @@ Call communication stream synchronization. 
}; template -class CSyncCommStreamCudaKernel : public framework::OpKernel { +class CSyncCommStreamKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto place = ctx.GetPlace(); - +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) int ring_id = ctx.Attr("ring_id"); auto stream = platform::NCCLCommContext::Instance().Get(ring_id, place)->stream(); @@ -70,6 +73,15 @@ class CSyncCommStreamCudaKernel : public framework::OpKernel { PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); #endif +#elif defined(PADDLE_WITH_ASCEND_CL) + PADDLE_ENFORCE_EQ(is_npu_place(place), true, + platform::errors::PreconditionNotMet( + "Sync stream op can run on npu place only for now.")); + int ring_id = ctx.Attr("ring_id"); + auto stream = + platform::HCCLCommContext::Instance().Get(ring_id, place)->stream(); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream)); + #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with GPU.")); @@ -85,5 +97,6 @@ namespace ops = paddle::operators; REGISTER_OP_WITHOUT_GRADIENT(c_sync_comm_stream, ops::CSyncCommStreamOp, ops::CSyncCommStreamOpMaker); -REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, - ops::CSyncCommStreamCudaKernel); +REGISTER_OP_CUDA_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_comm_stream, ops::CSyncCommStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc new file mode 100644 index 00000000000000..6c5a6db61483dc --- /dev/null +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -0,0 +1,190 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include + +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_broadcast_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_broadcast); +USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(c_broadcast, NPU); + +DECLARE_string(selected_npus); + +template +void PrintDebugInfo(const std::string preStr, const std::vector& data) { + std::string debugstring = ""; + for (auto ele : data) { + debugstring += std::to_string(ele) + std::string(","); + } + VLOG(2) << preStr << ":" << std::endl << debugstring; +} + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = 
atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + // init + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = 2; + std::vector init; + int 
rank_id = atoi(getenv("RANK_ID")); + std::cout << "rank_id:" << rank_id << std::endl; + for (int64_t i = 0; i < num * num; ++i) { + init.push_back(1.0 + rank_id); + std::cout << init[0]; + } + std::cout << std::endl; + + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + + ctx.Wait(); + + auto place = ctx.GetPlace(); + auto out = scope->Var("OutData"); + auto tensor_out = out->GetMutable(); + tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + // run + f::AttributeMap attrs; + attrs["tag"] = std::string("tagx"); + attrs["root"] = 0; + attrs["ring_id"] = 0; + + auto op = f::OpRegistry::CreateOp("c_broadcast", {{"X", {"Data"}}}, + {{"Out", {"OutData"}}}, attrs); + + op->Run(*scope, place); + + // comm sync + + auto sync_op = f::OpRegistry::CreateOp( + "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); + sync_op->Run(*scope, place); + + // ctx.Wait(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + EXPECT_EQ(out_vec.size(), init.size()); + for (uint32_t i = 0; i < out_vec.size(); i++) { + EXPECT_EQ(out_vec[i], 1.0); + } +} + +TEST(c_sync_comm_stream_op, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + // only support one device, if more than one device, use first default + p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str()))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHCCLBroadcastOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op.cc b/paddle/fluid/operators/collective/gen_hccl_id_op.cc new file mode 100644 index 00000000000000..0cb2dd188725f8 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op.cc @@ -0,0 +1,216 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/op_proto_maker.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/hccl_helper.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/split.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +namespace paddle { +namespace operators { + +#ifdef PADDLE_WITH_ASCEND_CL + +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + std::vector trainers = + Attr>("trainers"); + int trainer_id = Attr("trainer_id"); + std::string endpoint = trainers[trainer_id]; + + PADDLE_ENFORCE_GE(trainer_id, 0, platform::errors::InvalidArgument( + "trainer_id %d is less than 0. Its " + "valid range is [0, trainer_size)")); + PADDLE_ENFORCE_LT( + trainer_id, static_cast(trainers.size()), + platform::errors::OutOfRange("trainer_id %d is out of range. 
Its valid " + "range is [0, trainer_size)", + trainer_id)); + + int hccl_comm_num = Attr("hccl_comm_num"); + int use_hierarchical_allreduce = Attr("use_hierarchical_allreduce"); + int inter_nranks = Attr("hierarchical_allreduce_inter_nranks"); + int inter_trainer_id = -1; + int exter_trainer_id = -1; + + if (use_hierarchical_allreduce) { + PADDLE_ENFORCE_GT( + trainers.size(), 1, + platform::errors::PreconditionNotMet( + "The number of collective trainers %llu <= 1", trainers.size())); + PADDLE_ENFORCE_GT( + inter_nranks, 1, + platform::errors::PreconditionNotMet( + "inter_nranks %d <= 1 while in hierarchical allreduce mode", + inter_nranks)); + PADDLE_ENFORCE_EQ( + trainers.size() % inter_nranks, 0, + platform::errors::PreconditionNotMet( + "The number of trainers %llu mod inter_nranks %d is not equal 0", + trainers.size(), inter_nranks)); + + inter_trainer_id = trainer_id % inter_nranks; + + if (trainer_id % inter_nranks == 0) { + exter_trainer_id = trainer_id / inter_nranks; + } + } + + std::ostringstream ss; + for (size_t i = 0; i < trainers.size(); i++) { + ss << trainers[i] << ","; + } + + VLOG(1) << "trainer_id:" << trainer_id + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce + << ", hccl_comm_num:" << hccl_comm_num + << ", inter_nranks:" << inter_nranks + << ", inter_trainer_id:" << inter_trainer_id + << ", exter_trainer_id:" << exter_trainer_id + << ", trainers:" << ss.str(); + + int server_fd = -1; + + /// 1. init flat + std::function func = platform::GetFlatHCCLVarName; + if (trainer_id == 0) { + // server endpoints + std::vector flat_endpoints; + flat_endpoints.insert(flat_endpoints.begin(), trainers.begin() + 1, + trainers.end()); + SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope); + } else { + server_fd = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 2. 
hierarchical inter ncclid + func = platform::GetHierarchicalInterHCCLVarName; + if (inter_trainer_id == 0) { + std::ostringstream ss; + ss << endpoint; + std::vector inter_endpoints; + for (int i = trainer_id + 1; i < trainer_id + inter_nranks && + i < static_cast(trainers.size()); + i++) { + ss << ","; + inter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope); + } else if (inter_trainer_id > 0) { + VLOG(1) << "Hierarchical inter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + /// 3. hierarchical exter ncclid + func = platform::GetHierarchicalExterHCCLVarName; + if (exter_trainer_id == 0) { + std::ostringstream ss; + std::vector exter_endpoints; + ss << endpoint; + for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) { + ss << ","; + exter_endpoints.push_back(trainers[i]); + ss << trainers[i]; + } + VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str(); + + SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope); + } else if (exter_trainer_id > 0) { + VLOG(1) << "Hierarchical exter ring"; + RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope); + } + + // close socket server + if (trainer_id != 0) { + CloseSocket(server_fd); + } + } +}; + +#else +class GenHCCLIdOp : public framework::OperatorBase { + public: + GenHCCLIdOp(const std::string& type, const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override {} +}; + +#endif + +class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("HCCLID", "Raw variable contains a HCCL UniqueId instaces."); + AddComment(R"DOC( +GenHCCLId 
operator + +For trainer 0: generate a new UniqueId and send it to all the other trainers. +For trainer 1~n: start a gRPC server to get the UniqueId, once got, stop the server. +)DOC"); + AddAttr>( + "trainers", + "['trainer0_ip:port', 'trainer1_ip:port', ...] " + "list of all trainer endpoints") + .SetDefault({}); + AddAttr("trainer_id", + "(int) " + "The index of the trainer in distributed training."); + AddAttr("hccl_comm_num", + "(int default 1) " + "The number of nccl communicator num.") + .SetDefault(1); + AddAttr("use_hierarchical_allreduce", + "(bool default false) " + "Wheter to use hierarchical allreduce.") + .SetDefault(false); + AddAttr("hierarchical_allreduce_inter_nranks", + "(int default 1) " + "Wheter to use hierarchical allreduce.") + .SetDefault(-1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(gen_hccl_id, ops::GenHCCLIdOp, ops::GenHCCLIdOpMaker); diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc new file mode 100644 index 00000000000000..15940a76f71105 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc @@ -0,0 +1,350 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_type_traits.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +constexpr char COMM_HEAD[] = "_pd_gen_comm_id_"; +#define HCCL_UNIQUE_ID_BYTES 1024 + +// Check system calls, such as socket, bind. +#define CHECK_SYS_CALL(call, name) \ + do { \ + int retval; \ + CHECK_SYS_CALL_VAL(call, name, retval); \ + } while (false) + +#define CHECK_SYS_CALL_VAL(call, name, retval) \ + do { \ + RETRY_SYS_CALL_VAL(call, name, retval); \ + if (retval == -1) { \ + PADDLE_THROW(platform::errors::Unavailable("Call to %s failed: %s", \ + name, strerror(errno))); \ + } \ + } while (false) + +#define RETRY_SYS_CALL_VAL(call, name, retval) \ + do { \ + retval = (call); \ + if (retval == -1 && \ + (errno == EINTR || errno == EWOULDBLOCK || errno == EAGAIN)) { \ + LOG(WARNING) << "Call " << name << " returned " << strerror(errno) \ + << " retry"; \ + } else { \ + break; \ + } \ + } while (true) + +static int SocketSend(int fd, const char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = send(fd, buffer + offset, size - offset, 0); + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + // send failed + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static int SocketRecv(int fd, char* buffer, int size) { + int offset = 0; + int bytes = 0; + while (offset < size) { + bytes = recv(fd, buffer + offset, size - offset, 0); + if (bytes == 0) { + // closed by client, maybe probing alive client 
+ return 0; + } + if (bytes == -1) { + if (errno != EINTR && errno != EWOULDBLOCK && errno != EAGAIN) { + return -1; + } else { + bytes = 0; + } + } + offset += bytes; + } + return offset; +} + +static void BindOrConnectFailed(int timeout, int* try_times, int* total_time, + const char* op, const std::string& ep) { + PADDLE_ENFORCE_LT( + *total_time, timeout, + platform::errors::Unavailable("%s addr=%s timeout, failed reason: %s", op, + ep.c_str(), strerror(errno))); + ++(*try_times); + int retry_time = std::min(*try_times * 500, 3000); // max 3 seconds + *total_time += retry_time; + + LOG(WARNING) << op << " addr=" << ep << " failed " << *try_times + << " times with reason: " << strerror(errno) << " retry after " + << retry_time / 1000.0 << " seconds"; + std::this_thread::sleep_for(std::chrono::milliseconds(retry_time)); +} + +int CreateListenSocket(const std::string& ep) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + // creating socket fd + int server_fd = -1; + CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", server_fd); + + // NOTE. Solutions to `Address already in use`. + // 1. Reuse addr&port. Otherwise, once the server closes the socket + // before client, the server will enter TIME-WAIT status. If we bind port + // again, the error `Address already in use` will appear. + // 2. Or we can close the client first to ensure that the server does + // not enter the TIME-WAIT state. But this is obviously not as convenient + // as the reuse method. 
+ int opt = 1; +#if defined(SO_REUSEPORT) + // since Linux kernel 3.9 + CHECK_SYS_CALL(setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, + &opt, sizeof(opt)), + "setsockopt"); +#else + CHECK_SYS_CALL( + setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)), + "setsockopt"); +#endif + + struct sockaddr_in address; + address.sin_family = AF_INET; + address.sin_addr.s_addr = INADDR_ANY; + address.sin_port = htons(port); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + bind(server_fd, (struct sockaddr*)&address, sizeof(address)), "bind", + ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "bind", ep); + continue; + } + break; + } + + CHECK_SYS_CALL(listen(server_fd, 3), "listen"); + LOG(INFO) << "Server listening on: " << ep << " successful."; + return server_fd; +} + +void CloseSocket(int fd) { CHECK_SYS_CALL(close(fd), "close"); } + +static int SocketAccept(int server_fd, const char* head) { + struct sockaddr_in client_addr; + socklen_t addr_length = sizeof(client_addr); + char buffer[1024] = {0}; + int conn = -1; + + while (true) { + CHECK_SYS_CALL_VAL( + accept(server_fd, reinterpret_cast(&client_addr), + &addr_length), + "accept", conn); + + int ret_val = SocketRecv(conn, buffer, strlen(head)); + if (ret_val > 0 && strncmp(buffer, head, strlen(head)) == 0) { + break; // accept client + } else { + VLOG(3) << "socket read failed with ret_val=" << ret_val; + CloseSocket(conn); + } + } + return conn; +} + +static int ConnectAddr(const std::string& ep, const char* head) { + auto addr = paddle::string::Split(ep, ':'); + PADDLE_ENFORCE_EQ( + addr.size(), 2UL, + platform::errors::InvalidArgument( + "The endpoint should contain host and port, but got %s.", ep)); + std::string host = addr[0]; + int port = std::stoi(addr[1]); + + int sock = -1; + 
CHECK_SYS_CALL_VAL(socket(AF_INET, SOCK_STREAM, 0), "socket", sock); + + struct sockaddr_in server_addr; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sin_family = AF_INET; + server_addr.sin_port = htons(port); + + char* ip = NULL; + struct hostent* hp = NULL; + hp = gethostbyname(host.c_str()); + PADDLE_ENFORCE_NOT_NULL(hp, platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr*)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + + PADDLE_ENFORCE_GT(inet_pton(AF_INET, ip, &server_addr.sin_addr), 0, + platform::errors::Unavailable("Open address %s failed: %s", + ep, strerror(errno))); + + // TODO(wangxi) Set from env, default 900s=15min + int timeout = 900 * 1000; + int try_times = 0; + int total_time = 0; + while (true) { + int ret_val = -1; + RETRY_SYS_CALL_VAL( + connect(sock, (struct sockaddr*)&server_addr, sizeof(server_addr)), + "connect", ret_val); + + if (ret_val == -1) { + BindOrConnectFailed(timeout, &try_times, &total_time, "connect", ep); + continue; + } + + CHECK_SYS_CALL(SocketSend(sock, head, strlen(head)), "send"); + break; + } + return sock; +} + +static void RecvHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + static_assert(HCCL_UNIQUE_ID_BYTES <= 1024, + "hccl id bytes must <= buffer size"); + + CHECK_SYS_CALL(SocketRecv(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "recv hccl id"); + memcpy(hccl_id, buffer, HCCL_UNIQUE_ID_BYTES); +} + +static void SendHCCLID(int conn, HcclRootInfo* hccl_id) { + char buffer[1024] = {0}; + memcpy(buffer, hccl_id, HCCL_UNIQUE_ID_BYTES); + + CHECK_SYS_CALL(SocketSend(conn, buffer, HCCL_UNIQUE_ID_BYTES), + "send hccl id"); +} + +void SendBroadCastHCCLID(std::vector servers, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + // connect with server + std::vector connects; + for (auto server : servers) { + 
VLOG(3) << "connecting endpoint: " << server; + int conn = ConnectAddr(server, COMM_HEAD); + connects.push_back(conn); + } + VLOG(3) << "connecting completed..."; + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclGetRootInfo(hccl_id)); + + int j = 0; + for (auto conn : connects) { + VLOG(3) << "sending hccl_id_var: " << var_name << " to " << servers[j] + << " hccl_comm_no: " << i; + SendHCCLID(conn, hccl_id); + ++j; + } + VLOG(3) << "sending completed..."; + } + + // close client + for (auto conn : connects) { + CloseSocket(conn); + } +} + +void RecvBroadCastHCCLID(std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int server = CreateListenSocket(endpoint); + RecvBroadCastHCCLID(server, endpoint, hccl_comm_num, func, scope); + CloseSocket(server); +} + +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int hccl_comm_num, + std::function func, + const framework::Scope& scope) { + int client = SocketAccept(server_fd, COMM_HEAD); + + for (int i = 0; i < hccl_comm_num; ++i) { + std::string var_name = func(i); + auto var = scope.FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound("Variable with name %s is not found", + var_name.c_str())); + auto hccl_id = var->GetMutable(); + + VLOG(3) << "trainer: " << endpoint << " receiving hccl_id_var: " << var_name + << " from trainer 0, hccl_comm_no: " << i; + RecvHCCLID(client, hccl_id); + } + VLOG(3) << "receiving completed..."; + CloseSocket(client); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h new file mode 100644 index 
00000000000000..1ad6f791e1fc34 --- /dev/null +++ b/paddle/fluid/operators/collective/gen_hccl_id_op_helper.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { + +int CreateListenSocket(const std::string& ep); + +void CloseSocket(int fd); + +void SendBroadCastHCCLID(std::vector servers, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// server listen on endpoint, then recv nccl id +void RecvBroadCastHCCLID(std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); + +// recv nccl id from socket +void RecvBroadCastHCCLID(int server_fd, std::string endpoint, int nccl_comm_num, + std::function func, + const framework::Scope& scope); +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc index 0ae7b821617f91..39a9ed0c74ef59 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -70,6 +70,12 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); AddAttr("dtype", "(int default 5('float32')) data type of 
tensor.") .SetDefault(5); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr>("out_shape", "shape of the output tensor.") .SetDefault(std::vector()); AddAttr( diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu.cc b/paddle/fluid/operators/collective/recv_v2_op_npu.cc new file mode 100644 index 00000000000000..69f1f4681a33d6 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CRecvOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Output("Out"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int peer = ctx.Attr("peer"); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = peer; + + VLOG(3) << "begin hccl recv, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(recv_v2, ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel, + ops::CRecvOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc new file mode 100644 index 
00000000000000..384dfd1fc5f2d3 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(recv_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(recv_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + 
+ std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + + int num = atoi(getenv("DATA_SIZE")); + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank_id:" << rank_id << std::endl; + + ctx.Wait(); + auto place = ctx.GetPlace(); + auto out = scope->Var("Data"); + auto tensor_out = out->GetMutable(); + 
tensor_out->Resize({num, num}); + tensor_out->mutable_data(place); // allocate + + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("SRC_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + std::vector out_shape; + out_shape.push_back(num); + out_shape.push_back(num); + attrs["out_shape"] = out_shape; + + auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs); + VLOG(3) << "CreateOp recv_v2"; + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "Run op recv_v2"; + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + ctx.Wait(); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + EXPECT_EQ(out_vec == init, true); +} + +TEST(recv_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomRecvOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc index c5a86b4f08813a..c60d560e43baed 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -50,6 +50,12 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { AddAttr("ring_id", "(int default 0) nccl communication ring id.") .SetDefault(0); AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); +#if defined(PADDLE_WITH_ASCEND_CL) + AddAttr("tag", "(string default tag) tag for broadcasting.") + .SetDefault("tag"); + AddAttr("srTag", "(string default tag) tag for broadcasting.") + .SetDefault(0); +#endif AddAttr( "use_calc_stream", "(bool default false) eject CUDA operations to calculation stream.") diff --git a/paddle/fluid/operators/collective/send_v2_op_npu.cc 
b/paddle/fluid/operators/collective/send_v2_op_npu.cc new file mode 100644 index 00000000000000..0ade090fcaac07 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class CSendOpASCENDKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_ASCEND_CL) + auto x = ctx.Input("X"); + void* ptr = reinterpret_cast(const_cast(x->data())); + int numel = x->numel(); + HcclDataType dtype = platform::ToHCCLDataType(x->type()); + + int ring_id = ctx.Attr("ring_id"); + auto place = ctx.GetPlace(); + auto comm = + paddle::platform::HCCLCommContext::Instance().Get(ring_id, place); + + aclrtStream stream = nullptr; + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + if (ctx.Attr("use_calc_stream")) { + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + int nranks = comm->nranks(); + int rank = comm->rank(); + + PADDLE_ENFORCE_EQ(nranks, 2, platform::errors::InvalidArgument( + "The nranks must be 2, but (%d)", nranks)); + + int root = rank; + + VLOG(3) 
<< "begin hccl send, parameter is: " + << "root " << root << ", comm: " << comm->comm() + << ", stream: " << stream; + + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast( + ptr, numel, dtype, (uint32_t)root, comm->comm(), stream)); + +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddlePaddle should compile with NPU.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(send_v2, ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel, + ops::CSendOpASCENDKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc new file mode 100644 index 00000000000000..cf01b1d0a6a1d1 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include // NOLINT +#include +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" +#include "paddle/fluid/operators/collective/send_v2_op.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(send_v2); +USE_NO_KERNEL_OP(c_gen_hccl_id); +USE_NO_KERNEL_OP(c_comm_init_hccl); +USE_OP_DEVICE_KERNEL(send_v2, NPU); + +void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + std::vector rank_ids{0, 1}; + f::AttributeMap gen_hccl_id; + + std::vector endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"}; + gen_hccl_id["rank"] = rank_id; + gen_hccl_id["endpoint"] = endpointList[rank_id]; + std::vector other_endpoints = { + endpointList[rank_id == 0 ? 
1 : 0]}; + gen_hccl_id["other_endpoints"] = other_endpoints; + + auto out = scope->Var("Out"); + auto id = out->GetMutable(); + + VLOG(3) << "break"; + + auto comm_init_op = f::OpRegistry::CreateOp("c_gen_hccl_id", {}, + {{"Out", {"Out"}}}, gen_hccl_id); + VLOG(3) << "break"; + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); + + memcpy(hccl_id, id, 1024); +} + +void Prepare(f::Scope* scope, const p::DeviceContext& ctx, + HcclRootInfo* hccl_id) { + auto x = scope->Var("X"); + auto id = x->GetMutable(); + + memcpy(id, hccl_id, 1024); + + int rank_id = atoi(getenv("RANK_ID")); + int device_id = atoi(getenv("DEVICE_ID")); + + VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id + << "; rank_id = " << rank_id + << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID")); + + // std::vector rank_ids{0, 1}; + f::AttributeMap comm_init_attrs; + comm_init_attrs["ring_id"] = 0; + comm_init_attrs["rank_ids"] = 2; + comm_init_attrs["rank"] = rank_id; + comm_init_attrs["device_id"] = device_id; + // comm_init_attrs["rank_ids"] = rank_ids; + auto comm_init_op = f::OpRegistry::CreateOp( + "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs); + auto place = ctx.GetPlace(); + comm_init_op->Run(*scope, place); + ctx.Wait(); +} + +void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) { + std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl; + auto x = scope->Var("Data"); + auto tensor_x = x->GetMutable(); + int num = atoi(getenv("DATA_SIZE")); + + EXPECT_GT(num, 0); + EXPECT_LT(num, 1 << 15); + std::vector init(num * num, 1.0 * atoi(getenv("DEST_RANK"))); + int rank_id = atoi(getenv("RANK_ID")); + VLOG(3) << "rank id:" << rank_id; + TensorFromVector(init, ctx, tensor_x); + tensor_x->Resize({num, num}); + ctx.Wait(); + auto place = ctx.GetPlace(); + ctx.Wait(); + + f::AttributeMap attrs; + attrs["tag"] = std::string("srtest"); + attrs["peer"] = atoi(getenv("DEST_RANK")); + attrs["ring_id"] = 0; + attrs["srTag"] = 0; + + auto 
op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs); + + for (int i = 0; i < 10; i++) { + op->Run(*scope, place); + } + VLOG(3) << "send run over"; + ctx.Wait(); +} + +TEST(send_v2, NPU) { + f::Scope scope; + HcclRootInfo hccl_id; + + char* npu_id = getenv("FLAGS_selected_npus"); + VLOG(3) << "Select npu:" << npu_id; + p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id))); + + PrepareUniqueId(&scope, ctx, &hccl_id); + Prepare(&scope, ctx, &hccl_id); + TestHcomSendOp(&scope, ctx); +} diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc index 3cad86d96c26a0..bf047de86fc21a 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cc +++ b/paddle/fluid/operators/controlflow/compare_op.cc @@ -23,29 +23,6 @@ limitations under the License. */ namespace paddle { namespace operators { -template -class CompareOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEM_TYPE; - using Tensor = framework::Tensor; - - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* z = context.Output("Out"); - int axis = context.Attr("axis"); - - if (x->numel() == 1 && y->numel() == 1) { - bool* z_data = z->mutable_data(context.GetPlace()); - z_data[0] = Functor()(x->data()[0], y->data()[0]); - } else { - ElementwiseComputeEx( - context, x, y, axis, Functor(), z); - } - } -}; - template class CompareOpProtoMaker : public framework::OpProtoAndCheckerMaker { public: @@ -153,16 +130,22 @@ class CompareOp : public framework::OperatorWithKernel { REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); -REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); REGISTER_COMPARE_OP(less_equal, "Out = X <= Y"); 
-REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CPU, paddle::operators::LessEqualFunctor, + paddle::operators::GreaterThanFunctor); REGISTER_COMPARE_OP(greater_than, "Out = X > Y"); REGISTER_COMPARE_KERNEL(greater_than, CPU, - paddle::operators::GreaterThanFunctor); + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_OP(greater_equal, "Out = X >= Y"); REGISTER_COMPARE_KERNEL(greater_equal, CPU, - paddle::operators::GreaterEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); REGISTER_COMPARE_OP(equal, "Out = X == Y"); -REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(equal, CPU, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); REGISTER_COMPARE_OP(not_equal, "Out = X != Y"); -REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CPU, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.cu b/paddle/fluid/operators/controlflow/compare_op.cu index b1f30635835976..3ca700e16e6e7b 100644 --- a/paddle/fluid/operators/controlflow/compare_op.cu +++ b/paddle/fluid/operators/controlflow/compare_op.cu @@ -14,11 +14,17 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/compare_op.h" -REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor); -REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor); -REGISTER_COMPARE_KERNEL(greater_than, CUDA, +REGISTER_COMPARE_KERNEL(less_than, CUDA, paddle::operators::LessThanFunctor, + paddle::operators::GreaterEqualFunctor); +REGISTER_COMPARE_KERNEL(less_equal, CUDA, paddle::operators::LessEqualFunctor, paddle::operators::GreaterThanFunctor); +REGISTER_COMPARE_KERNEL(greater_than, CUDA, + paddle::operators::GreaterThanFunctor, + paddle::operators::LessEqualFunctor); REGISTER_COMPARE_KERNEL(greater_equal, CUDA, - paddle::operators::GreaterEqualFunctor); -REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor); -REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor); + paddle::operators::GreaterEqualFunctor, + paddle::operators::LessThanFunctor); +REGISTER_COMPARE_KERNEL(equal, CUDA, paddle::operators::EqualFunctor, + paddle::operators::EqualFunctor); +REGISTER_COMPARE_KERNEL(not_equal, CUDA, paddle::operators::NotEqualFunctor, + paddle::operators::NotEqualFunctor); diff --git a/paddle/fluid/operators/controlflow/compare_op.h b/paddle/fluid/operators/controlflow/compare_op.h index b7529e4ae632d3..ff929ee7dfce79 100644 --- a/paddle/fluid/operators/controlflow/compare_op.h +++ b/paddle/fluid/operators/controlflow/compare_op.h @@ -68,7 +68,7 @@ struct NotEqualFunctor { } }; -template +template class CompareOpKernel : public framework::OpKernel { public: @@ -80,21 +80,33 @@ class CompareOpKernel auto* y = context.Input("Y"); auto* z = context.Output("Out"); int axis = context.Attr("axis"); - ElementwiseComputeEx(context, x, y, axis, - Functor(), z); + + auto x_dims = x->dims(); + auto y_dims = y->dims(); + if (x_dims.size() >= y_dims.size()) { + ElementwiseComputeEx(context, x, y, axis, + Functor(), z); + } else { + ElementwiseComputeEx( + context, x, y, axis, 
InverseFunctor(), z); + } } }; } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::CompareOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); +#define REGISTER_COMPARE_KERNEL(op_type, dev, functor, inverse_functor) \ + REGISTER_OP_##dev##_KERNEL(op_type, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>, \ + ::paddle::operators::CompareOpKernel< \ + ::paddle::platform::dev##DeviceContext, \ + functor, inverse_functor>); diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index ab535e341f7575..7fdb1ccfe9614f 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -699,24 +699,51 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { // ------------------- cudnn conv backward data --------------------- ScalingParamType alpha = 1.0f; +#ifdef PADDLE_WITH_HIP + // MIOPEN ONLY support beta to be 0.0f + ScalingParamType beta = 0.0f; +#else ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; +#endif VLOG(4) << "Conv_grad: use_addto = " << ctx.Attr("use_addto"); if (input_grad) { // When beta is 0, it is unnecessary to reset input_grad. // When beta is 1, the output cannot be reset since addt strategy used. 
#ifdef PADDLE_WITH_HIP - workspace_handle.RunFunc( - [&](void* cudnn_workspace_ptr) { - PADDLE_ENFORCE_CUDA_SUCCESS( - platform::dynload::miopenConvolutionBackwardData( - handle, &alpha, args1.odesc.desc(), output_grad_data, - args1.wdesc.desc(), filter_data, args1.cdesc.desc(), - data_algo, &beta, args1.idesc.desc(), - transformed_input_grad_data, cudnn_workspace_ptr, - workspace_size)); - }, - workspace_size); + if (ctx.Attr("use_addto")) { + Tensor temp_tensor(transformed_input_grad.type()); + temp_tensor.Resize(transformed_input_grad.dims()); + T* temp_tensor_data = temp_tensor.mutable_data(ctx.GetPlace()); + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), temp_tensor_data, + cudnn_workspace_ptr, workspace_size)); + }, + workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, args1.idesc.desc(), + transformed_input_grad_data, &alpha, args1.idesc.desc(), + temp_tensor_data, &beta, args1.idesc.desc(), + transformed_input_grad_data)); + } else { + workspace_handle.RunFunc( + [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionBackwardData( + handle, &alpha, args1.odesc.desc(), output_grad_data, + args1.wdesc.desc(), filter_data, args1.cdesc.desc(), + data_algo, &beta, args1.idesc.desc(), + transformed_input_grad_data, cudnn_workspace_ptr, + workspace_size)); + }, + workspace_size); + } + #else for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 3ab27e1ec4f4fc..befe09c8e6beb3 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -146,28 
+146,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.fwd_algo; - } else { - auto& temp = ctx.cuda_device_context(); - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetForward()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.fwd_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; VLOG(3) << "choose algo " << algo; return algo; } @@ -208,27 +188,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.bwd_data_algo; - } else { - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardData()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.bwd_data_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_data_algo; 
VLOG(3) << "choose algo " << algo; return algo; } @@ -269,27 +230,8 @@ struct SearchAlgorithm { cudnn_workspace_ptr, workspace_size, false)); }; - if (!exhaustive_search && !deterministic) { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - algo = find_result.bwd_weights_algo; - } else { - AlgorithmsCache& algo_cache = - *(framework::ConvSearchCache::Instance().GetBackwardFilter()); - - auto x_dims = framework::vectorize(args.x->dims()); - auto w_dims = framework::vectorize(args.w->dims()); - - VLOG(10) << "miopenConvolutionFwdAlgoPerf_t:" - << ", x_dims:" << x_dims << ", w_dims:" << w_dims << ", args.s" - << args.s << ", args.p" << args.p << ", args.d" << args.d; - - algo = algo_cache.GetAlgorithm( - x_dims, w_dims, args.s, args.p, args.d, 0, - static_cast(args.cudnn_dtype), [&]() { - workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); - return find_result.bwd_weights_algo; - }); - } + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.bwd_weights_algo; VLOG(3) << "choose algo " << algo; return algo; } diff --git a/paddle/fluid/operators/copy_cross_scope_op.cc b/paddle/fluid/operators/copy_cross_scope_op.cc new file mode 100644 index 00000000000000..721354954c7035 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_op.cc @@ -0,0 +1,151 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/var_type_traits.h" + +namespace paddle { +namespace framework { +class OpDesc; +template +class EmptyGradOpMaker; +} // namespace framework +namespace imperative { +class OpBase; +} // namespace imperative +} // namespace paddle + +using LoDTensor = paddle::framework::LoDTensor; +using Tensor = paddle::framework::Tensor; + +namespace paddle { +namespace operators { + +class CopyCrossScopeOp : public framework::OperatorBase { + public: + CopyCrossScopeOp(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const {} + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + int num_micro_scopes = scope.kids().size(); + int num_micro_batches = Attr("num_micro_batches"); + bool ToM = Attr("to_main_scope"); + PADDLE_ENFORCE_EQ(num_micro_scopes, num_micro_batches, + platform::errors::InvalidArgument( + "For pipeline, number of micro scopes (%d) should " + "be equal to number of micro batches (%d).", + num_micro_scopes, num_micro_batches)); + const std::string& id_name = Input("Id"); + auto* id_var = scope.FindVar(id_name); + PADDLE_ENFORCE_NOT_NULL( + id_var, + platform::errors::NotFound("No variable with name %s found.", id_name)); + auto id_tensor = id_var->GetMutable(); + auto it = scope.kids().begin(); + framework::Tensor cpu_id_tensor; + TensorCopySync(*id_tensor, platform::CPUPlace(), &cpu_id_tensor); + auto id_value = cpu_id_tensor.data(); + for (auto i = 0; i < *id_value; i++) { + it++; + } + if (it == scope.kids().end()) { + if (ToM) { + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* dst_var = 
dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", + x_name)); + auto dst_tensor = dst_var->GetMutable(); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + return; + } + auto source_scope = *it; + it++; + auto dst_scope = *it; + const std::string& x_name = Input("X"); + auto* source_var = source_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + source_var, + platform::errors::NotFound( + "No variable with name %s found in source scope.", x_name)); + auto* dst_var = dst_scope->FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + dst_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto src_tensor = source_var->GetMutable(); + auto dst_tensor = dst_var->GetMutable(); + TensorCopySync(*src_tensor, dst_tensor->place(), dst_tensor); + + if (ToM) { + auto* main_var = scope.FindVar(x_name); + PADDLE_ENFORCE_NOT_NULL( + main_var, + platform::errors::NotFound( + "No variable with name %s found in destination scope.", x_name)); + auto main_tensor = main_var->GetMutable(); + TensorCopySync(*dst_tensor, main_tensor->place(), main_tensor); + } + } +}; + +class CopyCrossScopeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), The first input tensor of copy_cross_scope op, which " + "is copying micro scope."); + AddInput("Id", + "(Tensor), The second input tensor of copy_cross_scope op, which " + "is a id of the current micro scope."); + AddAttr("to_main_scope", "Return current scope to main scope.") + .SetDefault(false); + AddAttr("num_micro_batches", "Number of micro batches for pipeline."); + AddComment(R"DOC( + This op is 
used by pipeline to copy tensors across micro batch scopes. + Copy the variable value of the giving Id's micro scope to the micro scope of Id + 1 position. + If need to copy back to the main scope, using to_main_scope option to copy the variable value of + the current micro scope to the main scope. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_WITHOUT_GRADIENT(copy_cross_scope, ops::CopyCrossScopeOp, + ops::CopyCrossScopeOpMaker); diff --git a/paddle/fluid/operators/copy_cross_scope_test.cc b/paddle/fluid/operators/copy_cross_scope_test.cc new file mode 100644 index 00000000000000..e175b235f9c181 --- /dev/null +++ b/paddle/fluid/operators/copy_cross_scope_test.cc @@ -0,0 +1,154 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/copy_cross_scope_op.cc" +#include "paddle/fluid/string/printf.h" + +#define Conn(x, y) x##y + +namespace f = paddle::framework; +namespace p = paddle::platform; + +USE_NO_KERNEL_OP(copy_cross_scope); + +template +void Compare1(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {1}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", false}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + std::list::const_iterator iter = scope->kids().begin(); + iter++; + iter++; + + auto* kid_scope = *iter; + auto* dst_var = kid_scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 1; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +template +void Compare2(f::Scope* scope, const p::DeviceContext& ctx, + std::string op_type) { + // init + auto var_x = scope->Var("tmp"); + auto x = var_x->GetMutable(); + std::vector main_x = {1.0}; + 
TensorFromVector(main_x, ctx, x); + + auto var_id = scope->Var("Id"); + auto id = var_id->GetMutable(); + std::vector main_id = {0}; + TensorFromVector(main_id, ctx, id); + for (int i = 0; i < 3; i++) { + auto& child_scope = scope->NewScope(); + auto child_var = child_scope.Var("tmp"); + auto tensor_x = child_var->GetMutable(); + std::vector init_x = {static_cast(i)}; + TensorFromVector(init_x, ctx, tensor_x); + } + + ctx.Wait(); + + // run + f::AttributeMap attrs = {{"to_main_scope", true}, {"num_micro_batches", 3}}; + std::map> output; + auto op = f::OpRegistry::CreateOp(op_type, {{"X", {"tmp"}}, {"Id", {"Id"}}}, + output, attrs); + + auto place = ctx.GetPlace(); + op->Run(*scope, place); + ctx.Wait(); + + auto* dst_var = scope->FindVar("tmp"); + auto* tensor_out = dst_var->GetMutable(); + + std::vector out_vec; + TensorToVector(*tensor_out, ctx, &out_vec); + + int expected = 0; + EXPECT_EQ(static_cast(out_vec[0]), expected); +} + +#ifdef PADDLE_WITH_CUDA +TEST(copy_cross_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, CUDA_fp32) { + f::Scope scope; + p::CUDADeviceContext ctx(p::CUDAPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#elif PADDLE_WITH_ASCEND_CL +TEST(copy_cross_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare1(&scope, ctx, "copy_cross_scope"); +} + +TEST(copy_cross_scope_to_main_scope, NPU_fp32) { + f::Scope scope; + p::NPUDeviceContext ctx(p::NPUPlace(0)); + Compare2(&scope, ctx, "copy_cross_scope"); +} +#endif diff --git a/paddle/fluid/operators/correlation_op.cu b/paddle/fluid/operators/correlation_op.cu index a51fce8132418b..9b08f875bb6e6d 100644 --- a/paddle/fluid/operators/correlation_op.cu +++ b/paddle/fluid/operators/correlation_op.cu @@ -12,17 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#ifndef PADDLE_WITH_HIP -// HIP not supported yet - #include #include #include "paddle/fluid/framework/op_registry.h" +#ifdef __HIPCC__ +#define __syncwarp() __all(1) +#endif + namespace paddle { namespace operators { +#ifdef __HIPCC__ +#define THREADS_PER_BLOCK 64 +#else #define THREADS_PER_BLOCK 32 +#endif #define FULL_MASK 0xffffffff using framework::Tensor; @@ -30,14 +35,22 @@ using framework::Tensor; template __forceinline__ __device__ T warpReduceSum(T val) { for (int offset = 16; offset > 0; offset /= 2) { +#ifdef __HIPCC__ + val += __shfl_down(val, offset); +#else val += __shfl_down_sync(FULL_MASK, val, offset); +#endif } return val; } template __forceinline__ __device__ T blockReduceSum(T val) { +#ifdef __HIPCC__ + static __shared__ T shared[64]; +#else static __shared__ T shared[32]; +#endif int lane = threadIdx.x % warpSize; int wid = threadIdx.x / warpSize; @@ -483,5 +496,3 @@ REGISTER_OP_CUDA_KERNEL(correlation, ops::CorrelationCUDAKernel, ops::CorrelationCUDAKernel); REGISTER_OP_CUDA_KERNEL(correlation_grad, ops::CorrelationCUDAGradKernel, ops::CorrelationCUDAGradKernel); - -#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/decode_jpeg_op.cc b/paddle/fluid/operators/decode_jpeg_op.cc new file mode 100644 index 00000000000000..e553b1076a8640 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // TODO(LieLinJiang): add cpu implement. + PADDLE_THROW(platform::errors::Unimplemented( + "DecodeJpeg op only supports GPU now.")); + } +}; + +class DecodeJpegOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "DecodeJpeg"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "DecodeJpeg"); + + auto mode = ctx->Attrs().Get("mode"); + std::vector out_dims; + + if (mode == "unchanged") { + out_dims = {-1, -1, -1}; + } else if (mode == "gray") { + out_dims = {1, -1, -1}; + } else if (mode == "rgb") { + out_dims = {3, -1, -1}; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU: ", mode)); + } + + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const { + if (var_name == "X") { + return expected_kernel_type; + } + + return 
framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } +}; + +class DecodeJpegOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "A one dimensional uint8 tensor containing the raw bytes " + "of the JPEG image. It is a tensor with rank 1."); + AddOutput("Out", "The output tensor of DecodeJpeg op"); + AddComment(R"DOC( +This operator decodes a JPEG image into a 3 dimensional RGB Tensor +or 1 dimensional Gray Tensor. Optionally converts the image to the +desired format. The values of the output tensor are uint8 between 0 +and 255. +)DOC"); + AddAttr( + "mode", + "(string, default \"unchanged\"), The read mode used " + "for optionally converting the image, can be \"unchanged\" " + ",\"gray\" , \"rgb\" .") + .SetDefault("unchanged"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + decode_jpeg, ops::DecodeJpegOp, ops::DecodeJpegOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(decode_jpeg, ops::CPUDecodeJpegKernel) diff --git a/paddle/fluid/operators/decode_jpeg_op.cu b/paddle/fluid/operators/decode_jpeg_op.cu new file mode 100644 index 00000000000000..35975a6a549867 --- /dev/null +++ b/paddle/fluid/operators/decode_jpeg_op.cu @@ -0,0 +1,138 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef PADDLE_WITH_HIP + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/dynload/nvjpeg.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/stream/cuda_stream.h" + +namespace paddle { +namespace operators { + +static cudaStream_t nvjpeg_stream = nullptr; +static nvjpegHandle_t nvjpeg_handle = nullptr; + +void InitNvjpegImage(nvjpegImage_t* img) { + for (int c = 0; c < NVJPEG_MAX_COMPONENT; c++) { + img->channel[c] = nullptr; + img->pitch[c] = 0; + } +} + +template +class GPUDecodeJpegKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Create nvJPEG handle + if (nvjpeg_handle == nullptr) { + nvjpegStatus_t create_status = + platform::dynload::nvjpegCreateSimple(&nvjpeg_handle); + + PADDLE_ENFORCE_EQ(create_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegCreateSimple failed: ", + create_status)); + } + + nvjpegJpegState_t nvjpeg_state; + nvjpegStatus_t state_status = + platform::dynload::nvjpegJpegStateCreate(nvjpeg_handle, &nvjpeg_state); + + PADDLE_ENFORCE_EQ(state_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegJpegStateCreate failed: ", + state_status)); + + int components; + nvjpegChromaSubsampling_t subsampling; + int widths[NVJPEG_MAX_COMPONENT]; + int heights[NVJPEG_MAX_COMPONENT]; + + auto* x = ctx.Input("X"); + auto* x_data = x->data(); + + nvjpegStatus_t info_status = platform::dynload::nvjpegGetImageInfo( + nvjpeg_handle, x_data, (size_t)x->numel(), &components, &subsampling, + widths, heights); + + PADDLE_ENFORCE_EQ( + info_status, NVJPEG_STATUS_SUCCESS, + platform::errors::Fatal("nvjpegGetImageInfo failed: ", info_status)); + + int width = widths[0]; + int height = heights[0]; + + nvjpegOutputFormat_t output_format; + int output_components; + + auto mode = 
ctx.Attr("mode"); + if (mode == "unchanged") { + if (components == 1) { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (components == 3) { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + } else if (mode == "gray") { + output_format = NVJPEG_OUTPUT_Y; + output_components = 1; + } else if (mode == "rgb") { + output_format = NVJPEG_OUTPUT_RGB; + output_components = 3; + } else { + platform::dynload::nvjpegJpegStateDestroy(nvjpeg_state); + PADDLE_THROW(platform::errors::Fatal( + "The provided mode is not supported for JPEG files on GPU")); + } + + nvjpegImage_t out_image; + InitNvjpegImage(&out_image); + + // create nvjpeg stream + if (nvjpeg_stream == nullptr) { + cudaStreamCreateWithFlags(&nvjpeg_stream, cudaStreamNonBlocking); + } + + int sz = widths[0] * heights[0]; + + auto* out = ctx.Output("Out"); + std::vector out_shape = {output_components, height, width}; + out->Resize(framework::make_ddim(out_shape)); + + T* data = out->mutable_data(ctx.GetPlace()); + + for (int c = 0; c < output_components; c++) { + out_image.channel[c] = data + c * sz; + out_image.pitch[c] = width; + } + + nvjpegStatus_t decode_status = platform::dynload::nvjpegDecode( + nvjpeg_handle, nvjpeg_state, x_data, x->numel(), output_format, + &out_image, nvjpeg_stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(decode_jpeg, ops::GPUDecodeJpegKernel) + +#endif diff --git a/paddle/fluid/operators/diag_embed_op.cu b/paddle/fluid/operators/diag_embed_op.cu index 2e03622e10f0f4..7e3ab6be664cb9 100644 --- a/paddle/fluid/operators/diag_embed_op.cu +++ b/paddle/fluid/operators/diag_embed_op.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the 
License. +#include +#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/diag_embed_op.h" diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index a2279e40623b4b..6a34ef48a169dc 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -167,6 +167,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { auto sign = (x_minux_y > static_cast(0)).template cast() * static_cast(1.0) + (x_minux_y < static_cast(0)).template cast() * static_cast(-1.0); + T epsilon = static_cast(1.0e-10f); // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { @@ -189,12 +190,14 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // dz = pow(abs(x-y)/out, p-1) * sign(x-y) * dout if (platform::is_cpu_place(context.GetPlace())) { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * sign.eval() * out_grad_t.broadcast(out_bcast_dims); } else { grad_t.device(place) = - (x_minux_y_abs / out_t.broadcast(out_bcast_dims)).pow(p - 1) * sign * - out_grad_t.broadcast(out_bcast_dims); + (x_minux_y_abs / (out_t + epsilon).broadcast(out_bcast_dims)) + .pow(p - 1) * + sign * out_grad_t.broadcast(out_bcast_dims); } } diff --git a/paddle/fluid/operators/dlnne/CMakeLists.txt b/paddle/fluid/operators/dlnne/CMakeLists.txt new file mode 100644 index 00000000000000..4fe9cf214eaa70 --- /dev/null +++ b/paddle/fluid/operators/dlnne/CMakeLists.txt @@ -0,0 +1,54 @@ +# compile flags +set(DLNNE_FLAGS + -Wno-error=non-virtual-dtor + -Wno-error=unused-variable + -Wno-error=attributes + ${fsanitize} +) +foreach(flag ${DLNNE_FLAGS}) + safe_set_cflag(CMAKE_C_FLAGS ${flag}) + safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) +endforeach() + + +# add nne +find_path(DLNNE_INCLUDE_DIR dlnne.h + PATHS + $ENV{SOFTWARE_SOURCE_DIR} $ENV{SOFTWARE_SOURCE_DIR}/driver/nne/include + 
NO_DEFAULT_PATH +) + +find_library(DLNNE_LIB libdlnne.so + PATHS + $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/driver/nne + NO_DEFAULT_PATH +) + +find_path(CUDA_INCLUDE_DIR cuda.h + $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/include +) + +find_library(CURT_LIB libcurt.so + PATHS + $ENV{SOFTWARE_BUILD_DIR} $ENV{SOFTWARE_BUILD_DIR}/llvm-project-10/cuda/lib + NO_DEFAULT_PATH +) + + +message("DLNNE_INCLUDE_DIR: "${DLNNE_INCLUDE_DIR}) +message("DLNNE_LIB: "${DLNNE_LIB}) +message("CUDA_INCLUDE_DIR: "${CUDA_INCLUDE_DIR}) +message("CURT_LIB: "${CURT_LIB}) + +include_directories("${DLNNE_INCLUDE_DIR}") +include_directories("${CUDA_INCLUDE_DIR}") + +op_library(dlnne_engine_op DEPS ${GLOB_OPERATOR_DEPS} framework_proto boost device_context op_registry scope) + +#message("PYBIND_FILE:${pybind_file}") +#file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(dlnne_engine);\n") +#endif() + +target_link_libraries(dlnne_engine_op ${DLNNE_LIB} ${CURT_LIB}) + +cc_test(test_dlnne_engine_op SRCS dlnne_engine_op_test.cc DEPS dlnne_engine_op analysis) diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op.cc new file mode 100644 index 00000000000000..4654e6a9f978a2 --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/dlnne/dlnne_engine_op.h" + +namespace paddle { +namespace inference { + +void CopyTensorDeviceToCpu(void* dst_ptr, void* src_ptr, int total_bytes) { + cudaDeviceSynchronize(); + cudaMemcpy(dst_ptr, src_ptr, total_bytes, cudaMemcpyDeviceToHost); + cudaDeviceSynchronize(); +} +void CopyTensorCpuToDevice(void* dst_ptr, void* src_ptr, int total_bytes) { + cudaDeviceSynchronize(); + cudaMemcpy(dst_ptr, src_ptr, total_bytes, cudaMemcpyHostToDevice); + cudaDeviceSynchronize(); +} + +} // namespace inference + +namespace operators { + +class DlnneEngineOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Xs", "A list of inputs.").AsDuplicable(); + AddOutput("Ys", "A list of outputs").AsDuplicable(); + AddAttr("subgraph", "the subgraph."); + AddAttr( + "engine_key", + "The engine_key here is used to distinguish different DLNNE Engines"); + AddAttr("sub_block", "the trt block"); + AddComment("Dlnne engine operator."); + } +}; + +class DlnneEngineInferVarType : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext* ctx) const override {} +}; + +} // namespace operators +} // namespace paddle +namespace ops = paddle::operators; + +REGISTER_OPERATOR(dlnne_engine, ops::DlnneEngineOp, ops::DlnneEngineOpMaker); diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op.h b/paddle/fluid/operators/dlnne/dlnne_engine_op.h new file mode 100644 index 00000000000000..d426876c18fa5e --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op.h @@ -0,0 +1,351 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include // NOTLINT +#include // NOTLINT +#include // NOTLINT + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/inference/analysis/helper.h" + +namespace dl { +namespace nne { +class Builder; +class Engine; +class Network; +class Parser; +class ExecutionContext; +} // namespace nne +} // namespace dl + +namespace paddle { +namespace inference { +class NneDeleter { + public: + NneDeleter() {} + + template + inline void operator()(T *ptr) { + if (ptr != nullptr) { + ptr->Destroy(); + } + } +}; + +void CopyTensorDeviceToCpu(void *dst_ptr, void *src_ptr, int total_bytes); + +void CopyTensorCpuToDevice(void *dst_ptr, void *src_ptr, int total_bytes); + +template +struct Singleton; +} // namespace inference +} // namespace paddle + +namespace paddle { + +namespace operators { + +class DlnneEngineOp : public framework::OperatorBase { + private: + std::vector input_names_; + std::unordered_set param_names_; + std::string engine_key_; + int num_inputs; + int num_outputs; + std::vector output_names; + std::vector input_names; + + dl::nne::Builder *builder; + dl::nne::Parser *parser; + dl::nne::Network *network; + dl::nne::ExecutionContext *context; + dl::nne::Engine *engine; + + unsigned int engine_input_size; + std::vector InputIndexToBindIndex_; + + public: + DlnneEngineOp(const std::string &type, + const 
framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) { + input_names_ = Inputs("Xs"); + engine_key_ = Attr("engine_key"); + auto params = Attr>("parameters"); + for (const auto ¶m : params) { + param_names_.insert(param); + } + + num_inputs = 0; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + num_inputs += 1; + input_names.push_back(x); + } + + num_outputs = Outputs("Ys").size(); + for (const auto &y : Outputs("Ys")) { + VLOG(4) << "y: " << y << std::endl; + output_names.push_back(y); + } + + // onnx path + std::stringstream filename; + std::string current_path = "."; + char *buffer; + if ((buffer = getcwd(NULL, 0)) != NULL) { + current_path = buffer; + } else { + current_path = "."; + } + filename << current_path << "/dump/" << engine_key_ << "/" << engine_key_ + << ".onnx"; + + builder = dl::nne::CreateInferBuilder(); + PADDLE_ENFORCE_NE(builder, nullptr, platform::errors::Unavailable( + "nne create builder failed")); + parser = dl::nne::CreateParser(); + PADDLE_ENFORCE_NE(parser, nullptr, platform::errors::Unavailable( + "nne create parser failed")); + + network = builder->CreateNetwork(); + + LOG(INFO) << "set output for dlnne"; + for (std::string &output_op_name : output_names) + parser->RegisterOutput(output_op_name.c_str()); + + LOG(INFO) << "parser onnx for dlnne"; + parser->Parse(filename.str().c_str(), *network); + + LOG(INFO) << "build network"; + engine = builder->BuildEngine(*network); + + // total size = input_size+output_size + engine_input_size = num_inputs + num_outputs; + for (std::string &input_name : input_names) { + int BindIndex = engine->GetBindingIndex(input_name.c_str()); + InputIndexToBindIndex_.push_back(BindIndex); + } + + for (std::string &output_name : output_names) { + int BindIndex = engine->GetBindingIndex(output_name.c_str()); + InputIndexToBindIndex_.push_back(BindIndex); + } 
+ + // context + context = engine->CreateExecutionContext(); + } + + ~DlnneEngineOp() { + network->Destroy(); + context->Destroy(); + engine->Destroy(); + parser->Destroy(); + builder->Destroy(); + } + + protected: + void RunDlnneOnCreateEngine(const framework::Scope &scope, + const platform::Place &dev_place) const { + PADDLE_ENFORCE_EQ( + input_names_.empty(), false, + platform::errors::PreconditionNotMet( + "Dlnne engine needs at least one input, but no input is found. " + "Please check if you set the input correctly.")); + + std::vector input_buffers(num_inputs); + std::vector cpu_input_buffers(num_inputs); + std::vector> input_shapes(num_inputs); + std::vector input_data_types(num_inputs); + std::vector input_bytes(num_inputs); + + int index = 0; + for (const auto &x : Inputs("Xs")) { + if (param_names_.count(x)) continue; + // convert input and copy to Dlnne engine's buffer + auto &t = + inference::analysis::GetFromScope(scope, x); + + const int bind_index = index; + index++; + int64_t data_bytes; + int32_t dtype; + auto type = t.type(); + data_bytes = 1; + void *buffer = nullptr; + if (type == framework::proto::VarType::FP32) { + buffer = static_cast(t.data()); + data_bytes = 4; + dtype = 0; + } else if (type == framework::proto::VarType::INT64) { + buffer = static_cast(t.data()); + data_bytes = 8; + dtype = 1; + } else if (type == framework::proto::VarType::INT32) { + buffer = static_cast(t.data()); + data_bytes = 4; + dtype = 2; + } else { + PADDLE_THROW(platform::errors::Fatal( + "The DLNNE Engine OP only support float/int32_t/int64_t input.")); + } + input_buffers[bind_index] = buffer; + + auto t_shape = framework::vectorize(t.dims()); + std::vector runtime_input_shape(t_shape.begin(), t_shape.end()); + for (auto &size : t_shape) { + data_bytes = data_bytes * size; + } + + VLOG(4) << "buffers_size:" << data_bytes; + cpu_input_buffers[bind_index] = + input_buffers[bind_index]; // malloc(data_bytes); + input_shapes[bind_index] = runtime_input_shape; + 
input_data_types[bind_index] = dtype; + input_bytes[bind_index] = data_bytes; + } + + // output shape + std::vector> out_shapes; + std::vector output_bytes; + for (int i = 0; i < num_outputs; i++) { + int index = engine->GetBindingIndex(output_names[i].c_str()); + dl::nne::Dims out_dim = engine->GetBindingDimensions(index); + std::vector shape(out_dim.nbDims); + for (int dim = 0; dim < out_dim.nbDims; dim++) { + shape[dim] = (out_dim.d[dim]); + } + + out_shapes.push_back(shape); + int64_t data_bytes; + + // float32 + data_bytes = 4; + for (auto &size : shape) { + data_bytes = data_bytes * size; + } + VLOG(4) << "data_bytes: " << data_bytes; + output_bytes.push_back(data_bytes); + } + + int bind_index = 0; + std::vector cpu_output_buffers(num_outputs); + std::vector output_buffers(num_outputs); + std::vector output_dtypes(num_outputs); + + for (const auto &y : Outputs("Ys")) { + auto *fluid_v = scope.FindVar(y); + PADDLE_ENFORCE_NOT_NULL( + fluid_v, + platform::errors::NotFound( + "Output variable %s is not found in DLNNE subgraph.", y)); + + auto *fluid_t = fluid_v->GetMutable(); + + VLOG(4) << "out_shapes[bind_index] dim:" << out_shapes[bind_index].size(); + fluid_t->Resize(framework::make_ddim(out_shapes[bind_index])); + + int32_t dtype; + output_buffers[bind_index] = fluid_t->mutable_data( + BOOST_GET_CONST(platform::CPUPlace, dev_place)); + dtype = 0; + cpu_output_buffers[bind_index] = + output_buffers[bind_index]; // malloc(data_bytes); + output_dtypes[bind_index] = dtype; + bind_index++; + } + + std::vector engine_input_ptr(engine_input_size); + + // set input_ptr + for (unsigned int i = 0; i < engine_input_size; i++) { + if (InputIndexToBindIndex_[i] < 0) continue; + + if (engine->BindingIsInput(InputIndexToBindIndex_[i])) { + // copy cpu buffer to gpu buffer + int64_t total_bytes; + total_bytes = input_bytes[i]; + VLOG(4) << "input_bytes: " << total_bytes; + + void *gpu_ptr; + cudaMalloc(&gpu_ptr, total_bytes); + engine_input_ptr[InputIndexToBindIndex_[i]] 
= gpu_ptr; + + paddle::inference::CopyTensorCpuToDevice( + gpu_ptr, reinterpret_cast(cpu_input_buffers[i]), + total_bytes); + + } else { + int64_t total_size; + total_size = output_bytes[i - input_names.size()]; + VLOG(4) << "output_bytes: " << total_size; + void *gpu_ptr; + cudaMalloc(&gpu_ptr, total_size); + engine_input_ptr[InputIndexToBindIndex_[i]] = gpu_ptr; + } + } + + clock_t startTime, endTime; + startTime = clock(); + context->Execute(1, engine_input_ptr.data()); + endTime = clock(); + double during_ms = + static_cast(endTime - startTime) / CLOCKS_PER_SEC * 1000; + LOG(INFO) << "dlNNE execute time: " << during_ms << " ms"; + + bind_index = 0; + for (unsigned int i = 0; i < engine_input_size; i++) { + if (InputIndexToBindIndex_[i] < 0) continue; + + if (i >= input_names.size()) { + void *cpu_ptr = cpu_output_buffers[i - input_names.size()]; + int64_t size; + size = output_bytes[i - input_names.size()]; + paddle::inference::CopyTensorDeviceToCpu( + cpu_ptr, engine_input_ptr[InputIndexToBindIndex_[i]], size); + // dtype: float32 + int32_t dtypes; + dtypes = 0; + + cpu_output_buffers[bind_index] = cpu_ptr; + output_dtypes[bind_index] = dtypes; + bind_index++; + } + cudaFree(engine_input_ptr[InputIndexToBindIndex_[i]]); + } + } + + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + RunDlnneOnCreateEngine(scope, dev_place); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc new file mode 100644 index 00000000000000..caf1a80fcc737f --- /dev/null +++ b/paddle/fluid/operators/dlnne/dlnne_engine_op_test.cc @@ -0,0 +1,237 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/dlnne/dlnne_engine_op.h" +#include +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/inference/analysis/helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" +#include "paddle/fluid/operators/tensorrt/tensorrt_engine_op.h" + +USE_NO_KERNEL_OP(dlnne_engine); +namespace paddle { +namespace operators { + +namespace { +void CreateCUDATensor(framework::Scope* scope, const std::string& name, + const std::vector& shape) { + auto* var = scope->Var(name); + auto* tensor = var->GetMutable(); + auto dims = framework::make_ddim(shape); + tensor->Resize(dims); + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + inference::tensorrt::RandomizeTensor(tensor, place, ctx); +} + +void AddTensorToBlockDesc(framework::proto::BlockDesc* block, + const std::string& name, + const std::vector& shape) { + using framework::proto::VarType; + auto* var = block->add_vars(); + framework::VarDesc desc(name); + desc.SetType(VarType::LOD_TENSOR); + desc.SetDataType(VarType::FP32); + desc.SetShape(shape); + *var = *desc.Proto(); +} + +} // namespace + +using inference::analysis::SetAttr; + +TEST(DlnneEngineOp, manual) { + framework::ProgramDesc program; + auto* block_ = 
program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + LOG(INFO) << "create fc op"; + auto* fc0 = block_desc.AppendOp(); + fc0->SetType("fc"); + fc0->SetInput("X", std::vector({"x"})); // 4 x 1 x 1 + fc0->SetInput("Y", std::vector({"y"})); // 4 x 6 + fc0->SetOutput("Out", std::vector({"z"})); // 6 x 1 x 1 + + LOG(INFO) << "create fc op"; + auto* fc1 = block_desc.AppendOp(); + fc1->SetType("fc"); + fc1->SetInput("X", std::vector({"z"})); + fc1->SetInput("Y", std::vector({"y0"})); // 6 x 8 + fc1->SetOutput("Out", std::vector({"z0"})); // 8 x 1 x 1 + + // Set inputs' variable shape in BlockDesc + // the batch size is 2, so the dims of 'x' is {2, 4, 1, 1} + AddTensorToBlockDesc(block_, "x", std::vector({2, 4, 1, 1})); + AddTensorToBlockDesc(block_, "y", std::vector({4, 6})); + AddTensorToBlockDesc(block_, "y0", std::vector({6, 8})); + AddTensorToBlockDesc(block_, "z", std::vector({2, 6})); + + // It is wired, need to copy manually. 
+ *block_->add_ops() = *fc0->Proto(); + *block_->add_ops() = *fc1->Proto(); + + ASSERT_EQ(block_->ops_size(), 2); + + LOG(INFO) << "create dlnne desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("dlnne_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x"})); + engine_op_desc.SetOutput("Ys", std::vector({"z0"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(2)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", std::vector({})); + engine_op_desc.SetAttr("engine_key", std::string("a_engine")); + engine_op_desc.SetAttr("calibration_engine_key", + std::string("a_calib_engine")); + engine_op_desc.SetAttr("predictor_id", 1); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); + engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z0"})); + engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); + int device_id = 0; + engine_op_desc.SetAttr("gpu_id", device_id); + + LOG(INFO) << "create engine op"; + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); + LOG(INFO) << "engine_op " << engine_op.get(); + + framework::Scope scope; + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + // Prepare variables. + CreateCUDATensor(&scope, "x", std::vector({2, 4})); + CreateCUDATensor(&scope, "y", std::vector({4, 6})); + CreateCUDATensor(&scope, "z", std::vector({2, 6})); + + CreateCUDATensor(&scope, "y0", std::vector({6, 8})); + CreateCUDATensor(&scope, "z0", std::vector({2, 8})); + + // Execute them. 
+ LOG(INFO) << "engine_op run"; + engine_op->Run(scope, place); +} + +void Execute(int batch_size, int input_dim, int output_dim, int nlayers = 1) { + framework::ProgramDesc program; + framework::Scope scope; + platform::CUDAPlace place; + platform::CUDADeviceContext ctx(place); + + auto* block_ = program.Proto()->add_blocks(); + block_->set_idx(0); + block_->set_parent_idx(-1); + + using shape_t = std::vector; + + LOG(INFO) << "create block desc"; + framework::BlockDesc block_desc(&program, block_); + + auto AddFCLayer = [&](const std::string& x_name, const std::string& y_name, + const std::string& z_name, bool x_created, + const shape_t& x_shape, const shape_t& y_shape, + const shape_t& z_shape) { + LOG(INFO) << "create fc op"; + auto* fc = block_desc.AppendOp(); + fc->SetType("mul"); + fc->SetInput("X", std::vector({x_name})); + fc->SetInput("Y", std::vector({y_name})); + fc->SetOutput("Out", std::vector({z_name})); + + // Set inputs' variable shape in BlockDesc + if (!x_created) { + AddTensorToBlockDesc(block_, x_name, + std::vector({batch_size, input_dim, 1, 1})); + } + AddTensorToBlockDesc(block_, y_name, + std::vector({input_dim, output_dim})); + AddTensorToBlockDesc(block_, z_name, + std::vector({batch_size, output_dim})); + + // Prepare variables. + if (!x_created) { + CreateCUDATensor(&scope, x_name, std::vector(x_shape)); + } + CreateCUDATensor(&scope, y_name, std::vector(y_shape)); + CreateCUDATensor(&scope, z_name, std::vector(z_shape)); + + // It is wired, need to copy manually. 
+ *block_->add_ops() = *fc->Proto(); + }; + + // Test with 4 layer FC + AddFCLayer("x0", "y0", "z0", false, {batch_size, input_dim}, + {input_dim, output_dim}, {batch_size, output_dim}); + AddFCLayer("z0", "y1", "z1", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z1", "y2", "z2", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + AddFCLayer("z2", "y3", "z3", true, {}, {output_dim, output_dim}, + {batch_size, output_dim}); + + LOG(INFO) << "create dlnne desc"; + framework::OpDesc engine_op_desc(nullptr); + engine_op_desc.SetType("dlnne_engine"); + engine_op_desc.SetInput("Xs", std::vector({"x0"})); + engine_op_desc.SetOutput("Ys", std::vector({"z3"})); + + engine_op_desc.SetBlockAttr("sub_block", &block_desc); + engine_op_desc.SetAttr("max_batch_size", static_cast(batch_size)); + engine_op_desc.SetAttr("workspace_size", static_cast(1 << 20)); + engine_op_desc.SetAttr("parameters", + std::vector({"y0", "y1", "y2", "y3"})); + engine_op_desc.SetAttr("engine_key", std::string("b_engine")); + engine_op_desc.SetAttr("calibration_engine_key", + std::string("b_calib_engine")); + engine_op_desc.SetAttr("predictor_id", 1); + engine_op_desc.SetAttr("calibration_data", std::string("")); + engine_op_desc.SetAttr("enable_int8", static_cast(false)); + engine_op_desc.SetAttr("enable_fp16", static_cast(false)); + engine_op_desc.SetAttr("use_calib_mode", static_cast(false)); + engine_op_desc.SetAttr("output_name_mapping", + std::vector({"z3"})); + engine_op_desc.SetAttr("origin_output_dims", std::vector({2})); + engine_op_desc.SetAttr("subgraph", std::string(block_->SerializeAsString())); + engine_op_desc.SetAttr("engine_serialized_data", std::string("")); + int device_id = 0; + engine_op_desc.SetAttr("gpu_id", device_id); + + auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); + + // Execute them. + engine_op->Run(scope, place); +} + +// Test with a larger FC layer. 
+TEST(DlnneEngineOp, fc) { Execute(40, 28, 28); } + +} // namespace operators +} // namespace paddle + +USE_TRT_CONVERTER(fc) diff --git a/paddle/fluid/operators/dot_op.h b/paddle/fluid/operators/dot_op.h index 1b607922eda1d8..0987118ba39b6e 100644 --- a/paddle/fluid/operators/dot_op.h +++ b/paddle/fluid/operators/dot_op.h @@ -205,35 +205,25 @@ struct DotGradFunction> { } } #else - const auto* data_dout = tensor_dout->data(); + auto const *x = tensor_x->data(), *y = tensor_y->data(), + *dz = tensor_dout->data(); + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; if (tensor_dx) { - auto* data_dx = tensor_dx->mutable_data(ctx.GetPlace()); - const auto* data_y = tensor_y->data(); - const framework::DDim& dim = tensor_x->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dx[i] = data_y[i] * data_dout[s]; + auto* dx = tensor_dx->mutable_data(ctx.GetPlace()); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; ++i) *dx++ = *y++ * ss; } } if (tensor_dy) { - auto* data_dy = tensor_dy->mutable_data(ctx.GetPlace()); - const auto* data_x = tensor_x->data(); - const framework::DDim& dim = tensor_y->dims(); - size_t N = static_cast(framework::product(dim)); - - auto step = dim[dim.size() - 1]; - - int s = -1; - for (size_t i = 0; i < N; ++i) { - if (0 == i % step) ++s; - data_dy[i] = data_x[i] * data_dout[s]; + auto* dy = tensor_dy->mutable_data(ctx.GetPlace()); + for (auto j = 0; j < N / B; ++j) { + auto const ss = dz[j]; + for (auto i = 0; i < B; i++) *dy++ = *x++ * ss; } } #endif @@ -266,21 +256,20 @@ class DotKernel : public framework::OpKernel { out.device(dev) = (x * y).sum(Eigen::DSizes(1)); } #else - const auto* data_x = tensor_x->data(); - const auto* data_y = tensor_y->data(); - auto* data_out = tensor_out->data(); - - auto x_dims = tensor_x->dims(); - auto 
step = x_dims[x_dims.size() - 1]; - int size = static_cast(framework::product(x_dims)); - - for (int ind = -1, j = 0; j < size; ++j) { - if (j % step == 0) { - ++ind; - data_out[ind] = data_x[j] * data_y[j]; - } else { - data_out[ind] += data_x[j] * data_y[j]; - } + auto const *x = tensor_x->data(), *x_ = &x[0]; + auto const *y = tensor_y->data(), *y_ = &y[0]; + auto* z = tensor_out->data(); + + // Loop over the total N elements of both operands while sum-reducing every + // B pairs along the way where B is the dimension of the least ordered axis + auto&& d = tensor_x->dims(); + auto const N = tensor_x->numel(); + auto const B = d[d.size() - 1]; + + for (int j = 0; j < N / B; j++) { + T ss = 0; + for (int i = 0; i < B; i++) ss += (*x_++) * (*y_++); + z[j] = ss; } #endif } diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 0ca03fc32fbf67..5c444e752e7975 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" @@ -34,7 +33,9 @@ namespace operators { */ template struct CudaAddFunctor { - inline HOSTDEVICE T operator()(T args[]) const { return args[0] + args[1]; } + __device__ __forceinline__ T operator()(const T* args) const { + return args[0] + args[1]; + } }; template diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 5b8d08a8943dde..3768748931ded2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -100,9 +100,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { - ctx.template device_context() - .Wait(); - framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx); + framework::TensorCopy( + *tmp_dout, ctx.GetPlace(), + ctx.template device_context(), dx); } } @@ -127,8 +127,6 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { {{"axes", axes}, {"keep_dims", false}}); runner.Run(stream); tmp_dout = &reduced_dout; - ctx.template device_context() - .Wait(); } // stage 2 @@ -144,9 +142,9 @@ class ElementwiseAddGradNPUKernel : public framework::OpKernel { {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { - ctx.template device_context() - .Wait(); - framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dy); + framework::TensorCopy( + *tmp_dout, ctx.GetPlace(), + ctx.template device_context(), dy); } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 36add2112974dc..321826ec647c99 
100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -13,6 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" + +#ifdef __HIPCC__ +#define ELEMENTWISE_BLOCK_SIZE 256 +#else +#define ELEMENTWISE_BLOCK_SIZE 512 +#endif + namespace paddle { namespace operators { @@ -90,8 +101,7 @@ struct ElementwiseDataWrapper { template __device__ void VectorizedKernelImpl( - ElementwiseDataWrapper data, int size, Functor func, - int tid) { + ElementwiseDataWrapper data, Functor func, int tid) { using VecType = CudaAlignedVector; VecType ins_vec[ET]; VecType out_vec; @@ -121,10 +131,9 @@ __device__ void VectorizedKernelImpl( data.store_vector(out_vec, tid); } -template -__device__ void ScalarKernelImpl(ElementwiseDataWrapper data, - int size, Functor func, int start, - int remain) { +template +__device__ void ScalarKernelImpl(ElementwiseDataWrapper data, + Functor func, int start, int remain) { T ins[ET]; T out; @@ -146,12 +155,11 @@ __global__ void VectorizedKernel(const T *__restrict__ in0, int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = size - VecSize * tid; remain = remain > 0 ? 
remain : 0; + auto data = ElementwiseDataWrapper(out, in0, in1); if (remain >= VecSize) { - auto data = ElementwiseDataWrapper(out, in0, in1); - VectorizedKernelImpl(data, size, func, tid); + VectorizedKernelImpl(data, func, tid); } else { - auto data = ElementwiseDataWrapper(out, in0, in1); - ScalarKernelImpl(data, size, func, tid * VecSize, remain); + ScalarKernelImpl(data, func, tid * VecSize, remain); } } @@ -162,7 +170,7 @@ __global__ void ScalarKernel(const T *__restrict__ in0, auto data = ElementwiseDataWrapper(out, in0, in1); int tid = blockIdx.x * blockDim.x + threadIdx.x; int remain = tid < size ? 1 : 0; - ScalarKernelImpl(data, size, func, tid, remain); + ScalarKernelImpl(data, func, tid, remain); } template @@ -173,7 +181,7 @@ void LaunchElementwiseCudaKernel( // calculate the max vec_size for all ins and outs auto size = ins[0]->numel(); int vec_size = GetVectorizedSize(ins, *outs); - int block_size = PADDLE_CUDA_THREAD_SIZE; + int block_size = ELEMENTWISE_BLOCK_SIZE; int grid_size = ((size + vec_size - 1) / vec_size + block_size - 1) / block_size; const T *in0 = ins[0]->data(); diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index df6fae6c8484a0..f06dbd26873a60 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -38,7 +38,7 @@ USE_OP(elementwise_sub); USE_OP_DEVICE_KERNEL(elementwise_sub, NPU); template -void Compare(f::Scope* scope, const p::DeviceContext& ctx, +void Compare(f::Scope *scope, const p::DeviceContext &ctx, std::string op_type) { // init auto x = scope->Var("X"); @@ -62,8 +62,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, TensorFromVector(init_y, ctx, tensor_y); tensor_y->Resize({10, 10}); - ctx.Wait(); - auto place = ctx.GetPlace(); auto out = scope->Var("Out"); auto tensor_out = out->GetMutable(); @@ -74,7 +72,6 @@ void Compare(f::Scope* 
scope, const p::DeviceContext& ctx, {{"Out", {"Out"}}}, attrs); op->Run(*scope, place); - ctx.Wait(); std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); @@ -93,7 +90,7 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, } template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, +void CompareGrad(f::Scope *scope, const p::DeviceContext &ctx, std::string op_type) { // init auto dout = scope->Var("DOut"); @@ -122,8 +119,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, TensorFromVector(init_dout, ctx, tensor_dout); tensor_dout->Resize({2, 3, 5}); - ctx.Wait(); - // run f::AttributeMap attrs; auto op = f::OpRegistry::CreateOp( @@ -132,7 +127,6 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, auto place = ctx.GetPlace(); op->Run(*scope, place); - ctx.Wait(); std::vector dx_vec; TensorToVector(*tensor_dx, ctx, &dx_vec); @@ -160,30 +154,30 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, TEST(elementwise_add, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_add"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_add"); } TEST(elementwise_sub, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_sub"); } TEST(elementwise_sub, NPU_fp16) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "elementwise_sub"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "elementwise_sub"); } TEST(elementwise_sub_grad, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "elementwise_sub_grad"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, 
"elementwise_sub_grad"); } TEST(elementwise_add_grad, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "elementwise_add_grad"); + auto *ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, "elementwise_add_grad"); } diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 809445c2862035..a6e438f8016e0c 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -102,7 +102,9 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { {{"axes", axes}, {"keep_dims", true}}); runner.Run(stream); } else { - framework::TensorCopySync(*tmp_dout, ctx.GetPlace(), dx); + framework::TensorCopy( + *tmp_dout, ctx.GetPlace(), + ctx.template device_context(), dx); } } if (dy) { diff --git a/paddle/fluid/operators/expand_as_op.cc b/paddle/fluid/operators/expand_as_op.cc index 25b83ed93f7296..e2bf61de63196b 100644 --- a/paddle/fluid/operators/expand_as_op.cc +++ b/paddle/fluid/operators/expand_as_op.cc @@ -147,3 +147,17 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandAsGradKernel, ops::ExpandAsGradKernel, ops::ExpandAsGradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + expand_as, ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel, + ops::ExpandAsKernel); +REGISTER_OP_CUDA_KERNEL( + expand_as_grad, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel, + ops::ExpandAsGradKernel); +#endif diff --git a/paddle/fluid/operators/expand_as_op.cu b/paddle/fluid/operators/expand_as_op.cu deleted file mode 100755 index dbb1fcf3ab3261..00000000000000 --- a/paddle/fluid/operators/expand_as_op.cu +++ /dev/null @@ -1,25 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/expand_as_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - expand_as, ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel, - ops::ExpandAsKernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_grad, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel, - ops::ExpandAsGradKernel); diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 70099afbd5994d..5296a144f6247d 100644 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -129,3 +129,18 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandAsV2GradKernel, ops::ExpandAsV2GradKernel, ops::ExpandAsV2GradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + expand_as_v2, + ops::ExpandAsV2Kernel, + ops::ExpandAsV2Kernel, + ops::ExpandAsV2Kernel, + ops::ExpandAsV2Kernel, + ops::ExpandAsV2Kernel); +REGISTER_OP_CUDA_KERNEL( + expand_as_v2_grad, + ops::ExpandAsV2GradKernel, + ops::ExpandAsV2GradKernel, + ops::ExpandAsV2GradKernel, + ops::ExpandAsV2GradKernel); +#endif diff --git a/paddle/fluid/operators/expand_as_v2_op.cu b/paddle/fluid/operators/expand_as_v2_op.cu deleted file mode 100644 index e315144472dd9f..00000000000000 --- a/paddle/fluid/operators/expand_as_v2_op.cu +++ /dev/null @@ -1,26 +0,0 @@ -/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/expand_as_v2_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - expand_as_v2, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel, - ops::ExpandAsV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_as_v2_grad, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel, - ops::ExpandAsV2GradKernel); diff --git a/paddle/fluid/operators/expand_op.cc b/paddle/fluid/operators/expand_op.cc index 83e205367a7af6..e7da08ff277117 100644 --- a/paddle/fluid/operators/expand_op.cc +++ b/paddle/fluid/operators/expand_op.cc @@ -273,3 +273,21 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandGradKernel, ops::ExpandGradKernel, ops::ExpandGradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + expand, ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel, + ops::ExpandKernel); +REGISTER_OP_CUDA_KERNEL( + expand_grad, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel, + ops::ExpandGradKernel); +#endif diff --git a/paddle/fluid/operators/expand_op.cu b/paddle/fluid/operators/expand_op.cu deleted file mode 100644 index f2f8e2f7414f38..00000000000000 --- a/paddle/fluid/operators/expand_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/expand_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - expand, ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel, - ops::ExpandKernel); -REGISTER_OP_CUDA_KERNEL( - expand_grad, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel, - ops::ExpandGradKernel); diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index f4ae1785b024f5..bb3a6512d2c8ba 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -58,9 +58,11 @@ class ExpandNPUKernel : public framework::OpKernel { expand_times.size(), static_cast(in_dims.size()))); auto* out0 = context.Output("Out"); framework::DDim out_dims(in_dims); + for (size_t i = 0; i < expand_times.size(); ++i) { out_dims[i] *= expand_times[i]; } + out0->Resize(out_dims); out0->mutable_data(context.device_context().GetPlace()); auto runner = @@ -77,6 +79,7 @@ class ExpandNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( expand, ops::ExpandNPUKernel, + ops::ExpandNPUKernel, ops::ExpandNPUKernel); diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index 95f7865a8a3a4e..880eb341f2093b 100644 --- 
a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -69,6 +69,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(expand, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index 05ab0f6c8dc8fc..618c1560c5eac7 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -278,3 +278,21 @@ REGISTER_OP_CPU_KERNEL( ops::ExpandV2GradKernel, ops::ExpandV2GradKernel, ops::ExpandV2GradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + expand_v2, ops::ExpandV2Kernel, + ops::ExpandV2Kernel, + ops::ExpandV2Kernel, + ops::ExpandV2Kernel, + ops::ExpandV2Kernel, + ops::ExpandV2Kernel); +REGISTER_OP_CUDA_KERNEL( + expand_v2_grad, + ops::ExpandV2GradKernel, + ops::ExpandV2GradKernel, + ops::ExpandV2GradKernel, + ops::ExpandV2GradKernel, + ops::ExpandV2GradKernel); +#endif diff --git a/paddle/fluid/operators/expand_v2_op.cu b/paddle/fluid/operators/expand_v2_op.cu deleted file mode 100644 index e096dbc27f0c2a..00000000000000 --- a/paddle/fluid/operators/expand_v2_op.cu +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include "paddle/fluid/operators/expand_v2_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - expand_v2, ops::ExpandV2Kernel, - ops::ExpandV2Kernel, - ops::ExpandV2Kernel, - ops::ExpandV2Kernel, - ops::ExpandV2Kernel, - ops::ExpandV2Kernel); -REGISTER_OP_CUDA_KERNEL( - expand_v2_grad, - ops::ExpandV2GradKernel, - ops::ExpandV2GradKernel, - ops::ExpandV2GradKernel, - ops::ExpandV2GradKernel, - ops::ExpandV2GradKernel); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index caa29309901932..f35d8b6bbf89f1 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -154,6 +154,7 @@ REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, + ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 4608f167548a38..17c7321122b174 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -105,7 +105,8 @@ class FillConstantKernel : public framework::OpKernel { int actual_place = place_type; if (actual_place == -1) { - bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); + bool cpu_place = (force_cpu || ctx.GetPlace() == platform::CPUPlace() || + data_type == framework::proto::VarType::BF16); if (cpu_place) { actual_place = 0; } else if (platform::is_gpu_place(ctx.GetPlace())) { @@ -116,6 +117,9 @@ class FillConstantKernel : public framework::OpKernel { } if (actual_place == 0) { + VLOG(4) << "[CPU] FillConstantKernel" + << ((data_type == framework::proto::VarType::BF16) ? 
"" + : ""); tensor->mutable_data(platform::CPUPlace(), data_type); math::SetConstant functor; functor(reinterpret_cast(dev_ctx), diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index 9d5499e00c82f6..4ea4c11c478357 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -65,7 +65,7 @@ class FillConstantNPUKernel : public framework::OpKernel { Tensor tensor_tmp(data_type); tensor_tmp.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{value}, ctx.device_context(), &tensor_tmp); + FillNpuTensorWithConstant(&tensor_tmp, value); out_var->mutable_data(shape, place); auto runner = NpuOpRunner("FillD", {tensor_tmp}, {*out_var}, diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 1b2f1db1b07cdd..efcb0cbe2e2a8d 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -120,23 +120,9 @@ template class FlattenContiguousRangeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - auto &start_axis = context.Attr("start_axis"); - auto &stop_axis = context.Attr("stop_axis"); - auto *in = context.Input("X"); - auto x_dims = in->dims(); - int in_dims_size = x_dims.size(); - int real_start_axis = start_axis, real_stop_axis = stop_axis; - if (start_axis < 0) { - real_start_axis = start_axis + in_dims_size; - } - if (stop_axis < 0) { - real_stop_axis = stop_axis + in_dims_size; - } auto *out = context.Output("Out"); - - auto out_dims = framework::make_ddim( - GetOutputShape(real_start_axis, real_stop_axis, x_dims)); + auto out_dims = out->dims(); out->mutable_data(context.GetPlace(), in->type()); framework::TensorCopy( @@ -144,27 +130,6 @@ class FlattenContiguousRangeKernel : public framework::OpKernel { context.template device_context(), out); out->Resize(out_dims); } - static std::vector GetOutputShape(const 
int start_axis, - const int stop_axis, - const framework::DDim &in_dims) { - int64_t outer = 1; - std::vector out_shape; - int in_dims_size = in_dims.size(); - out_shape.reserve(in_dims_size - stop_axis + start_axis); - - for (int i = 0; i < start_axis; ++i) { - out_shape.push_back(in_dims[i]); - } - for (int i = start_axis; i <= stop_axis; i++) { - outer *= in_dims[i]; - } - out_shape.push_back(outer); - for (int i = stop_axis + 1; i < in_dims_size; i++) { - out_shape.push_back(in_dims[i]); - } - - return out_shape; - } }; template diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 287827ced5115e..104298e037319c 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -32,8 +32,7 @@ if (WITH_GPU OR WITH_ROCM) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_batch_norm_act);\n") endif() # conv_fusion_op needs cudnn 7 above - # HIP not support cudnnConvolutionBiasActivationForward - if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 7100)) + if (NOT ${CUDNN_VERSION} VERSION_LESS 7100) op_library(conv_fusion_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(conv2d_fusion);\n") endif() diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index c9ba7a61e0907f..f5ee7f55991845 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -18,14 +18,18 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/math/padding.h" +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/miopen_helper.h" +#else #include "paddle/fluid/platform/cudnn_helper.h" +#endif DECLARE_int64(cudnn_exhaustive_search_times); namespace paddle { namespace operators { -#if CUDNN_VERSION >= 7100 +#if PADDLE_WITH_HIP || CUDNN_VERSION >= 7100 using Tensor = framework::Tensor; using ScopedTensorDescriptor = platform::ScopedTensorDescriptor; using ScopedFilterDescriptor = platform::ScopedFilterDescriptor; @@ -162,7 +166,78 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { if (input->dims().size() == 5) { layout = DataLayout::kNCDHW; } +#ifdef PADDLE_WITH_HIP + miopenConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor(padding_common, strides, dilations); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenSetConvolutionGroupCount(cudnn_conv_desc, + groups)); + // Now only support NCHW + std::vector bias_dim = { + 1, static_cast(transformed_output.dims()[1]), 1, 1}; + miopenTensorDescriptor_t cudnn_input_desc = input_desc.descriptor( + layout, framework::vectorize(transformed_input.dims())); + miopenTensorDescriptor_t cudnn_output_desc = output_desc.descriptor( + layout, framework::vectorize(transformed_output.dims())); + miopenTensorDescriptor_t cudnn_filter_desc = filter_desc.descriptor( + layout, framework::vectorize(filter->dims())); + miopenTensorDescriptor_t cudnn_bias_desc = + bias_desc.descriptor(layout, bias_dim); + miopenActivationDescriptor_t cudnn_act_desc = + act_desc.descriptor(activation); + miopenConvFwdAlgorithm_t algo; + auto handle = dev_ctx.cudnn_handle(); + auto workspace_handle = dev_ctx.cudnn_workspace_handle(); + + auto x_dims = framework::vectorize(transformed_input.dims()); + auto f_dims = framework::vectorize(filter->dims()); + + size_t workspace_size = 0; + PADDLE_ENFORCE_CUDA_SUCCESS( + 
platform::dynload::miopenConvolutionForwardGetWorkSpaceSize( + handle, cudnn_filter_desc, cudnn_input_desc, cudnn_conv_desc, + cudnn_output_desc, &workspace_size)); + int find_count; + miopenConvAlgoPerf_t find_result; + auto cudnn_find_func = [&](void* cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenFindConvolutionForwardAlgorithm( + handle, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, cudnn_output_desc, output_data, + kNUM_CUDNN_FWD_ALGS, &find_count, &find_result, + cudnn_workspace_ptr, workspace_size, false)); + }; + workspace_handle.RunFuncSync(cudnn_find_func, workspace_size); + algo = find_result.fwd_algo; + VLOG(3) << "cuDNN forward algo " << algo; + + { + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, &beta, cudnn_output_desc, + output_data, cudnn_workspace, workspace_size)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::miopenConvolutionForwardBias( + handle, &alpha, cudnn_bias_desc, bias_data, &beta, + cudnn_output_desc, output_data)); + if (activation != "identity") { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenActivationForward( + handle, cudnn_act_desc, &alpha, cudnn_output_desc, output_data, + &beta, cudnn_output_desc, output_data)); + } + if (residual) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::miopenOpTensor( + handle, miopenTensorOpAdd, &alpha, cudnn_output_desc, output_data, + &alpha, cudnn_output_desc, residual_data, &beta, cudnn_output_desc, + output_data)); + } + } +#else // PADDLE_WITH_HIP cudnnConvolutionDescriptor_t cudnn_conv_desc = conv_desc.descriptor(padding_common, strides, dilations); PADDLE_ENFORCE_CUDA_SUCCESS( @@ -327,6 +402,7 @@ class 
CUDNNConvFusionOpKernel : public framework::OpKernel { }; workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); } +#endif std::vector channels = ctx.Attr>("split_channels"); if (channels.size()) { auto outs = ctx.MultiOutput("Outputs"); @@ -358,8 +434,11 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -#if CUDNN_VERSION >= 7100 namespace ops = paddle::operators; +#if CUDNN_VERSION >= 7100 REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel, ops::CUDNNConvFusionOpKernel); #endif +#ifdef PADDLE_WITH_HIP +REGISTER_OP_CUDA_KERNEL(conv2d_fusion, ops::CUDNNConvFusionOpKernel); +#endif diff --git a/paddle/fluid/operators/gather_op_npu.cc b/paddle/fluid/operators/gather_op_npu.cc index 8a487234ad94ac..1ee8889995f4d6 100644 --- a/paddle/fluid/operators/gather_op_npu.cc +++ b/paddle/fluid/operators/gather_op_npu.cc @@ -50,6 +50,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel { auto *x = ctx.Input("X"); auto *dout = ctx.Input(framework::GradVarName("Out")); auto *dx = ctx.Output(framework::GradVarName("X")); + dx->mutable_data(ctx.GetPlace()); // step1: Unsqueeze index framework::Tensor tmp_tensor(index->type()); @@ -66,7 +67,7 @@ class GatherGradOpNPUKernel : public framework::OpKernel { .stream(); // step2: ZerosLike x in device - Tensor zeroslike_xout(x->type()); + Tensor zeroslike_xout(dx->type()); zeroslike_xout.Resize(x->dims()); auto p = zeroslike_xout.mutable_data(ctx.GetPlace()); @@ -74,7 +75,6 @@ class GatherGradOpNPUKernel : public framework::OpKernel { zeroslike_xout.numel() * sizeof(T), stream); // step3: scatter(x_grad) - dx->mutable_data(ctx.GetPlace()); auto runner_scatter = NpuOpRunner( "TensorScatterUpdate", {zeroslike_xout, *index, *dout}, {*dx}, {}); runner_scatter.Run(stream); diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index de067e45585d91..31e19d8f600c39 100644 --- 
a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -152,18 +152,18 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx, TEST(gather, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "gather"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "gather"); } TEST(gather, NPU_fp16) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "gather"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "gather"); } TEST(gather_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx, "gather_grad"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx, "gather_grad"); } diff --git a/paddle/fluid/operators/gaussian_random_op.cu b/paddle/fluid/operators/gaussian_random_op.cu index 7a0c93eb1b2eaa..453ae20656f1d6 100644 --- a/paddle/fluid/operators/gaussian_random_op.cu +++ b/paddle/fluid/operators/gaussian_random_op.cu @@ -11,6 +11,8 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include #include #include "paddle/fluid/framework/generator.h" diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index f11812ce3bb219..830dcd59839015 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -157,12 +157,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(gelu, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(gelu_grad, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/increment_op_npu.cc b/paddle/fluid/operators/increment_op_npu.cc index c1859bce02c904..7d75e385e8f3b7 100644 --- a/paddle/fluid/operators/increment_op_npu.cc +++ b/paddle/fluid/operators/increment_op_npu.cc @@ -39,10 +39,9 @@ class IncrementalNPUKernel : public framework::OpKernel { out_tensor->mutable_data(context.GetPlace()); Tensor step_tensor(x_tensor->type()); - std::vector step_vec; - step_vec.push_back(static_cast(step)); - framework::TensorFromVector(step_vec, context.device_context(), - &step_tensor); + + step_tensor.mutable_data({1}, context.GetPlace()); + FillNpuTensorWithConstant(&step_tensor, static_cast(step)); auto runner = NpuOpRunner("Add", {*x_tensor, step_tensor}, {*out_tensor}, {}); diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index b466ae275dd1c1..bde349b0a33b9d 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -71,12 +71,12 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, TEST(increment, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext 
ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "increment"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "increment"); } TEST(increment, NPU_fp64) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "increment"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, "increment"); } diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 6c488c387f8150..445d129d07c14b 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -88,8 +88,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "OutSize's dimension size must be 1, but got dimention = %d .", out_size_dim.size())); - PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument( - "OutSize's dim[0] must be 1")); + PADDLE_ENFORCE_EQ( + out_size_dim[0], 1, + platform::errors::InvalidArgument( + "OutSize's 0-th dimension's value must be 1, but got value = %d .", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } diff --git a/paddle/fluid/operators/interpolate_v2_op.cc b/paddle/fluid/operators/interpolate_v2_op.cc index cfbe1778c76646..a4353420c84a9a 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cc +++ b/paddle/fluid/operators/interpolate_v2_op.cc @@ -14,6 +14,9 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -73,9 +76,12 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { if (scale.size() > 0) { float scale_w = -1; scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + 
platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); if (scale_w > 0.) { // round down out_w = (data_layout == DataLayout::kNCHW @@ -96,8 +102,11 @@ static void Interpolate1DInferShapeCheck(framework::InferShapeContext* ctx) { platform::errors::InvalidArgument( "OutSize's dimension size must be 1, but got dimention = %d .", out_size_dim.size())); - PADDLE_ENFORCE_EQ(out_size_dim[0], 1, platform::errors::InvalidArgument( - "OutSize's dim[0] must be 1")); + PADDLE_ENFORCE_EQ( + out_size_dim[0], 1, + platform::errors::InvalidArgument( + "OutSize's 0-th dimension's value must be 1, but got value = %d .", + out_size_dim[0])); ctx->ShareLoD("X", "Out"); return; } @@ -170,9 +179,17 @@ static void Interpolate2DInferShapeCheck(framework::InferShapeContext* ctx) { scale_h = scale[0]; scale_w = scale[1]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); if (scale_h > 0. && scale_w > 0.) 
{ // round down out_h = (data_layout == DataLayout::kNCHW @@ -278,9 +295,23 @@ static void Interpolate3DInferShapeCheck(framework::InferShapeContext* ctx) { scale_h = scale[1]; scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ // round down out_d = (data_layout == DataLayout::kNCHW @@ -359,13 +390,41 @@ class InterpolateV2Op : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + framework::LibraryType library = framework::LibraryType::kPlain; + auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + +#ifdef PADDLE_WITH_MKLDNN + auto interp_method = ctx.Attr("interp_method"); + // TODO(danqing): support other interp_method + if (this->CanMKLDNNBeUsed(ctx, data_type) && + (interp_method == "nearest" || interp_method == "bilinear")) { + layout = framework::DataLayout::kMKLDNN; + library = framework::LibraryType::kMKLDNN; + } +#endif + + return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); } framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { +#ifdef PADDLE_WITH_MKLDNN + if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && + (tensor.layout() != framework::DataLayout::kMKLDNN)) { + auto attrs = Attrs(); + auto ar = paddle::framework::AttrReader(attrs); + const std::string data_format = ar.Get("data_layout"); + auto dl = framework::StringToDataLayout(data_format); + // Some models may have intentionally set "AnyLayout" for pool + // op. 
Treat this as NCHW (default data_format value) + if (dl != framework::DataLayout::kAnyLayout) { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), dl); + } + } +#endif if (var_name == "SizeTensor" || var_name == "Scale") { return expected_kernel_type; } @@ -436,6 +495,9 @@ class InterpolateV2OpMaker : public framework::OpProtoAndCheckerMaker { "can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , " "can be \'1\' for src_idx = scale*dst_index .") .SetDefault(1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddComment(R"DOC( This operator samples input X to given output shape by using specified interpolation method, the interpolation methods can be \"nearest\" diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index e5002e72d0edd7..6745592c5c1a8b 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -982,15 +982,21 @@ static void Interpolate1DCUDAFwd(const framework::ExecutionContext& ctx, if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } else { if (scale.size() > 0) { scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } } if (scale_w > 0.) 
{ @@ -1081,18 +1087,36 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } else { if (scale.size() > 1) { scale_w = scale[1]; scale_h = scale[0]; + PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } } if (scale_w > 0. && scale_h > 0.) 
{ @@ -1216,10 +1240,25 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } else { if (scale.size() > 1) { scale_d = scale[0]; @@ -1227,9 +1266,23 @@ static void Interpolate3DCUDAFwd(const framework::ExecutionContext& ctx, scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } } if (scale_d > 0. && scale_h > 0. && scale_w > 0.) 
{ @@ -1334,16 +1387,22 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); scale_w = scale_data[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } else { if (scale.size() > 0) { scale_w = scale[0]; - PADDLE_ENFORCE_EQ(scale_w > 0, true, platform::errors::InvalidArgument( - "scale of Op(interpolate) " - "should be greater than 0.")); + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); } } if (scale_w > 0.) { @@ -1433,19 +1492,36 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, scale_h = scale_data[0]; scale_w = scale_data[0]; } + + PADDLE_ENFORCE_EQ( + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } else { if (scale.size() > 1) { scale_w = scale[1]; scale_h = scale[0]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in 
Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); } } if (scale_w > 0. && scale_h > 0.) { @@ -1581,9 +1657,23 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, scale_w = scale_data[0]; } PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in input 'Scale' Tensor of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } else { if (scale.size() > 1) { scale_d = scale[0]; @@ -1591,9 +1681,23 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, scale_w = scale[2]; PADDLE_ENFORCE_EQ( - scale_w > 0 && scale_h > 0 && scale_d > 0, true, - platform::errors::InvalidArgument("scale of Op(interpolate) " - "should be greater than 0.")); + scale_w > 0, true, + platform::errors::InvalidArgument( + "The scale_w in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_w)); + PADDLE_ENFORCE_EQ( + scale_h > 0, true, + platform::errors::InvalidArgument( + "The scale_h in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_h)); + 
PADDLE_ENFORCE_EQ( + scale_d > 0, true, + platform::errors::InvalidArgument( + "The scale_d in Attr(scale) of Operator(interpolate) " + "should be greater than 0, but received value is %d.", + scale_d)); } } if (scale_d > 0. && scale_h > 0. && scale_w > 0.) { diff --git a/paddle/fluid/operators/layer_norm_op_npu.cc b/paddle/fluid/operators/layer_norm_op_npu.cc index 95549319cd2096..c0c228ef22af3e 100644 --- a/paddle/fluid/operators/layer_norm_op_npu.cc +++ b/paddle/fluid/operators/layer_norm_op_npu.cc @@ -80,8 +80,7 @@ class LayerNormNPUKernel : public framework::OpKernel { default_scale.mutable_data(framework::make_ddim(axes), place); Tensor value(x->type()); value.mutable_data({1}, place); - TensorFromVector(std::vector{static_cast(1.0)}, - ctx.device_context(), &value); + FillNpuTensorWithConstant(&value, static_cast(1.0)); auto runner = NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); runner.Run(stream); @@ -95,8 +94,7 @@ class LayerNormNPUKernel : public framework::OpKernel { default_bias.mutable_data(framework::make_ddim(axes), place); Tensor value(x->type()); value.mutable_data({1}, place); - TensorFromVector(std::vector{static_cast(0)}, ctx.device_context(), - &value); + FillNpuTensorWithConstant(&value, static_cast(0)); auto runner = NpuOpRunner("FillD", {value}, {default_bias}, {{"dims", axes}}); runner.Run(stream); @@ -251,8 +249,7 @@ class LayerNormGradNPUKernel : public framework::OpKernel { default_scale.mutable_data(framework::make_ddim(axes), place); Tensor value(x->type()); value.mutable_data({1}, place); - TensorFromVector(std::vector{static_cast(1.0)}, - ctx.device_context(), &value); + FillNpuTensorWithConstant(&value, static_cast(1.0)); auto runner = NpuOpRunner("FillD", {value}, {default_scale}, {{"dims", axes}}); runner.Run(stream); diff --git a/paddle/fluid/operators/load_combine_op_npu.cc b/paddle/fluid/operators/load_combine_op_npu.cc new file mode 100644 index 00000000000000..4b9b96c23b0b71 --- /dev/null +++ 
b/paddle/fluid/operators/load_combine_op_npu.cc @@ -0,0 +1,25 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/load_combine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + load_combine, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel, + ops::LoadCombineOpKernel); diff --git a/paddle/fluid/operators/load_op_npu.cc b/paddle/fluid/operators/load_op_npu.cc new file mode 100644 index 00000000000000..1f532803458310 --- /dev/null +++ b/paddle/fluid/operators/load_op_npu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/load_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + load, ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel, + ops::LoadOpKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index fab2d7f7aa0542..9574b325ef77fd 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -28,6 +28,7 @@ class LookupTableV2NPUKernel : public framework::OpKernel { auto *ids_t = ctx.Input("Ids"); // int tensor auto *output_t = ctx.Output("Out"); // float tensor auto *table_t = ctx.Input("W"); + auto *table_var = ctx.InputVar("W"); PADDLE_ENFORCE_EQ( table_var->IsType(), true, @@ -59,18 +60,16 @@ class LookupTableV2GradNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); - // step2: ZerosLike x in device - Tensor zeroslike_w(table_grad_t->type()); - zeroslike_w.Resize(table_grad_t->dims()); - auto p = zeroslike_w.mutable_data(ctx.GetPlace()); - - platform::NPUMemsetAsync(static_cast(p), 0, - zeroslike_w.numel() * sizeof(T), stream); + auto runner_zeros = + NpuOpRunner("ZerosLike", {*table_grad_t}, {*table_grad_t}); + runner_zeros.Run(stream); - table_grad_t->mutable_data(ctx.GetPlace()); + // NOTE(zhiqiu): It seems in cann 20.1, the first input and output + // can be different tensor, but in cann 20.2+, it does inplace operation. + // Thus, the first input and output should be same tensor. 
auto runner_scatter = - NpuOpRunner("ScatterAdd", {zeroslike_w, *ids_t, *output_grad_t}, - {*table_grad_t}, {}); + NpuOpRunner("ScatterAdd", {*table_grad_t, *ids_t, *output_grad_t}, + {*table_grad_t}, {{"use_locking", true}}); runner_scatter.Run(stream); } }; @@ -82,9 +81,11 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( lookup_table_v2, ops::LookupTableV2NPUKernel, + ops::LookupTableV2NPUKernel, ops::LookupTableV2NPUKernel); REGISTER_OP_NPU_KERNEL( lookup_table_v2_grad, ops::LookupTableV2GradNPUKernel, + ops::LookupTableV2GradNPUKernel, ops::LookupTableV2GradNPUKernel); diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc b/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc deleted file mode 100644 index f37915834bd756..00000000000000 --- a/paddle/fluid/operators/lookup_table_v2_op_npu_test.cc +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifndef _WIN32 -#include -#endif - -#include -#include -#include -#include -#include // NOLINT -#include - -#include "gtest/gtest.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/string/printf.h" - -namespace f = paddle::framework; -namespace p = paddle::platform; -namespace m = paddle::operators::math; - -USE_OP(lookup_table_v2); -USE_OP_DEVICE_KERNEL(lookup_table_v2, NPU); - -template -void Compare(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto ids = scope->Var("Ids"); - auto out = scope->Var("Out"); - auto w = scope->Var("W"); - - auto ids_t = ids->GetMutable(); - auto out_t = out->GetMutable(); - auto w_t = w->GetMutable(); - int bsz = 10; - int dim = 32; - int seqlen = 8; - int vocab_size = 100; - TensorFromVector(std::vector(bsz * seqlen, 3), ctx, ids_t); - std::vector val(vocab_size * dim, 10.); - TensorFromVector(val, ctx, w_t); - ids_t->Resize({bsz, seqlen}); - w_t->Resize({vocab_size, dim}); - out_t->Resize({bsz, seqlen, dim}); - ctx.Wait(); - - auto place = ctx.GetPlace(); - out_t->mutable_data(place); - f::AttributeMap attrs = {{}}; - auto op = f::OpRegistry::CreateOp("lookup_table_v2", - {{"W", {"W"}}, {"Ids", {"Ids"}}}, - {{"Out", {"Out"}}}, attrs); - op->Run(*scope, place); - std::vector out_v; - TensorToVector(*out_t, ctx, &out_v); - ctx.Wait(); - EXPECT_EQ(out_t->numel(), bsz * seqlen * dim); - T res = std::accumulate(out_v.begin(), out_v.end(), 0.); - float eps = 1.e-6; - EXPECT_LT(fabs(res - bsz * seqlen * dim * 10.), eps); -} - -template -void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { - // init - auto w = scope->Var("W"); - auto ids = scope->Var("Ids"); - auto out = scope->Var("DOut"); - auto dw = scope->Var("DW"); - - auto w_t = w->GetMutable(); - auto ids_t = ids->GetMutable(); - 
auto out_t = out->GetMutable(); - auto dw_t = dw->GetMutable(); - - int bsz = 2; - int dim = 2; - int seqlen = 2; - int vocab_size = 4; - - std::vector val_int(bsz * seqlen, 3); - std::vector val(vocab_size * dim, 0.); - std::vector val_out(bsz * seqlen * dim, 1.); - - TensorFromVector(val_int, ctx, ids_t); - TensorFromVector(val, ctx, w_t); - TensorFromVector(val, ctx, dw_t); - TensorFromVector(val_out, ctx, out_t); - - w_t->Resize({vocab_size, dim}); - ids_t->Resize({bsz, seqlen}); - out_t->Resize({bsz, seqlen, dim}); - dw_t->Resize({vocab_size, dim}); - - ctx.Wait(); - - auto place = ctx.GetPlace(); - out_t->mutable_data(place); - w_t->mutable_data(place); - dw_t->mutable_data(place); - f::AttributeMap attrs = {{}}; - auto op = f::OpRegistry::CreateOp( - "lookup_table_v2_grad", - {{"Ids", {"Ids"}}, {"W", {"W"}}, {"Out@GRAD", {"DOut"}}}, - {{"W@GRAD", {"DW"}}}, attrs); - op->Run(*scope, place); - ctx.Wait(); - std::vector w_v; - TensorToVector(*dw_t, ctx, &w_v); - ctx.Wait(); - EXPECT_EQ(dw_t->numel(), vocab_size * dim); - T res = std::accumulate(w_v.begin(), w_v.end(), 0.); - float eps = 1.e-6; - EXPECT_LT(fabs(res - bsz * seqlen * dim), eps); -} - -TEST(lookup_table_v2, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); -} - -TEST(lookup_table_v2_grad, NPU_fp32) { - f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); -} diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 64b533de098cad..05d42f02c1003a 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -15,6 +15,7 @@ #ifdef PADDLE_WITH_MKLML #include #endif + #include #include #include @@ -28,6 +29,19 @@ namespace paddle { namespace operators { namespace math { +namespace detail { + +template +static void axpy(int n, const T alpha, const T *x, const int incx, T *y, + const int incy) { + // Y = Y + alpha * X + while (n-- > 0) { + *y += 
alpha * *x; + y = y + incy; + x = x + incx; + } +} +} // namespace detail template struct CBlas; @@ -43,6 +57,11 @@ struct CBlas { template <> struct CBlas { + template + static void AXPY(ARGS... args) { + detail::axpy(args...); + } + template static void VCOPY(ARGS... args) { PADDLE_THROW(platform::errors::Unimplemented( diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 68179a68574a01..0bdc7b69434221 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -27,6 +27,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/float16.h" #include "unsupported/Eigen/CXX11/Tensor" @@ -49,6 +50,7 @@ template struct SetConstant; #ifdef PADDLE_WITH_XPU template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index 2b93cd926081ec..f94c1bf696cdad 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -19,6 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex128.h" #include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/float16.h" @@ -33,6 +34,7 @@ using complex64 = paddle::platform::complex64; using complex128 = paddle::platform::complex128; template struct SetConstant; +template struct SetConstant; template struct SetConstant; template struct SetConstant; template struct SetConstant; diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index 0b615cefac4eed..b49b5036ac42e2 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -25,14 +25,12 @@ namespace operators { using Tensor = framework::Tensor; template -__global__ void SegmentMeanCustomKernel( - const Index* segment_ids, const T* input, T* output, T* summed_ids, - const Index input_length_size, const Index inner_dim_size, - const Index output_length_size, const Index total_stripe_count) { +__global__ void SegmentSumIdsKernel(const Index* segment_ids, T* summed_ids, + const Index input_length_size, + const Index total_stripe_count) { CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) { - const Index segment_offset = stripe_index % inner_dim_size; - const Index dim_index_base = - stripe_index / inner_dim_size * Index(DimTileSize); + const Index segment_offset = stripe_index; + const Index dim_index_base = stripe_index * Index(DimTileSize); const Index actual_height = min(Index(DimTileSize), input_length_size - dim_index_base); @@ -41,19 +39,20 @@ __global__ void SegmentMeanCustomKernel( if (dim_index_base > 0) { last_segment_id = segment_ids[dim_index_base - 1]; } - if (segment_offset == 0) { - T sum = T(0); - for (Index j = 0; j < actual_height; j++) { - Index current_segment_id = 
segment_ids[dim_index_base + j]; - // Note(ZHUI): following check may cause - // cudaErrorLaunchOutOfResources. - // PADDLE_ENFORCE(current_segment_id >= last_segment_id, - // "the segment ids should be sorted, but got " - // "segment_ids[%d]:%d > segment_ids[%d]:%d.", - // dim_index_base + j - 1, dim_index_base + j, - // last_segment_id, current_segment_id); - - if (j > 0 && current_segment_id > last_segment_id) { + T sum = T(0); + for (Index j = 0; j < actual_height; j++) { + Index current_segment_id = segment_ids[dim_index_base + j]; + PADDLE_ENFORCE(current_segment_id >= last_segment_id, + "the segment ids should be sorted, but got " + "segment_ids[%d]:%d > segment_ids[%d]:%d.", + dim_index_base + j - 1, dim_index_base + j, + last_segment_id, current_segment_id); + if (current_segment_id > last_segment_id) { + for (Index interval_id = last_segment_id + 1; + interval_id < current_segment_id; ++interval_id) { + *(summed_ids + interval_id) = 0; + } + if (j > 0) { if (last_segment_id == first_segment_id) { platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); } else { @@ -61,33 +60,60 @@ __global__ void SegmentMeanCustomKernel( } sum = T(0); } - sum += T(1); - last_segment_id = current_segment_id; } - platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + sum += T(1); + last_segment_id = current_segment_id; + } + platform::CudaAtomicAdd(summed_ids + last_segment_id, sum); + } +} + +template +__global__ void SegmentMeanKernel(const Index* segment_ids, const T* input, + T* output, T* summed_ids, + const Index input_length_size, + const Index inner_dim_size, + const Index output_length_size, + const Index total_stripe_count) { + CUDA_KERNEL_LOOP(stripe_index, total_stripe_count) { + const Index segment_offset = stripe_index % inner_dim_size; + const Index dim_index_base = + stripe_index / inner_dim_size * Index(DimTileSize); + const Index actual_height = + min(Index(DimTileSize), input_length_size - dim_index_base); + + Index first_segment_id = 
segment_ids[dim_index_base]; + Index last_segment_id = -1; + if (dim_index_base > 0) { + last_segment_id = segment_ids[dim_index_base - 1]; } - // ensure last_segment_id is the largest - last_segment_id = output_length_size; - __syncthreads(); T sum = T(0); for (Index j = 0; j < actual_height; j++) { Index current_segment_id = segment_ids[dim_index_base + j]; if (current_segment_id > last_segment_id) { - const Index output_index = - last_segment_id * inner_dim_size + segment_offset; - if (last_segment_id == first_segment_id) { - platform::CudaAtomicAdd(output + output_index, - sum / *(summed_ids + last_segment_id)); - } else { - *(output + output_index) = sum / *(summed_ids + last_segment_id); + // reset the interval value which do not have corresponding ids. + for (Index interval_id = last_segment_id + 1; + interval_id < current_segment_id; ++interval_id) { + *(output + interval_id * inner_dim_size + segment_offset) = T(0); + } + + if (j > 0) { + Index output_index = + last_segment_id * inner_dim_size + segment_offset; + + if (last_segment_id == first_segment_id) { + platform::CudaAtomicAdd(output + output_index, + sum / *(summed_ids + last_segment_id)); + } else { + *(output + output_index) = sum / *(summed_ids + last_segment_id); + } + sum = T(0); } - sum = T(0); } sum += input[(dim_index_base + j) * inner_dim_size + segment_offset]; last_segment_id = current_segment_id; } - const Index output_index = - last_segment_id * inner_dim_size + segment_offset; + Index output_index = last_segment_id * inner_dim_size + segment_offset; platform::CudaAtomicAdd(output + output_index, sum / *(summed_ids + last_segment_id)); } @@ -122,7 +148,7 @@ __global__ void SegmentOpsKernel(const Index* segment_ids, const T* input, // reset the interval value which do not have corresponding ids. 
for (Index interval_id = last_segment_id + 1; interval_id < current_segment_id; ++interval_id) { - *(output + interval_id * inner_dim_size + segment_offset) = 0; + *(output + interval_id * inner_dim_size + segment_offset) = T(0); } // don't update result when j=0 if (j > 0) { @@ -272,11 +298,25 @@ class SegmentPoolFunctor { framework::Tensor* output, framework::Tensor* summed_ids = nullptr, const std::string pooltype = "SUM") { + if (pooltype == "MEAN") { + // Sum the segment id num first + T DimTileSize = 8; + auto input_length_size = segment_ids.numel(); + auto total_stripe_count = + (input_length_size + DimTileSize - 1) / DimTileSize; + auto config = platform::GetGpuLaunchConfig1D(ctx, total_stripe_count); + SegmentSumIdsKernel< + T, IndexT, IndexT(8)><<>>( + segment_ids.data(), summed_ids->data(), input_length_size, + total_stripe_count); + } + auto h = ArrangeHelper(input.numel(), segment_ids.dims()[0], output->dims()[0]); auto config = platform::GetGpuLaunchConfig1D(ctx, h.total_stripe_count); if (pooltype == "MEAN") { - SegmentMeanCustomKernel< + SegmentMeanKernel< T, IndexT, IndexT(8)><<>>( segment_ids.data(), input.data(), output->data(), diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index f7b16453e0133b..b9a1854a66118e 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -285,6 +285,8 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; +template struct SelectedRowsAddToTensor; // This is a separated namespace for manipulate SelectedRows typed // data. Like merge duplicated rows, adding two SelectedRows etc. @@ -294,21 +296,17 @@ template struct SelectedRowsAddToTensor; // add or mul. 
namespace scatter { -template -typename std::enable_if< - std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_to(const DeviceContext& ctx, BlasT* blas, - size_t data_len, const T* in, T* out) { - blas->AXPY(data_len, 1., in, out); +template +typename std::enable_if::value>::type +elementwise_add_to(BlasT* blas, size_t data_len, + const T* in, T* out) { + blas->AXPY(data_len, T(1.f), in, out); } -template -typename std::enable_if< - !std::is_floating_point::value && - std::is_same::value>::type -elementwise_add_to(const DeviceContext& ctx, BlasT* blas, - size_t data_len, const T* in, T* out) { +template +typename std::enable_if::value>::type elementwise_add_to( + BlasT* blas, size_t data_len, const T* in, + T* out) { for (size_t i = 0; i < data_len; i++) { out[i] += in[i]; } @@ -412,7 +410,7 @@ struct MergeAdd { out.set_rows(merge_rows); math::SetConstant constant_functor; - constant_functor(context, out.mutable_value(), 0.0); + constant_functor(context, out.mutable_value(), static_cast(0.f)); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -429,9 +427,9 @@ struct MergeAdd { for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id[input_rows[i]]; - elementwise_add_to( - context, &blas, static_cast(input_width), - &input_data[i * input_width], &out_data[out_i * input_width]); + elementwise_add_to(&blas, static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); } } } @@ -524,9 +522,9 @@ struct MergeAverage { for (size_t i = 0; i < input_rows.size(); i++) { size_t out_i = rows_to_id[input_rows[i]]; - elementwise_add_to( - context, &blas, static_cast(input_width), - &input_data[i * input_width], &out_data[out_i * input_width]); + elementwise_add_to(&blas, static_cast(input_width), + &input_data[i * input_width], + &out_data[out_i * input_width]); } } size_t input_width_cast = static_cast(input_width); @@ -547,6 +545,8 @@ template struct MergeAdd; template 
struct MergeAdd; +template struct MergeAdd; template struct MergeAverage; template struct MergeAverage; diff --git a/paddle/fluid/operators/math/unpooling.cu b/paddle/fluid/operators/math/unpooling.cu index d78e3385efb29c..a73f76f53be052 100644 --- a/paddle/fluid/operators/math/unpooling.cu +++ b/paddle/fluid/operators/math/unpooling.cu @@ -87,7 +87,11 @@ class Unpool2dMaxFunctor { const T* input_data = input.data(); const int* indices_data = indices.data(); T* output_data = output->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else int threads = 1024; +#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMax<<>>( input.numel(), input_data, indices_data, input_height, input_width, @@ -117,7 +121,11 @@ class Unpool2dMaxGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); +#ifdef __HIPCC__ + int threads = 256; +#else int threads = 1024; +#endif int grid = (input.numel() + threads - 1) / threads; KernelUnpool2dMaxGrad<<>>( input.numel(), input_data, indices_data, input_height, input_width, diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc index 676086bd080633..d6e982039fa290 100644 --- a/paddle/fluid/operators/mean_op_npu.cc +++ b/paddle/fluid/operators/mean_op_npu.cc @@ -68,10 +68,8 @@ class MeanGradNPUKernel : public framework::OpKernel { Tensor mean_tensor(grad->type()); mean_tensor.Resize({1}); mean_tensor.mutable_data(context.GetPlace()); - std::vector mean_vec; - mean_vec.push_back(1.0 / static_cast(IG->numel())); - framework::TensorFromVector(mean_vec, context.device_context(), - &mean_tensor); + FillNpuTensorWithConstant( + &mean_tensor, static_cast(1.0 / static_cast(IG->numel()))); // means mul ones Tensor mean_ma(grad->type()); diff --git a/paddle/fluid/operators/memcpy_op.cc b/paddle/fluid/operators/memcpy_op.cc index d10d5bf12e6b4a..ecd2d48dcbd102 100644 
--- a/paddle/fluid/operators/memcpy_op.cc +++ b/paddle/fluid/operators/memcpy_op.cc @@ -105,16 +105,18 @@ class MemcpyOpProtoMaker : public framework::OpProtoAndCheckerMaker { "is the same as input X."); AddAttr("dst_place_type", "Determine the dst place of tensor copy. " - "By Now it ONLY support CUDAPlace and CUDAPinnedPlace. Other " - "place type is Unimplemented and will cause ERROR." + "By Now it ONLY support CUDAPlace <-> CUDAPinnedPlace or " + "NPUPlace <-> CPUPlace. " + "Other place type is Unimplemented and will cause ERROR." "0: dst is on CPUPlace. " "1: dst is on CUDAPlace. " "2: dst is on CUDAPinnedPlace. " - "3: dst is on XPUPlace. "); + "3: dst is on XPUPlace. " + "4: dst is on NPUPlace. "); AddComment(R"DOC( Memcpy Operator. - By now, it ONLY supports the memcopy between CUDAPinnedPlace and CUDAPlace, - and used as an internal op by Recompute-Offload. + By now, it ONLY supports the memcopy between CUDAPinnedPlace <-> CUDAPlace or + NPUPlace <-> CPUPlace, and used as an internal op by Recompute-Offload. You would have to update it if you want other more capacities. 
Out = X, when type in [LoDTensor] @@ -139,10 +141,18 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ops::MemcpyKernel, plat::float16, ops::MemcpyKernel); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_ROCM) REGISTER_OP_CUDA_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, ops::MemcpyKernel, int, ops::MemcpyKernel, int64_t, ops::MemcpyKernel, bool, ops::MemcpyKernel, plat::float16, ops::MemcpyKernel); #endif + +#ifdef PADDLE_WITH_ASCEND_CL +REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy, float, ops::MemcpyKernel, double, + ops::MemcpyKernel, int, ops::MemcpyKernel, + int64_t, ops::MemcpyKernel, bool, + ops::MemcpyKernel, plat::float16, + ops::MemcpyKernel); +#endif diff --git a/paddle/fluid/operators/memcpy_op.h b/paddle/fluid/operators/memcpy_op.h old mode 100755 new mode 100644 index f81ca05f4380a4..63a41cc7237310 --- a/paddle/fluid/operators/memcpy_op.h +++ b/paddle/fluid/operators/memcpy_op.h @@ -51,7 +51,17 @@ class MemcpyFunctor { } else if (dst_place_type_ == 1) { framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor); - } else { + } +#ifdef PADDLE_WITH_ASCEND_CL + else if (dst_place_type_ == 0) { // NOLINT + framework::TensorCopy(lod_tensor, platform::CPUPlace(), dev_ctx_, + &out_tensor); + } else if (dst_place_type_ == 4) { + framework::TensorCopy(lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, + &out_tensor); + } +#endif + else { // NOLINT PADDLE_THROW(platform::errors::Unimplemented( "memcpy dst_place_type: %d is not supported yet.", dst_place_type_)); } diff --git a/paddle/fluid/operators/meshgrid_op.cc b/paddle/fluid/operators/meshgrid_op.cc index 33f71b4adc066f..54600e26bb57f3 100644 --- a/paddle/fluid/operators/meshgrid_op.cc +++ b/paddle/fluid/operators/meshgrid_op.cc @@ -157,3 +157,17 @@ REGISTER_OP_CPU_KERNEL( ops::MeshgridGradKernel, ops::MeshgridGradKernel, ops::MeshgridGradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + 
meshgrid, ops::MeshgridKernel, + ops::MeshgridKernel, + ops::MeshgridKernel, + ops::MeshgridKernel, + ops::MeshgridKernel); +REGISTER_OP_CUDA_KERNEL( + meshgrid_grad, + ops::MeshgridGradKernel, + ops::MeshgridGradKernel, + ops::MeshgridGradKernel, + ops::MeshgridGradKernel); +#endif diff --git a/paddle/fluid/operators/meshgrid_op.cu b/paddle/fluid/operators/meshgrid_op.cu deleted file mode 100644 index dc813a07f8c8c1..00000000000000 --- a/paddle/fluid/operators/meshgrid_op.cu +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/meshgrid_op.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - meshgrid, ops::MeshgridKernel, - ops::MeshgridKernel, - ops::MeshgridKernel, - ops::MeshgridKernel, - ops::MeshgridKernel); -REGISTER_OP_CUDA_KERNEL( - meshgrid_grad, - ops::MeshgridGradKernel, - ops::MeshgridGradKernel, - ops::MeshgridGradKernel, - ops::MeshgridGradKernel); diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 64a1903c2da4ff..9d80286f4c4efa 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -33,7 +33,7 @@ class InterpolateMKLDNNHandler : public platform::MKLDNNHandlerT { public: InterpolateMKLDNNHandler(const dnnl::algorithm algo, - const paddle::platform::MKLDNNDeviceContext& dev_ctx, + const platform::MKLDNNDeviceContext& dev_ctx, const dnnl::engine engine, platform::Place cpu_place, const Tensor* x, Tensor* z, const std::string& uniq_name) @@ -94,19 +94,32 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { out_dims = out_size_data; } } else { - float scale; + std::vector scale; + scale.reserve(3); auto scale_tensor = ctx.Input("Scale"); if (scale_tensor != nullptr) { auto scale_data = get_new_data_from_tensor(scale_tensor); - scale = scale_data[0]; + scale.resize(3, scale_data[0]); + std::copy(scale_data.begin(), scale_data.end(), scale.begin()); } else { - scale = ctx.Attr("scale"); + std::string op_type = ctx.Type(); + + if (op_type.find("v2") == std::string::npos) { // v1 + scale.push_back(ctx.Attr("scale")); + scale.push_back(scale[0]); + scale.push_back(scale[0]); + } else { // v2 + std::vector scale_attr = ctx.Attr>("scale"); + scale.resize(3, scale_attr[0]); + std::copy(scale_attr.begin(), scale_attr.end(), scale.begin()); + } } - if (scale > 0) { + if (scale[0] > 0.0f && scale[1] > 0.0f && scale[2] > 0.0f) { + int j = 0; std::vector in_dhw_vec = 
framework::vectorize(in_dhw_dims); std::transform( in_dhw_vec.begin(), in_dhw_vec.end(), out_dims.begin(), - [&](int64_t i) -> int { return static_cast(i * scale); }); + [&](int64_t i) -> int { return static_cast(i * scale[j++]); }); } } @@ -172,3 +185,8 @@ REGISTER_OP_KERNEL(nearest_interp, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel); REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel); + +REGISTER_OP_KERNEL(nearest_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, + ops::InterpolateMKLDNNKernel); +REGISTER_OP_KERNEL(bilinear_interp_v2, MKLDNN, ::paddle::platform::CPUPlace, + ops::InterpolateMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc index aafff5248a0244..d6cd76b697f518 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_caching.cc @@ -50,7 +50,7 @@ class CacheTester { platform::CPUPlace place; onednn_dev_ctx_ = dynamic_cast(pool.Get(place)); - onednn_dev_ctx_->ResetBlobMap(); + onednn_dev_ctx_->ResetBlobMap(nullptr); } bool Analyze(unsigned short int num_entries) { diff --git a/paddle/fluid/operators/modified_huber_loss_op.cu b/paddle/fluid/operators/modified_huber_loss_op.cu index 71bfacb9283850..3c85da3c52c6c9 100644 --- a/paddle/fluid/operators/modified_huber_loss_op.cu +++ b/paddle/fluid/operators/modified_huber_loss_op.cu @@ -11,7 +11,9 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include #include +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/modified_huber_loss_op.h" diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 9fcc629233891b..843736833f8156 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -32,6 +32,12 @@ namespace cub = hipcub; #include "paddle/fluid/platform/cudnn_helper.h" #endif +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + namespace paddle { namespace operators { @@ -58,12 +64,10 @@ using DataLayout = framework::DataLayout; // axis=(n,h,w))) template -__global__ void DoubleGradComputeDX(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const T *scale, - const T *ddscale, const int N, const int C, - const int sample_size, const double epsilon, - T *dx) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDX( + const T *x, const T *mean, const T *variance, const T *ddx, const T *dy, + const T *scale, const T *ddscale, const int N, const int C, + const int sample_size, const double epsilon, T *dx) { const int outer_size = C; const int inner_size = N * sample_size; @@ -160,12 +164,10 @@ __global__ void DoubleGradComputeDX(const T *x, const T *mean, // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * // np.mean(ddx * (x - mean), axis=(n,h,w))) template -__global__ void DoubleGradComputeDDY(const T *x, const T *mean, - const T *variance, const T *ddscale, - const T *ddbias, const T *ddx, - const T *scale, const int N, const int C, - const int sample_size, - const double epsilon, T *ddy) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDDY( + const T *x, const T *mean, const T *variance, const T *ddscale, + const T *ddbias, const T *ddx, const T *scale, const int N, const int C, + const int sample_size, const double epsilon, T *ddy) { const int 
outer_size = C; const int inner_size = N * sample_size; @@ -238,11 +240,10 @@ __global__ void DoubleGradComputeDDY(const T *x, const T *mean, // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * // ddx template -__global__ void DoubleGradComputeDScale(const T *x, const T *mean, - const T *variance, const T *ddx, - const T *dy, const int N, const int C, - const int sample_size, - const double epsilon, T *dscale) { +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDScale( + const T *x, const T *mean, const T *variance, const T *ddx, const T *dy, + const int N, const int C, const int sample_size, const double epsilon, + T *dscale) { const int outer_size = C; const int inner_size = N * sample_size; @@ -302,7 +303,7 @@ __global__ void DoubleGradComputeDScale(const T *x, const T *mean, // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var template -__global__ void DoubleGradComputeDScaleWithGlobal( +__global__ LAUNCH_BOUNDS(BlockDim) void DoubleGradComputeDScaleWithGlobal( const T *ddx, const T *variance, const T *dy, const double epsilon, const int N, const int C, const int sample_size, T *dscale) { int outer_size = C; @@ -422,8 +423,11 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, set_constant(dev_ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); - +#ifdef __HIPCC__ + const int block = 256; +#else const int block = 512; +#endif int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); @@ -532,6 +536,5 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } } } - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index aa0c4d2dfd274e..276bfa7b3281b9 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -64,8 +64,10 @@ aclFormat ConvertToNpuFormat(DataLayout layout) { return iter->second; } -aclrtStream GetCurrentNPUStream() { - int device_id = platform::GetCurrentNPUDeviceId(); +aclrtStream GetCurrentNPUStream(int device_id) { + if (device_id == -1) { + device_id = platform::GetCurrentNPUDeviceId(); + } platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto *dev_ctx = static_cast( pool.Get(platform::NPUPlace(device_id))); @@ -299,5 +301,6 @@ void NpuOpRunner::Run(aclrtStream stream) { VLOG(4) << "after aclopCompileAndExecute: " << ret; PADDLE_ENFORCE_NPU_SUCCESS(ret); } + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index e178f7fc6e96d8..cfc933c7a76fa7 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -86,6 +86,48 @@ class NpuOpRunner { aclDataType ConvertToNpuDtype(framework::proto::VarType::Type dtype); +aclrtStream GetCurrentNPUStream(int device_id = -1); + +template +void FillNpuTensorWithConstant(Tensor *tensor, T val) { + // NOTE(zhiqiu): we found that power sometimes returns 0 when val is small + // like 1e-8. 
+ constexpr float MIN_PRECISION_FOR_POWER = 1e-3; + PADDLE_ENFORCE_EQ( + tensor->IsInitialized(), true, + platform::errors::InvalidArgument("The tensor should be initialized.")); + PADDLE_ENFORCE_EQ( + platform::is_npu_place(tensor->place()), true, + platform::errors::InvalidArgument("The tensor should be on NPUPlace.")); + // do async for better performance + if ((typeid(float) == typeid(T) || typeid(platform::float16) == typeid(T)) && + static_cast(val) > MIN_PRECISION_FOR_POWER) { + Tensor tmp(tensor->type()); + tmp.Resize(tensor->dims()); + tmp.mutable_data(tensor->place()); + auto stream = GetCurrentNPUStream( + BOOST_GET_CONST(platform::NPUPlace, tensor->place()).device); + platform::NPUMemsetAsync(tmp.data(), 0, tmp.numel() * sizeof(T), + stream); + auto runner = NpuOpRunner("Power", {tmp}, {*tensor}, + {{"power", static_cast(1)}, + {"scale", static_cast(0)}, + {"shift", static_cast(val)}}); + runner.Run(stream); + } else { + T *array = new T[tensor->numel()]; + for (unsigned int i = 0; i < tensor->numel(); ++i) { + array[i] = static_cast(val); + } + std::vector vec(tensor->numel(), static_cast(val)); + // do sync copy + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, tensor->place()), + tensor->data(), platform::CPUPlace(), array, + tensor->numel() * sizeof(T), nullptr); + delete[] array; + } +} + } // namespace operators } // namespace paddle #endif diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc index 621920731fb603..a7886cdd670d4d 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cc +++ b/paddle/fluid/operators/optimizers/adam_op.cc @@ -151,6 +151,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { "as beta2, this has a higher priority than attr(beta2), the " "shape of this tensor MUST BE [1].") .AsDispensable(); + AddInput("EpsilonTensor", + "(Tensor, optional) If provided, Adam will use this " + "as epsilon, this has a higher priority than attr(epsilon), the " + "shape of 
this tensor MUST BE [1].") + .AsDispensable(); AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); AddOutput("ParamOut", "(Tensor) Output parameter"); @@ -232,4 +237,13 @@ REGISTER_OP_VERSION(adam) paddle::framework::compatible::OpVersionDesc().NewAttr( "multi_precision", "(bool) Whether to use multi-precision during weight updating.", - false)); + false)) + .AddCheckpoint( + R"ROC( + Upgrade adam, add 1 dispensable input [EpsilonTensor]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "EpsilonTensor", + "If provided, Adam will use this as epsilon, " + "this has a higher priority than attr(epsilon). " + "For better performance in npu kernel. ")); diff --git a/paddle/fluid/operators/optimizers/adam_op.cu b/paddle/fluid/operators/optimizers/adam_op.cu index 54aea67f4ea1b3..3d6f0f99a52dfb 100644 --- a/paddle/fluid/operators/optimizers/adam_op.cu +++ b/paddle/fluid/operators/optimizers/adam_op.cu @@ -154,7 +154,7 @@ class AdamOpCUDAKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); - MPDType epsilon = static_cast(ctx.Attr("epsilon")); + auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); auto* mom1 = ctx.Input("Moment1"); @@ -188,6 +188,15 @@ class AdamOpCUDAKernel : public framework::OpKernel { beta2_tensor->numel())); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + MPDType epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); 
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h index 6356911f0676a8..9667db8055b90c 100644 --- a/paddle/fluid/operators/optimizers/adam_op.h +++ b/paddle/fluid/operators/optimizers/adam_op.h @@ -406,7 +406,7 @@ class AdamOpKernel : public framework::OpKernel { int64_t min_row_size_to_use_multithread = ctx.Attr("min_row_size_to_use_multithread"); bool lazy_mode = ctx.Attr("lazy_mode"); - T epsilon = static_cast(ctx.Attr("epsilon")); + auto* param = ctx.Input("Param"); auto* grad_var = ctx.InputVar("Grad"); auto* mom1 = ctx.Input("Moment1"); @@ -440,6 +440,15 @@ class AdamOpKernel : public framework::OpKernel { beta2_tensor->numel())); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + T epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); diff --git a/paddle/fluid/operators/optimizers/adam_op_npu.cc b/paddle/fluid/operators/optimizers/adam_op_npu.cc index 134544c2f65bc3..343a6704388623 100644 --- a/paddle/fluid/operators/optimizers/adam_op_npu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_npu.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/optimizers/adam_op.h" @@ -61,27 +62,71 @@ class AdamNPUKernel : public framework::OpKernel { param_out->mutable_data(ctx.GetPlace()); mom1_out->mutable_data(ctx.GetPlace()); mom2_out->mutable_data(ctx.GetPlace()); - beta1_pow_out->mutable_data(ctx.GetPlace()); - beta2_pow_out->mutable_data(ctx.GetPlace()); - T beta1 = static_cast(ctx.Attr("beta1")); + // NOTE(zhiqiu): beta1_pow and beta2_pow may on CPU and not transform place. + if (beta1_pow->place() == platform::CPUPlace()) { + T beta1 = *beta1_pow->data(); + // `mutable_data` operation needs to be done after getting data + beta1_pow_out->mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(beta1_pow_out, beta1); + } else { + beta1_pow_out->mutable_data(ctx.GetPlace()); + } + if (beta2_pow->place() == platform::CPUPlace()) { + T beta2 = *beta2_pow->data(); + beta2_pow_out->mutable_data(ctx.GetPlace()); + FillNpuTensorWithConstant(beta2_pow_out, beta2); + } else { + beta2_pow_out->mutable_data(ctx.GetPlace()); + } + + const Tensor* beta1_tensor = nullptr; + const Tensor* beta2_tensor = nullptr; + const Tensor* epsilon_tensor = nullptr; + + Tensor beta1_tmp(framework::proto::VarType::FP32); + Tensor beta2_tmp(framework::proto::VarType::FP32); + Tensor epsilon_tmp(framework::proto::VarType::FP32); + if (ctx.HasInput("Beta1Tensor")) { - auto* beta1_tensor = ctx.Input("Beta1Tensor"); + beta1_tensor = ctx.Input("Beta1Tensor"); PADDLE_ENFORCE_EQ(beta1_tensor->numel(), 1, platform::errors::InvalidArgument( "Input(Beta1Tensor) size must be 1, but get %d", beta1_tensor->numel())); - beta1 = static_cast(GetAttrFromTensor(beta1_tensor)); + } else { + T beta1 = static_cast(ctx.Attr("beta1")); + beta1_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta1_tmp, beta1); + beta1_tensor = &beta1_tmp; } - T beta2 = static_cast(ctx.Attr("beta2")); + if 
(ctx.HasInput("Beta2Tensor")) { - auto* beta2_tensor = ctx.Input("Beta2Tensor"); - PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, + beta2_tensor = ctx.Input("Beta2Tensor"); + PADDLE_ENFORCE_EQ(beta2_tensor->numel(), 1, platform::errors::InvalidArgument( "Input(Beta2Tensor) size must be 1, but get %d", beta2_tensor->numel())); - beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); + } else { + T beta2 = static_cast(ctx.Attr("beta2")); + beta2_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&beta2_tmp, beta2); + beta2_tensor = &beta2_tmp; + } + + if (ctx.HasInput("EpsilonTensor")) { + epsilon_tensor = ctx.Input("EpsilonTensor"); + PADDLE_ENFORCE_EQ(epsilon_tensor->numel(), 1, + platform::errors::InvalidArgument( + "Input(EpsilonTensor) size must be 1, but get %d", + epsilon_tensor->numel())); + } else { + T epsilon = static_cast(ctx.Attr("epsilon")); + epsilon_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&epsilon_tmp, epsilon); + epsilon_tensor = &epsilon_tmp; } + VLOG(3) << "beta1_pow.numel() : " << beta1_pow->numel() << "beta2_pow.numel() : " << beta2_pow->numel(); VLOG(3) << "param.numel(): " << param->numel(); @@ -97,21 +142,6 @@ class AdamNPUKernel : public framework::OpKernel { "beta2 pow output size should be 1, but received " "value is:%d.", beta2_pow_out->numel())); - - // reshape - Tensor beta1_tensor(framework::proto::VarType::FP32); - beta1_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{beta1}, ctx.device_context(), - &beta1_tensor); - Tensor beta2_tensor(framework::proto::VarType::FP32); - beta2_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{beta2}, ctx.device_context(), - &beta2_tensor); - - Tensor epsilon_tensor(framework::proto::VarType::FP32); - epsilon_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{epsilon}, ctx.device_context(), - &epsilon_tensor); auto stream = ctx.template device_context() .stream(); @@ -119,7 +149,7 @@ class
AdamNPUKernel : public framework::OpKernel { NpuOpRunner("ApplyAdamD", { *param, *mom1, *mom2, *beta1_pow, *beta2_pow, *lr, - beta1_tensor, beta2_tensor, epsilon_tensor, *grad, + *beta1_tensor, *beta2_tensor, *epsilon_tensor, *grad, }, { *param_out, *mom1_out, *mom2_out, @@ -130,22 +160,25 @@ class AdamNPUKernel : public framework::OpKernel { // NOTE(zhiqiu): ApplyAdamD updates params inplace, so // if param and param_out is not same, we need to do copy. if (param_out->data() != param->data()) { - ctx.template device_context().Wait(); - framework::TensorCopySync(*param, ctx.GetPlace(), param_out); + framework::TensorCopy( + *param, ctx.GetPlace(), + ctx.template device_context(), param_out); } if (mom1_out->data() != mom1->data()) { - ctx.template device_context().Wait(); - framework::TensorCopySync(*mom1, ctx.GetPlace(), mom1_out); + framework::TensorCopy( + *mom1, ctx.GetPlace(), + ctx.template device_context(), mom1_out); } if (mom2_out->data() != mom2->data()) { - ctx.template device_context().Wait(); - framework::TensorCopySync(*mom2, ctx.GetPlace(), mom2_out); + framework::TensorCopy( + *mom2, ctx.GetPlace(), + ctx.template device_context(), mom2_out); } auto runner_m1 = - NpuOpRunner("Mul", {*beta1_pow, beta1_tensor}, {*beta1_pow_out}, {}); + NpuOpRunner("Mul", {*beta1_pow, *beta1_tensor}, {*beta1_pow_out}, {}); runner_m1.Run(stream); auto runner_m2 = - NpuOpRunner("Mul", {*beta2_pow, beta2_tensor}, {*beta2_pow_out}, {}); + NpuOpRunner("Mul", {*beta2_pow, *beta2_tensor}, {*beta2_pow_out}, {}); runner_m2.Run(stream); } }; diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 3baba424e8f43d..09f117374499b0 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -35,8 +35,6 @@ class AdamOpXPUKernel : public framework::OpKernel { framework::ToTypeName(param_var->Type()))); using paddle::framework::LoDTensor; - T epsilon = 
static_cast(ctx.Attr("epsilon")); - auto& param = GET_DATA_SAFELY(ctx.Input("Param"), "Input", "Param", "Adam"); // auto& grad = Ref(ctx.Input("Grad"), "Must set Grad"); @@ -85,6 +83,11 @@ class AdamOpXPUKernel : public framework::OpKernel { auto* beta2_tensor = ctx.Input("Beta2Tensor"); beta2 = static_cast(GetAttrFromTensor(beta2_tensor)); } + T epsilon = static_cast(ctx.Attr("epsilon")); + if (ctx.HasInput("EpsilonTensor")) { + auto* epsilon_tensor = ctx.Input("EpsilonTensor"); + epsilon = static_cast(GetAttrFromTensor(epsilon_tensor)); + } if (grad_var->IsType()) { auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Adam"); diff --git a/paddle/fluid/operators/optimizers/sgd_op_npu.cc b/paddle/fluid/operators/optimizers/sgd_op_npu.cc index b7aaff5d457918..a8d19148ef520c 100644 --- a/paddle/fluid/operators/optimizers/sgd_op_npu.cc +++ b/paddle/fluid/operators/optimizers/sgd_op_npu.cc @@ -44,8 +44,9 @@ class SGDNPUKernel : public framework::OpKernel { // NOTE(zhiqiu): ApplyGradientDescent updates params inplace, so // if param and param_out is not same, we need to do copy. 
if (param_out->data() != param_var->data()) { - ctx.template device_context().Wait(); - framework::TensorCopySync(*param_var, ctx.GetPlace(), param_out); + framework::TensorCopy( + *param_var, ctx.GetPlace(), + ctx.template device_context(), param_out); } } }; diff --git a/paddle/fluid/operators/py_layer_op.cc b/paddle/fluid/operators/py_layer_op.cc index 0d5c23bed6016e..f91496eeab1420 100644 --- a/paddle/fluid/operators/py_layer_op.cc +++ b/paddle/fluid/operators/py_layer_op.cc @@ -60,33 +60,57 @@ void RunPyObject(py::object *py_object, outs->size(), result_tuple.size())); } for (size_t i = 0; i < result_tuple.size(); i++) { - if (Py_None != result_tuple[i].ptr()) { + if ((*outs)[i] != nullptr) { + if (Py_None != result_tuple[i].ptr()) { + try { + auto result_var = + result_tuple[i].cast>(); + *(*outs)[i] = result_var->Var(); + } catch (py::cast_error &) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The output of `PyLayer.backward` should be `Tensor`.")); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth input tensor of forward needs gradient and the " + "corresponding gradient cannot be None.", + i)); + } + } else { + if (Py_None != result_tuple[i].ptr()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The %dth input tensor of forward do not need gradient and the " + "corresponding gradient should be `None`.", + i)); + } + } + } + } else { + if (1 != outs->size()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The number of outputs of `PyLayer.backward` should be %d, but " + "received 1.", + outs->size())); + } + if ((*outs)[0] != nullptr) { + if (Py_None != py_result.ptr()) { try { auto result_var = - result_tuple[i].cast>(); - *(*outs)[i] = result_var->Var(); + py_result.cast>(); + *((*outs)[0]) = result_var->Var(); } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::Unimplemented( + PADDLE_THROW(platform::errors::InvalidArgument( "The output of `PyLayer.backward` should be `Tensor`.")); } } else { - 
PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` can not be `None`.")); - } - } - } else { - if (Py_None != py_result.ptr()) { - try { - auto result_var = - py_result.cast>(); - *((*outs)[0]) = result_var->Var(); - } catch (py::cast_error &) { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` should be `Tensor`.")); + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor of forward needs gradient, so the output of " + "`PyLayer.backward` can not be `None`.")); } } else { - PADDLE_THROW(platform::errors::Unimplemented( - "The output of `PyLayer.backward` can not be `None`.")); + PADDLE_THROW(platform::errors::InvalidArgument( + "The input tensor of forward do not need gradient, so the output of " + "`PyLayer.backward` should be `None`.")); } } } @@ -133,9 +157,12 @@ class PyLayerOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { auto &op_ = ctx.GetOp(); - auto pylayer_op = dynamic_cast(&op_); - if (pylayer_op) { - auto py_layer_context = pylayer_op->GetPyLayerContext(); + auto const_pylayer_op = dynamic_cast(&op_); + if (const_pylayer_op) { + auto pylayer_op = const_cast(const_pylayer_op); + + // Release contex after executing the compute + auto py_layer_context = pylayer_op->ReleasePyLayerContext(); py::object bk_ctx(py::handle(py_layer_context->GetMutableCtx()), true); auto &input_vars = ctx.MultiInputVar("X"); auto output_vars = ctx.MultiOutputVar("Out"); diff --git a/paddle/fluid/operators/py_layer_op.h b/paddle/fluid/operators/py_layer_op.h index 133435aa84d71e..d80faab90b2236 100644 --- a/paddle/fluid/operators/py_layer_op.h +++ b/paddle/fluid/operators/py_layer_op.h @@ -34,6 +34,10 @@ class PyLayerContext { PyLayerContext() = delete; PyObject* GetMutableCtx() { return context_; } + ~PyLayerContext() { + py::gil_scoped_acquire guard; + Py_XDECREF(context_); + } private: PyObject* context_; @@ -58,8 +62,11 @@ class 
PyLayerOp : public framework::OperatorWithKernel { void SetPyLayerContext(const std::shared_ptr& py_context) { py_context_ = py_context; } - const std::shared_ptr& GetPyLayerContext() const { - return py_context_; + std::shared_ptr ReleasePyLayerContext() { + auto temp = py_context_; + py_context_.reset(); + VLOG(3) << "`py_context_` in the PyLayerOp is released."; + return temp; } private: diff --git a/paddle/fluid/operators/range_op_npu.cc b/paddle/fluid/operators/range_op_npu.cc index 228372e1e93e03..a9a2effd2eb9db 100644 --- a/paddle/fluid/operators/range_op_npu.cc +++ b/paddle/fluid/operators/range_op_npu.cc @@ -39,11 +39,23 @@ class RangeNPUKernel : public framework::OpKernel { auto* out = context.Output("Out"); framework::Tensor n; - framework::TensorCopySync(*start_t, platform::CPUPlace(), &n); + framework::TensorCopy( + *start_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); T start = n.data()[0]; - framework::TensorCopySync(*end_t, platform::CPUPlace(), &n); + framework::TensorCopy( + *end_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); T end = n.data()[0]; - framework::TensorCopySync(*step_t, platform::CPUPlace(), &n); + framework::TensorCopy( + *step_t, platform::CPUPlace(), + context.template device_context(), &n); + context.template device_context() + .Wait(); T step = n.data()[0]; int64_t size = 0; diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 562a560b2f1548..f2f395314c0cc8 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -87,6 +87,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx, TEST(range, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx, "range"); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx, 
"range"); } diff --git a/paddle/fluid/operators/read_file_op.cc b/paddle/fluid/operators/read_file_op.cc new file mode 100644 index 00000000000000..6da92ed7df7d8e --- /dev/null +++ b/paddle/fluid/operators/read_file_op.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { + +template +class CPUReadFileKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto filename = ctx.Attr("filename"); + + std::ifstream input(filename.c_str(), + std::ios::in | std::ios::binary | std::ios::ate); + std::streamsize file_size = input.tellg(); + + input.seekg(0, std::ios::beg); + + auto* out = ctx.Output("Out"); + std::vector out_shape = {file_size}; + out->Resize(framework::make_ddim(out_shape)); + + uint8_t* data = out->mutable_data(ctx.GetPlace()); + + input.read(reinterpret_cast(data), file_size); + } +}; + +class ReadFileOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + 
platform::errors::InvalidArgument( "Output(Out) of ReadFileOp is null.")); + + auto out_dims = std::vector(1, -1); + ctx->SetOutputDim("Out", framework::make_ddim(out_dims)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::UINT8, + platform::CPUPlace()); + } +}; + +class ReadFileOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddOutput("Out", "The output tensor of ReadFile op"); + AddComment(R"DOC( +This operator reads a file. +)DOC"); + AddAttr("filename", "Path of the file to be read.") + .SetDefault({}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR( + read_file, ops::ReadFileOp, ops::ReadFileOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker) + +REGISTER_OP_CPU_KERNEL(read_file, ops::CPUReadFileKernel) diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index b29493404f4536..f5d55791d86c68 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -53,9 +53,25 @@ BufferedReader::BufferedReader( stream_ = platform::CudaStreamResourcePool::Instance().New(dev_idx); } #endif + +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(place_)) { + int dev_idx = BOOST_GET_CONST(platform::NPUPlace, place_).device; + compute_stream_ = + ((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance() + .Get(place_))) + ->stream(); + events_.resize(buffer_size); + for (auto &event : events_) { + event = platform::NpuEventResourcePool::Instance().New(dev_idx); + } + stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx); + } +#endif is_same_place_ = false; cpu_buffer_.resize(buffer_size); cuda_buffer_.resize(buffer_size); +
npu_buffer_.resize(buffer_size); ReadTillBufferFullAsync(); } @@ -196,7 +212,59 @@ void BufferedReader::ReadAsync(size_t i) { #endif } } -#endif // @} End Group GPU Place +#endif + +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(place_)) { + TensorVec &npu = npu_buffer_[i]; + if (npu.empty()) { + npu.resize(cpu.size()); + } else { + PADDLE_ENFORCE_EQ( + npu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on NPU and CPU devices are not matched. " + "The number on NPU is %d, on CPU is %d", + npu.size(), cpu.size())); + } + + std::vector npu_ptrs; + npu_ptrs.reserve(cpu.size()); + for (size_t i = 0; i < cpu.size(); ++i) { + npu[i].Resize(cpu[i].dims()); + npu[i].set_layout(cpu[i].layout()); + npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type())); + } + + platform::SetNPUDeviceId( + BOOST_GET_CONST(platform::NPUPlace, place_).device); + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtRecordEvent(events_[i].get(), compute_stream_)); + PADDLE_ENFORCE_NPU_SUCCESS( + aclrtStreamWaitEvent(stream_.get(), events_[i].get())); + + platform::RecordEvent record_event("BufferedReader:MemoryCopy"); + for (size_t i = 0; i < cpu.size(); ++i) { + auto cpu_place = cpu[i].place(); + auto cpu_ptr = cpu[i].data(); + auto npu_ptr = npu_ptrs[i]; + auto size = + cpu[i].numel() * paddle::framework::SizeOfType(cpu[i].type()); + if ((platform::is_npu_place(cpu_place))) { + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr, + BOOST_GET_CONST(platform::NPUPlace, cpu_place), cpu_ptr, + size, stream_.get()); + } else { + memory::Copy(BOOST_GET_CONST(platform::NPUPlace, place_), npu_ptr, + BOOST_GET_CONST(platform::CPUPlace, cpu_place), cpu_ptr, + size, stream_.get()); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get())); + } + npu[i].set_lod(cpu[i].lod()); + } + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeStream(stream_.get())); + } +#endif return i; })); } @@ -228,9 +296,13 @@ void BufferedReader::ReadNextImpl(std::vector 
*out) { return; } - *out = std::move((platform::is_gpu_place(place_) && !is_same_place_) - ? cuda_buffer_[i] - : cpu_buffer_[i]); + if (platform::is_gpu_place(place_) && !is_same_place_) { + *out = std::move(cuda_buffer_[i]); + } else if (platform::is_npu_place(place_) && !is_same_place_) { + *out = std::move(npu_buffer_[i]); + } else { + *out = std::move(cpu_buffer_[i]); + } // Do not push current position into ReadAsync. Push the previous position // Since all computation in fluid are async, change the data of diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index fbc46aceb81305..9f7b0e753281eb 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -25,7 +25,10 @@ #include "paddle/fluid/platform/cuda_resource_pool.h" #include "paddle/fluid/platform/gpu_info.h" #endif - +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/npu_resource_pool.h" +#endif namespace paddle { namespace operators { namespace reader { @@ -67,12 +70,19 @@ class BufferedReader : public framework::DecoratedReader { bool is_same_place_; std::vector cpu_buffer_; std::vector cuda_buffer_; + std::vector npu_buffer_; size_t prev_pos_{-1UL}; #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) gpuStream_t compute_stream_; std::shared_ptr stream_; std::vector> events_; #endif + +#ifdef PADDLE_WITH_ASCEND_CL + aclrtStream compute_stream_; + std::shared_ptr stream_; + std::vector> events_; +#endif }; } // namespace reader diff --git a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc index 98a68ca69cafd0..1aa93c80387e65 100644 --- a/paddle/fluid/operators/reader/reader_blocking_queue_test.cc +++ b/paddle/fluid/operators/reader/reader_blocking_queue_test.cc @@ -68,7 +68,7 @@ TEST(BlockingQueue, SenderBlockingTest) { ++send_count; } }); - 
std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + std::this_thread::sleep_for(std::chrono::milliseconds(1500)); q.Close(); sender.join(); EXPECT_EQ(send_count, queue_cap); diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc index a9eed0d7eb0427..dfba933940bd02 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mean_mkldnn_op.cc @@ -25,6 +25,32 @@ class ReduceMeanMKLDNNKernel : public ReduceMKLDNNKernel { } }; +template +class ReduceMeanGradMKLDNNKernel : public ReduceGradMKLDNNKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const auto* input_x = ctx.Input("X"); + auto input_dims = framework::vectorize(input_x->dims()); + auto reduce_dims = ctx.Attr>("dim"); + + int number_of_elements = 1; + if (!ctx.Attr("reduce_all")) { + for (size_t i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = (reduce_dims[i] >= 0) + ? 
reduce_dims[i] + : input_dims.size() + reduce_dims[i]; + number_of_elements *= input_dims[reduce_dims[i]]; + } + } else { + number_of_elements = input_x->numel(); + } + + this->RunKernel(ctx, dnnl::algorithm::binary_add, + dnnl::algorithm::reduction_mean, 0.0f, + 1.0L / number_of_elements); + } +}; + } // namespace operators } // namespace paddle @@ -32,3 +58,7 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(reduce_mean, MKLDNN, paddle::platform::CPUPlace, ops::ReduceMeanMKLDNNKernel, ops::ReduceMeanMKLDNNKernel); + +REGISTER_OP_KERNEL(reduce_mean_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReduceMeanGradMKLDNNKernel, + ops::ReduceMeanGradMKLDNNKernel); diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h index 7e09aaa126effe..40cd3ba974f04c 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_mkldnn_op.h @@ -21,6 +21,27 @@ using paddle::framework::LoDTensor; using paddle::framework::Tensor; using platform::to_void_cast; +inline std::vector CalculateReducedDims(const Tensor* input, + const Tensor* output, + std::vector& reduce_dims, + bool reduce_all, + bool keep_dim) { + if (keep_dim) return framework::vectorize(output->dims()); + + if (reduce_all) + return std::vector(framework::vectorize(input->dims()).size(), 1); + + std::vector output_dims(framework::vectorize(input->dims())); + for (size_t i = 0; i < reduce_dims.size(); ++i) { + reduce_dims[i] = (reduce_dims[i] >= 0) + ? 
reduce_dims[i] + : input->dims().size() + reduce_dims[i]; + output_dims[reduce_dims[i]] = 1; + } + + return output_dims; +} + template class ReduceMKLDNNKernel : public framework::OpKernel { public: @@ -37,9 +58,8 @@ class ReduceMKLDNNKernel : public framework::OpKernel { bool reduce_all = ctx.Attr("reduce_all"); bool keep_dim = ctx.Attr("keep_dim"); - std::vector output_dims = - CalculateOutputDims(input, output, reduce_dims, reduce_all, keep_dim); - + auto output_dims = + CalculateReducedDims(input, output, reduce_dims, reduce_all, keep_dim); auto input_dims = framework::vectorize(input->dims()); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); @@ -96,28 +116,98 @@ class ReduceMKLDNNKernel : public framework::OpKernel { paddle::framework::vectorize(output->dims())))); } } +}; + +template +class ReduceGradMKLDNNKernel : public framework::OpKernel { + public: + void RunKernel(const framework::ExecutionContext& ctx, + dnnl::algorithm binary_type, dnnl::algorithm reduction_type, + float scale_x, float scale_y) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + bool keep_dim = ctx.Attr("keep_dim"); + bool reduce_all = ctx.Attr("reduce_all"); + auto dims = ctx.Attr>("dim"); + auto* input_dy = ctx.Input(framework::GradVarName("Out")); + auto* output_dx = ctx.Output(framework::GradVarName("X")); + + mkldnn::memory::format_tag x_format_tag; + auto input_dims = + CalculateReducedDims(output_dx, input_dy, dims, reduce_all, keep_dim); + + if (input_dims != framework::vectorize(output_dx->dims())) { + const std::string key_pd = + platform::CreateKey( + dev_ctx, framework::vectorize(output_dx->dims()), + ctx.InputName("X"), + (std::to_string(static_cast(reduction_type)))) + + "@fwd_pd"; + std::shared_ptr fwd_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_pd)); + + PADDLE_ENFORCE_NOT_NULL( + fwd_pd, platform::errors::Unavailable( + "Forward primitive descriptor is not available in %s 
op, " + "cannot deduce memory format tag", + ctx.Type())); + + x_format_tag = platform::GetMKLDNNFormat(fwd_pd->src_desc()); + + PADDLE_ENFORCE_NE(x_format_tag, mkldnn::memory::format_tag::undef, + platform::errors::InvalidArgument( + "Cannot deduce format tag for %s op", ctx.Type())); + } else { // fwd descriptor not available because reorder was used instead + // of reduction + x_format_tag = getPlainFormatTag(output_dx); + } + + output_dx->mutable_data(ctx.GetPlace()); + output_dx->set_format(x_format_tag); + output_dx->set_layout(input_dy->layout()); + + platform::BroadcastDataMKLDNNHandler handler( + binary_type, dev_ctx, onednn_engine, ctx.GetPlace(), output_dx, + input_dy, scale_x, scale_y, + ctx.InputName(framework::GradVarName("Out")), input_dims); + + const auto src_dx_memory = handler.AcquireSrcMemory(output_dx); + const auto src_dy_memory = handler.AcquireSecondSrcMemory(input_dy); + const auto binary_prim = handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_dx_memory}, + {DNNL_ARG_SRC_1, *src_dy_memory}, + {DNNL_ARG_DST, *src_dx_memory}}; + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + binary_prim->execute(astream, args); + astream.wait(); + } - private: - std::vector CalculateOutputDims(const Tensor* input, - const Tensor* output, - std::vector& reduce_dims, - bool reduce_all, - bool keep_dim) const { - if (keep_dim) return framework::vectorize(output->dims()); - - if (reduce_all) - return std::vector(framework::vectorize(input->dims()).size(), - 1); - - std::vector output_dims(framework::vectorize(input->dims())); - for (size_t i = 0; i < reduce_dims.size(); ++i) { - reduce_dims[i] = (reduce_dims[i] >= 0) - ? 
reduce_dims[i] - : input->dims().size() + reduce_dims[i]; - output_dims[reduce_dims[i]] = 1; + protected: + mkldnn::memory::format_tag getPlainFormatTag(const Tensor* tensor) const { + auto tensor_dims_size = tensor->dims().size(); + PADDLE_ENFORCE_EQ( + tensor_dims_size <= 5 && tensor_dims_size >= 1, true, + platform::errors::InvalidArgument( + "Dims for reduction_grad oneDNN op must be in range <1, 5>")); + + switch (tensor_dims_size) { + case 1: + return mkldnn::memory::format_tag::a; + case 2: + return mkldnn::memory::format_tag::ab; + case 3: + return mkldnn::memory::format_tag::abc; + case 4: + return mkldnn::memory::format_tag::abcd; } - return output_dims; + return mkldnn::memory::format_tag::abcde; } }; diff --git a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc index 4676589e68910a..3f92d39ede1ae8 100644 --- a/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc +++ b/paddle/fluid/operators/reduce_ops/mkldnn/reduce_sum_mkldnn_op.cc @@ -25,6 +25,15 @@ class ReduceSumMKLDNNKernel : public ReduceMKLDNNKernel { } }; +template +class ReduceSumGradMKLDNNKernel : public ReduceGradMKLDNNKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + this->RunKernel(ctx, dnnl::algorithm::binary_add, + dnnl::algorithm::reduction_sum, 0.0f, 1.0f); + } +}; + } // namespace operators } // namespace paddle @@ -32,3 +41,7 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(reduce_sum, MKLDNN, paddle::platform::CPUPlace, ops::ReduceSumMKLDNNKernel, ops::ReduceSumMKLDNNKernel); + +REGISTER_OP_KERNEL(reduce_sum_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReduceSumGradMKLDNNKernel, + ops::ReduceSumGradMKLDNNKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index d408ff3988f030..1eeeb5e1f8aa19 100644 --- 
a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -78,6 +78,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(reduce_any, NPU) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 280464ea852793..390c4d9709a60f 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -560,14 +560,28 @@ class ReduceGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { int in_dtype = ctx.Attr("in_dtype"); - if (in_dtype >= 0) { - return framework::OpKernelType( - static_cast(in_dtype), - ctx.GetPlace()); + auto input_data_type = + (in_dtype >= 0) ? 
static_cast(in_dtype) + : OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + auto CanMKLDNNReduceGradBeUsed = [&]() { + auto dx_dims = ctx.Input("X")->dims(); + + if (dx_dims.size() > 5) return false; // max 5D tensor is supported + + return true; + }; + if (this->CanMKLDNNBeUsed(ctx, input_data_type) && + CanMKLDNNReduceGradBeUsed()) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); } - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); +#endif + + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/rnn_op_xpu.cc b/paddle/fluid/operators/rnn_op_xpu.cc new file mode 100644 index 00000000000000..fb82d18e62f3bf --- /dev/null +++ b/paddle/fluid/operators/rnn_op_xpu.cc @@ -0,0 +1,314 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/xpu_header.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +using TensorList = std::vector; + +template +void reset_parameter_vector(const std::vector& raw_params_vec, + const int& num_layers, const bool& is_bidirec, + std::vector>* params_vec) { + // the parameter raw seuquence is [FWhi, FWhh, BWhi, BWhh] * num_layers + // + [FBhi, FBhh, BBhi, BBhh] * num_layers, we will reset the parameter to + // ([FWhi, FWhh, FBhi, FBhh] + [BWhi, BWhh, BBhi, BBhh]) * num_layers + const int& direction_num = is_bidirec ? 2 : 1; + const int& layer_weight_size = 4 * direction_num; + const int& all_weight_size = num_layers * layer_weight_size; + const int& bias_start_idx = all_weight_size / 2; + for (int i = 0; i < num_layers; i++) { + params_vec->at(i).resize(layer_weight_size); + for (int j = 0; j < layer_weight_size; j++) { + int k = j % 4; + const int& section = j / 4; + int tensor_idx = i * 2 * direction_num + section * 2 + k % 2; + if (k >= 2) { + tensor_idx += bias_start_idx; + } + using remove_cv_t = typename std::remove_cv::type; + params_vec->at(i)[j] = + raw_params_vec[tensor_idx]->template data(); + } + } +} + +template +class RnnXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto state = ctx.MultiOutput("State"); + auto* output = ctx.Output("Out"); + auto* reserve_data = ctx.Output("Reserve"); + const int& num_layers = ctx.Attr("num_layers"); + const bool& is_bidirec = ctx.Attr("is_bidirec"); + const int& hidden_size = ctx.Attr("hidden_size"); + const std::string& mode = ctx.Attr("mode"); + 
+ bool has_seq_length = ctx.HasInput("SequenceLength"); + const Tensor* sequence_length = nullptr; + if (has_seq_length) { + sequence_length = ctx.Input("SequenceLength"); + } + + PADDLE_ENFORCE_EQ( + mode, "LSTM", + platform::errors::InvalidArgument( + "XPU only support LSTM mode now, current mode is %s", mode)); + + PADDLE_ENFORCE_EQ(is_bidirec, false, + platform::errors::InvalidArgument( + "XPU only support unidirectional LSTM now")); + + PADDLE_ENFORCE_EQ( + num_layers, 1, + platform::errors::InvalidArgument( + "XPU only support 1 layer LSTM now, current layer num is %s", + num_layers)); + + auto init_h = pre_state[0]; + auto init_c = pre_state[1]; + auto last_h = state[0]; + auto last_c = state[1]; + + // check shape + int seq_len = input->dims()[0]; + int batch_size = input->dims()[1]; + int input_dim = input->dims()[2]; + + PADDLE_ENFORCE_EQ( + init_h->dims()[0], num_layers, + platform::errors::InvalidArgument("The num_layers of in RNN layer must" + " be the same as first dim of init " + "hidden, but received num_layers:%d," + " dim:%d", + num_layers, init_h->dims()[0])); + + PADDLE_ENFORCE_EQ( + init_c->dims()[0], num_layers, + platform::errors::InvalidArgument( + "The num_layers of in RNN layer must" + " be the same as first dim of cell state hidden, but received" + " num_layers:%d, dim:%d", + num_layers, init_c->dims()[0])); + + std::vector> parameter_lists; + parameter_lists.resize(num_layers); + reset_parameter_vector(weight_list, num_layers, is_bidirec, + ¶meter_lists); + + // init the output and allocate the memory + output->mutable_data(ctx.GetPlace()); + last_h->mutable_data(ctx.GetPlace()); + last_c->mutable_data(ctx.GetPlace()); + reserve_data->Resize({seq_len * batch_size * hidden_size * 5}); + reserve_data->mutable_data(ctx.GetPlace()); + + // get ptr from tensor + auto x = input->data(); + auto h_0 = init_h->data(); + auto c_0 = init_c->data(); + auto w_x = parameter_lists[0][0]; + auto w_h = parameter_lists[0][1]; + auto b_x = 
parameter_lists[0][2]; + auto b_h = parameter_lists[0][3]; + auto y = output->data(); + auto last_h_ptr = last_h->data(); + auto last_c_ptr = last_c->data(); + auto i_f_g_o = reserve_data->data(); + auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4; + + std::vector seq_len_tensor(batch_size, seq_len); + if (has_seq_length) { + seq_len_tensor = operators::GetDataFromTensor(sequence_length); + } + + // run kernel + auto& dev_ctx = ctx.template device_context(); + int r = xpu::lstm_train( + dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, + (const T*)w_x, (const T*)w_h, (const T*)b_x, (const T*)b_h, + reinterpret_cast(y), reinterpret_cast(last_h_ptr), + reinterpret_cast(last_c_ptr), batch_size, input_dim, hidden_size, + seq_len, seq_len_tensor, nullptr, nullptr, nullptr, nullptr, + reinterpret_cast(i_f_g_o), reinterpret_cast(c)); + PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, + platform::errors::External("RnnXPU(lstm) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +template +class RnnXPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // get the tensor pointer for the input + auto* input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto* output = ctx.Input("Out"); + auto* reserve_data = ctx.Input("Reserve"); + const int& num_layers = ctx.Attr("num_layers"); + const bool& is_bidirec = ctx.Attr("is_bidirec"); + const int& hidden_size = ctx.Attr("hidden_size"); + const std::string& mode = ctx.Attr("mode"); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + const Tensor* sequence_length = nullptr; + if (has_seq_length) { + sequence_length = ctx.Input("SequenceLength"); + } + + PADDLE_ENFORCE_EQ( + mode, "LSTM", + platform::errors::InvalidArgument( + "XPU only support LSTM mode now, current mode is %s", mode)); + + PADDLE_ENFORCE_EQ(is_bidirec, false, + 
platform::errors::InvalidArgument( + "XPU only support unidirectional LSTM now")); + + PADDLE_ENFORCE_EQ( + num_layers, 1, + platform::errors::InvalidArgument( + "XPU only support 1 layer LSTM now, current layer num is %s", + num_layers)); + + auto init_h = pre_state[0]; + auto init_c = pre_state[1]; + + auto output_grad = ctx.Input(framework::GradVarName("Out")); + auto state_grad = ctx.MultiInput(framework::GradVarName("State")); + auto last_h_grad = state_grad[0]; + auto last_c_grad = state_grad[1]; + + // get the tensor pointer for the output + auto* input_grad = ctx.Output(framework::GradVarName("Input")); + auto weight_grad_list = ctx.MultiOutput( + framework::GradVarName("WeightList")); + auto pre_state_grad = + ctx.MultiOutput(framework::GradVarName("PreState")); + Tensor* init_h_grad = nullptr; + Tensor* init_c_grad = nullptr; + if (pre_state_grad.size() > 0) { // has gradient + init_h_grad = pre_state_grad[0]; + init_c_grad = pre_state_grad[1]; + } + + // check shape + int seq_len = input->dims()[0]; + int batch_size = input->dims()[1]; + int input_dim = input->dims()[2]; + + PADDLE_ENFORCE_EQ( + init_h->dims()[0], num_layers, + platform::errors::InvalidArgument("The num_layers of in RNN layer must" + " be the same as first dim of init " + "hidden, but received num_layers:%d," + " dim:%d", + num_layers, init_h->dims()[0])); + + PADDLE_ENFORCE_EQ( + init_c->dims()[0], num_layers, + platform::errors::InvalidArgument( + "The num_layers of in RNN layer must" + " be the same as first dim of cell state hidden, but received" + " num_layers:%d, dim:%d", + num_layers, init_c->dims()[0])); + + std::vector> parameter_lists; + parameter_lists.resize(num_layers); + reset_parameter_vector(weight_list, num_layers, is_bidirec, + ¶meter_lists); + + for (unsigned int i = 0; i < weight_grad_list.size(); ++i) { + weight_grad_list[i]->mutable_data(ctx.GetPlace()); + } + std::vector> parameter_lists_grad; + parameter_lists_grad.resize(num_layers); + 
reset_parameter_vector(weight_grad_list, num_layers, is_bidirec, + ¶meter_lists_grad); + + // allocate the memory and initization the input_grad + input_grad->mutable_data(input->dims(), ctx.GetPlace()); + if (init_h_grad) { + init_h_grad->mutable_data(init_h->dims(), ctx.GetPlace()); + } + if (init_c_grad) { + init_c_grad->mutable_data(init_c->dims(), ctx.GetPlace()); + } + + // get ptr from tensor + auto x = input->data(); + auto h_0 = init_h->data(); + auto c_0 = init_c->data(); + auto w_x = parameter_lists[0][0]; + auto w_h = parameter_lists[0][1]; + auto y = output->data(); + auto y_grad = output_grad->data(); + auto last_h_grad_ptr = last_h_grad->data(); + auto last_c_grad_ptr = last_c_grad->data(); + auto x_grad = input_grad->data(); + auto h_0_grad = init_h_grad ? init_h_grad->data() : nullptr; + auto c_0_grad = init_c_grad ? init_c_grad->data() : nullptr; + auto w_x_grad = parameter_lists_grad[0][0]; + auto w_h_grad = parameter_lists_grad[0][1]; + auto b_x_grad = parameter_lists_grad[0][2]; + auto b_h_grad = parameter_lists_grad[0][3]; + auto i_f_g_o = reserve_data->data(); + auto c = i_f_g_o + seq_len * batch_size * hidden_size * 4; + + std::vector seq_len_tensor(batch_size, seq_len); + if (has_seq_length) { + seq_len_tensor = operators::GetDataFromTensor(sequence_length); + } + + auto& dev_ctx = ctx.template device_context(); + int r = xpu::lstm_grad( + dev_ctx.x_context(), (const T*)x, (const T*)h_0, (const T*)c_0, + (const T*)w_x, (const T*)w_h, (const T*)y, (const T*)y_grad, + (const T*)last_h_grad_ptr, (const T*)last_c_grad_ptr, + reinterpret_cast(x_grad), reinterpret_cast(h_0_grad), + reinterpret_cast(c_0_grad), w_x_grad, w_h_grad, b_x_grad, b_h_grad, + batch_size, input_dim, hidden_size, seq_len, seq_len_tensor, nullptr, + nullptr, nullptr, nullptr, i_f_g_o, c); + PADDLE_ENFORCE_EQ( + r, xpu::Error_t::SUCCESS, + platform::errors::External("RnnXPUGrad(lstm) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +} // namespace 
operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_XPU_KERNEL( + rnn, ops::RnnXPUKernel); +REGISTER_OP_XPU_KERNEL( + rnn_grad, ops::RnnXPUGradKernel); + +#endif // PADDLE_WITH_XPU diff --git a/paddle/fluid/operators/save_combine_op_npu.cc b/paddle/fluid/operators/save_combine_op_npu.cc new file mode 100644 index 00000000000000..1fb136a5110dbd --- /dev/null +++ b/paddle/fluid/operators/save_combine_op_npu.cc @@ -0,0 +1,24 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_combine_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + save_combine, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel, + ops::SaveCombineOpKernel); diff --git a/paddle/fluid/operators/save_op_npu.cc b/paddle/fluid/operators/save_op_npu.cc new file mode 100644 index 00000000000000..90db1a0bb85d60 --- /dev/null +++ b/paddle/fluid/operators/save_op_npu.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/save_op.h" +#include "paddle/fluid/platform/float16.h" + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + save, ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel, + ops::SaveOpKernel); diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index ee7210a7784d72..cbfd11834ae477 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -34,6 +34,8 @@ class ScaleNPUKernel : public framework::OpKernel { ctx.template device_context() .stream(); float _power = 1.0; + VLOG(4) << "scale:" << scale << ", bias:" << bias + << " ,bias_after_scale:" << bias_after_scale; if (bias_after_scale) { out->mutable_data(ctx.GetPlace()); auto runner = diff --git a/paddle/fluid/operators/scatter_nd_add_op.cc b/paddle/fluid/operators/scatter_nd_add_op.cc index 144e7ceae20c16..2d23e81717abb8 100644 --- a/paddle/fluid/operators/scatter_nd_add_op.cc +++ b/paddle/fluid/operators/scatter_nd_add_op.cc @@ -50,10 +50,15 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_LE( index_dims[index_dims_size - 1], ref_dims_size, platform::errors::InvalidArgument( - "Input(Index).shape[-1] should be no greater than Input(X).rank")); + "The last dimension of Input(Index)'s shape should be no greater " + "than the rank of Input(X), but received the last dimension of " + "Input(Index)'s shape is %d, the rank of Input(X) is %d.", + index_dims[index_dims_size - 1], ref_dims_size)); 
PADDLE_ENFORCE_GE(index_dims_size, 2UL, platform::errors::InvalidArgument( - "The rank of Input(Index) should be greater than 1")); + "The rank of Input(Index) should be greater than 1, " + "but received the rank of Input(Index) is %d.", + index_dims_size)); // update.shape = index.shape[:-1] + output.shape[index.shape[-1]:] std::vector r_updates_dims; @@ -66,12 +71,21 @@ class ScatterNdAddOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ( r_updates_dims.size(), updates_dims_size, - platform::errors::InvalidArgument("Updates has wrong shape")); + platform::errors::InvalidArgument( + "Updates has wrong shape. The shape of Updates and Input(Updates) " + "should be same, but received the shape of Updates is %d, " + "the shape of Input(Updates) is %d.", + r_updates_dims.size(), updates_dims_size)); for (int64_t i = 0; i < updates_dims_size; ++i) { PADDLE_ENFORCE_EQ( r_updates_dims[i], updates_dims[i], - platform::errors::InvalidArgument("Updates has wrong shape")); + platform::errors::InvalidArgument( + "Updates has wrong shape. 
The dimensions of Updates and " + "Input(Updates) should match, but received Updates's" + "%d-th dimension is %d, Input(Updates)'s %d-th " + "dimension is %d.", + i, r_updates_dims[i], i, updates_dims[i])); } ctx->SetOutputDim("Out", ref_dims); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/scatter_op.cc b/paddle/fluid/operators/scatter_op.cc index 3fc40d41c30817..f0faa0c5798339 100644 --- a/paddle/fluid/operators/scatter_op.cc +++ b/paddle/fluid/operators/scatter_op.cc @@ -41,15 +41,24 @@ class ScatterOp : public framework::OperatorWithKernel { auto ref_dims = ctx->GetInputDim("X"); PADDLE_ENFORCE_EQ( ctx->GetInputDim("Ids").size(), 1, - platform::errors::InvalidArgument("Update Ids should be 1-D.")); + platform::errors::InvalidArgument( + "The size of Input(Ids)'s shape should be equal to 1, but " + "received the rank of Input(Ids) is %d.", + ctx->GetInputDim("Ids").size())); PADDLE_ENFORCE_EQ( ref_dims.size(), updates_dims.size(), platform::errors::InvalidArgument( - "Rerence and Updates should have the same shape size.")); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Updates")[0], - ctx->GetInputDim("Ids")[0], - platform::errors::InvalidArgument( - "Updates and Ids should have same batch-size.")); + "Input(X) and Input(Updates) should have the same shape size, " + "but received the size of Input(x)'s shape is %d, the size of " + "Input(Updates)'s shape is %d.", + ref_dims.size(), updates_dims.size())); + PADDLE_ENFORCE_EQ( + ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0], + platform::errors::InvalidArgument( + "Input(Updates) and Input(Ids) should have same batch-size, but" + " received Input(Updates)'s batch-size is %d, Input(Ids)'s " + "batch-size is %d.", + ctx->GetInputDim("Updates")[0], ctx->GetInputDim("Ids")[0])); ctx->SetOutputDim("Out", ref_dims); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 105d61015fcb9d..96a132ac6abc21 100644 --- 
a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -146,22 +146,75 @@ Assignment to a Tensor in static mode. )DOC"); } }; + +template +class SetValueGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + if (this->HasInput("ValueTensor")) { + op->SetType("slice"); + op->SetInput("Input", this->OutputGrad("Out")); + if (this->HasInput("StartsTensorList")) { + op->SetInput("StartsTensorList", this->Input("StartsTensorList")); + } + if (this->HasInput("EndsTensorList")) { + op->SetInput("EndsTensorList", this->Input("EndsTensorList")); + } + + // convert std::vector to std::vector + std::vector axes_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("axes"))); + std::vector starts_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("starts"))); + std::vector ends_int64 = static_cast>( + BOOST_GET_CONST(std::vector, this->GetAttr("ends"))); + std::vector decrease_axes_int64 = + static_cast>(BOOST_GET_CONST( + std::vector, this->GetAttr("decrease_axes"))); + + std::vector axes(axes_int64.begin(), axes_int64.end()); + std::vector starts(starts_int64.begin(), starts_int64.end()); + std::vector ends(ends_int64.begin(), ends_int64.end()); + std::vector decrease_axes(decrease_axes_int64.begin(), + decrease_axes_int64.end()); + + op->SetAttr("axes", axes); + op->SetAttr("starts", starts); + op->SetAttr("ends", ends); + op->SetAttr("decrease_axis", decrease_axes); + op->SetAttr("infer_flags", std::vector({})); + + op->SetOutput("Out", this->InputGrad("ValueTensor")); + } else { + op->SetType("assign"); + op->SetInput("X", this->OutputGrad("Out")); + op->SetOutput("Out", this->InputGrad("Input")); + } + } +}; + +DECLARE_INPLACE_OP_INFERER(SetValueOpInplaceInferer, {"Input", "Out"}); + } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = 
paddle::platform; -REGISTER_OPERATOR( - set_value, ops::SetValue, ops::SetValueMaker, - paddle::framework::EmptyGradOpMaker, - paddle::framework::EmptyGradOpMaker); +REGISTER_OPERATOR(set_value, ops::SetValue, ops::SetValueMaker, + ops::SetValueGradMaker, + ops::SetValueGradMaker, + ops::SetValueOpInplaceInferer); REGISTER_OP_CPU_KERNEL( set_value, ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel, - ops::SetValueKernel); + ops::SetValueKernel, + ops::SetValueKernel, + ops::SetValueKernel, + ops::SetValueKernel); REGISTER_OP_VERSION(set_value) .AddCheckpoint( diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index 9c30c4e07fa774..22f6fa9e3e6f20 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -259,7 +259,20 @@ class SliceKernel : public framework::OpKernel { auto out_t = framework::EigenTensor::From( *out, new_out_dims); - out_t.device(place) = in_t.slice(offsets, extents); + + if (in->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.slice: + // if element number less than INT_MAX, change the type of index to int + Eigen::DSizes offsets_32bit, extents_32bit; + for (size_t i = 0; i < D; i++) { + offsets_32bit[i] = offsets[i]; + extents_32bit[i] = extents[i]; + } + framework::To32BitIndex(out_t).device(place) = + framework::To32BitIndex(in_t).slice(offsets_32bit, extents_32bit); + } else { + out_t.device(place) = in_t.slice(offsets, extents); + } out->Resize(out_dims); } @@ -300,8 +313,6 @@ class SliceGradKernel : public framework::OpKernel { private: template void SliceCompute(const framework::ExecutionContext& context) const { - auto& place = - *context.template device_context().eigen_device(); auto axes = context.Attr>("axes"); auto starts_int = context.Attr>("starts"); @@ -435,13 +446,189 @@ class SliceGradKernel : public framework::OpKernel { paddings[i].first = offsets[i]; paddings[i].second = (in_dims[i] - out_dims[i]) - offsets[i]; } + 
EigenPaddingCompute(context, d_input, in_dims, d_out, out_dims, paddings); + } + + template + void EigenPaddingCompute( + const framework::ExecutionContext& context, framework::Tensor* d_input, + const framework::DDim& in_dims, const framework::Tensor* d_out, + const framework::DDim& out_dims, + const Eigen::array, D>& paddings) const { + if (D <= 3) { + // if dimension less than 3, cannot reduce dimension + LaunchEigenPadding(context, d_input, in_dims, d_out, out_dims, paddings); + } else { // else we can reduce dimension + // count not-zero padding number, and record the dimension + int need_pad_num = 0, pad_dim = -1; + for (size_t i = 0; i < D; i++) { + if (paddings[i].first != 0 || paddings[i].second != 0) { + need_pad_num++; + pad_dim = i; + } + } + + if (need_pad_num == 0) { + // do not need padding, pass if data address same, else copy + if (d_input->mutable_data(context.GetPlace()) == d_out->data()) { + // inplace, do not any operator, pass + } else { + framework::TensorCopy( + *d_out, context.GetPlace(), + context.template device_context(), + d_input); + } + } else if (need_pad_num == 1) { + // only need padding one dimension, we can reduce dimension. + // only the padding dimension is available for us. 
+ // How to reduce dimension(5 to 3 for example): + // before(D=5): + // in_dims: [x1, x2, x3, x4, x5] + // padding.first: [0, 0, a, 0, 0] + // padding.second: [0, 0, b, 0, 0] + // | | + // V V + // after(D=3): + // reshaped_in_dims: [x1*x2, x3, x4*x5] + // reshaped_padding.first: [0, a, 0] + // reshaped_padding.second: [0, b, 0] + + if (pad_dim == D - 1) { + // only last dimension need padding, + // reshape the dimension of tensor in 2: [preceding, padding] + std::vector in_tore_shape(2, 1), out_tore_shape(2, 1); + Eigen::array, 2> reshaped_padding; + + // first dimension is the accumulate of preceding dimension + for (int i = 0; i < pad_dim; i++) { + in_tore_shape[0] *= in_dims[i]; + out_tore_shape[0] *= out_dims[i]; + } + // second dimension is the padding dimension + in_tore_shape[1] = in_dims[pad_dim]; + out_tore_shape[1] = out_dims[pad_dim]; + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: the first dimension do not need padding, + // set padding[0] zero + reshaped_padding[0].first = reshaped_padding[0].second = 0; + // the second dimension is the previous padding dimension + reshaped_padding[1].first = paddings[pad_dim].first; + reshaped_padding[1].second = paddings[pad_dim].second; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } else if (pad_dim == 0) { + // only first dimension need padding, + // reshape the dimension of tensor in 2: [padding, succeeding] + // similar to (D - 1) + std::vector in_tore_shape(2, 1), out_tore_shape(2, 1); + Eigen::array, 2> reshaped_padding; + + // first dimension is the padding dimension + in_tore_shape[0] = in_dims[pad_dim]; + out_tore_shape[0] = out_dims[pad_dim]; + // sencond dimension is the accumulate of succeeding dimension + for (size_t i = pad_dim + 1; i < D; i++) { + in_tore_shape[1] *= 
 in_dims[i]; + out_tore_shape[1] *= out_dims[i]; + } + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: + // the first dimension is the previous padding dimension + reshaped_padding[0].first = paddings[pad_dim].first; + reshaped_padding[0].second = paddings[pad_dim].second; + // the second dimension do not need padding, set padding[1] zero + reshaped_padding[1].first = reshaped_padding[1].second = 0; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } else { + // other dimension need padding + // reshape the dimension of tensor in 3: + // [preceding, padding, succeeding] + std::vector in_tore_shape(3, 1), out_tore_shape(3, 1); + Eigen::array, 3> reshaped_padding; + + // first dimension is the accumulate of preceding dimension + for (int i = 0; i < pad_dim; i++) { + in_tore_shape[0] *= in_dims[i]; + out_tore_shape[0] *= out_dims[i]; + } + // second dimension is the padding dimension + in_tore_shape[1] = in_dims[pad_dim]; + out_tore_shape[1] = out_dims[pad_dim]; + // third dimension is the accumulate of succeeding dimension + for (size_t i = pad_dim + 1; i < D; i++) { + in_tore_shape[2] *= in_dims[i]; + out_tore_shape[2] *= out_dims[i]; + } + + // convert array from std::vector to DDim + framework::DDim reshaped_in_dims = + framework::make_ddim(in_tore_shape); + framework::DDim reshaped_out_dims = + framework::make_ddim(out_tore_shape); + + // after reshape: + // the first dimension do not need padding, set padding[0] zero + reshaped_padding[0].first = reshaped_padding[0].second = 0; + // the second dimension is the previous padding dimension + reshaped_padding[1].first = paddings[pad_dim].first; + reshaped_padding[1].second = paddings[pad_dim].second; + // the third dimension do not need padding, set padding[2] zero + 
reshaped_padding[2].first = reshaped_padding[2].second = 0; + + LaunchEigenPadding(context, d_input, reshaped_in_dims, d_out, + reshaped_out_dims, reshaped_padding); + } + } else { + // need padding at many dimension, cannot reduce dimension + LaunchEigenPadding(context, d_input, in_dims, d_out, out_dims, + paddings); + } + } + } + + template + void LaunchEigenPadding( + const framework::ExecutionContext& context, framework::Tensor* d_input, + const framework::DDim& in_dims, const framework::Tensor* d_out, + const framework::DDim& out_dims, + const Eigen::array, D>& paddings) const { + auto& place = + *context.template device_context().eigen_device(); auto d_in_t = framework::EigenTensor::From( - *d_input); + *d_input, in_dims); auto d_out_t = framework::EigenTensor::From( *d_out, out_dims); - d_in_t.device(place) = d_out_t.pad(paddings, T(0)); + + if (d_input->numel() <= Eigen::NumTraits::highest()) { + // similar to tf.pad: + // if element number less than INT_MAX, change the type of index to int + Eigen::array, D> paddings_32bit; + for (size_t i = 0; i < D; i++) { + paddings_32bit[i] = + std::make_pair(paddings[i].first, paddings[i].second); + } + framework::To32BitIndex(d_in_t).device(place) = + framework::To32BitIndex(d_out_t).pad(paddings_32bit, T(0)); + } else { + d_in_t.device(place) = d_out_t.pad(paddings, T(0)); + } } }; } // namespace operators diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index e5e0dafdae0b15..9974536da9acb4 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -124,11 +124,13 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL( slice, ops::SliceNPUKernel, + ops::SliceNPUKernel, ops::SliceNPUKernel); REGISTER_OP_NPU_KERNEL( slice_grad, ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel, ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 
f06f59f3b4e005..d20b3ac04bf95c 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -159,12 +159,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(softmax, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(softmax_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index e58b39252ce5f4..fbaf76d4e7cd89 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -55,7 +55,7 @@ class SoftmaxWithCrossEntropyOpMaker "the given labels as soft labels.") .SetDefault(false); AddAttr( - "softmax_switch", + "use_softmax", "(bool, default: true), A flag to indicate whether to do softmax ") .SetDefault(true); AddAttr( @@ -320,7 +320,6 @@ REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy_grad, REGISTER_OP_VERSION(softmax_with_cross_entropy) .AddCheckpoint( R"ROC( - Add a new attribute [softmax_switch] )ROC", + Add a new attribute [use_softmax] )ROC", paddle::framework::compatible::OpVersionDesc().NewAttr( - "softmax_switch", "A flag to indicate whether to do softmax", - true)); + "use_softmax", "A flag to indicate whether to do softmax", true)); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index 140059256c3cc9..4aec4c17422792 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -772,10 +772,10 @@ class SoftmaxWithCrossEntropyCUDAKernel : public 
framework::OpKernel { platform::is_gpu_place(context.GetPlace()), true, platform::errors::Unavailable("softmax_with_cross_entropy operator's " "CUDA kernel only runs on GPU device.")); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { const Tensor* softmax = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax_out = context.Output("Softmax"); @@ -925,10 +925,10 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel { int block = 512; auto stream = context.cuda_device_context().stream(); auto ignore_index = context.Attr("ignore_index"); - auto softmax_switch = context.Attr("softmax_switch"); + auto use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { if (context.Attr("soft_label")) { int grid = (n * d + block - 1) / block; const T* label_data = labels->data(); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index 55b811cbe31e40..74316841a13b17 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -31,10 +31,10 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel { PADDLE_ENFORCE_EQ( platform::is_cpu_place(context.GetPlace()), true, platform::errors::Unimplemented("This kernel only runs on CPU.")); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); // do not with softmax op, and input is softmax - if (!softmax_switch) { + if (!use_softmax) { const Tensor* softmax = context.Input("Logits"); const Tensor* labels = context.Input("Label"); Tensor* softmax_out = context.Output("Softmax"); @@ -113,9 +113,9 @@ class 
SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { context.Output(framework::GradVarName("Logits")); const Tensor* softmax = context.Input("Softmax"); - const bool softmax_switch = context.Attr("softmax_switch"); + const bool use_softmax = context.Attr("use_softmax"); - if (logit_grad != softmax || !softmax_switch) { + if (logit_grad != softmax || !use_softmax) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); } @@ -138,8 +138,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { auto logit_grad_mat = framework::EigenMatrix::From(logit_grad_2d); auto& place = *context.template device_context() .eigen_device(); - if (!softmax_switch) { - // softmax_switch step1 + if (!use_softmax) { + // use_softmax step1 if (soft_label) { auto lbl_mat = framework::EigenMatrix::From(labels_2d); logit_grad_mat.device(place) = @@ -148,7 +148,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * logit_grad_mat; } - // softmax_switch step2 + // use_softmax step2 else { const int64_t* label_data = labels->data(); T* logit_grad_data = logit_grad->data(); @@ -181,7 +181,7 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { return; } - // for softmax_switch=False, continue + // for use_softmax=False, continue if (soft_label) { // when soft_label = True, ignore_index is not supported diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc index c777a02f96bd9a..a34946315f5a81 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc @@ -67,12 +67,10 @@ class SoftmaxWithCrossEntropyNPUKernel : public framework::OpKernel { // on and off Tensor on_tensor(framework::proto::VarType::INT32); on_tensor.mutable_data({1}, ctx.GetPlace()); - 
TensorFromVector(std::vector{static_cast(1)}, - ctx.device_context(), &on_tensor); + FillNpuTensorWithConstant(&on_tensor, static_cast(1)); Tensor off_tensor(framework::proto::VarType::INT32); off_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(0)}, - ctx.device_context(), &off_tensor); + FillNpuTensorWithConstant(&off_tensor, static_cast(0)); // one_hot Tensor tmp_onehot(on_tensor.type()); @@ -142,12 +140,10 @@ class SoftmaxWithCrossEntropyGradNPUKernel : public framework::OpKernel { // on and off Tensor on_tensor(framework::proto::VarType::INT32); on_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(1)}, - ctx.device_context(), &on_tensor); + FillNpuTensorWithConstant(&on_tensor, static_cast(1)); Tensor off_tensor(framework::proto::VarType::INT32); off_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{static_cast(0)}, - ctx.device_context(), &off_tensor); + FillNpuTensorWithConstant(&off_tensor, static_cast(0)); // one_hot Tensor tmp_onehot(on_tensor.type()); diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 22dc81cbd79e0e..1de7ca8c7bdbf4 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -85,6 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(squeeze, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 741f86f35848b2..0f520adba57a20 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -326,4 +326,6 @@ REGISTER_OP_CPU_KERNEL( sum, ops::SumKernel, ops::SumKernel, ops::SumKernel, + ops::SumKernel, ops::SumKernel); diff --git 
a/paddle/fluid/operators/tensor_array_to_tensor_op.cc b/paddle/fluid/operators/tensor_array_to_tensor_op.cc index 620231eb2e2984..eb20e1c2cd2748 100644 --- a/paddle/fluid/operators/tensor_array_to_tensor_op.cc +++ b/paddle/fluid/operators/tensor_array_to_tensor_op.cc @@ -250,8 +250,12 @@ class LoDTensorArray2TensorGradOp : public framework::OperatorBase { auto dout_name = Input(framework::GradVarName("Out")); std::vector grad_names; + // NOTE(Aurelius84): Generating grad base name by Input("X") instead of + // fixed string to avoid incorrectly sharing same var's allocation in + // multi-thread that will cause wrong calculation result. + std::string grad_base_name = base_name + "_temp_grad_"; - LodTensorVectorResizeFromLodTensorArray(scope, "grad_name", Input("X"), + LodTensorVectorResizeFromLodTensorArray(scope, grad_base_name, Input("X"), &grad_names); auto use_stack = Attr("use_stack"); diff --git a/paddle/fluid/operators/tensor_formatter.cc b/paddle/fluid/operators/tensor_formatter.cc index 046ae90ec7c6e0..f1b64f042c3c09 100644 --- a/paddle/fluid/operators/tensor_formatter.cc +++ b/paddle/fluid/operators/tensor_formatter.cc @@ -125,6 +125,11 @@ void TensorFormatter::FormatData(const framework::LoDTensor& print_tensor, framework::LoDTensor cpu_tensor; platform::CPUPlace cpu_place; TensorCopy(print_tensor, cpu_place, &cpu_tensor); +#ifdef PADDLE_WITH_ASCEND_CL + if (platform::is_npu_place(print_tensor.place())) { + platform::DeviceContextPool::Instance().Get(print_tensor.place())->Wait(); + } +#endif data = cpu_tensor.data(); } diff --git a/paddle/fluid/operators/tile_op.cc b/paddle/fluid/operators/tile_op.cc index 6527362bb96907..b98e620cc2d342 100644 --- a/paddle/fluid/operators/tile_op.cc +++ b/paddle/fluid/operators/tile_op.cc @@ -286,3 +286,20 @@ REGISTER_OP_CPU_KERNEL( ops::TileGradKernel, ops::TileGradKernel, ops::TileGradKernel); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +REGISTER_OP_CUDA_KERNEL( + tile, ops::TileKernel, + ops::TileKernel, 
+ ops::TileKernel, + ops::TileKernel, + ops::TileKernel, + ops::TileKernel); +REGISTER_OP_CUDA_KERNEL( + tile_grad, ops::TileGradKernel, + ops::TileGradKernel, + ops::TileGradKernel, + ops::TileGradKernel, + ops::TileGradKernel); +#endif diff --git a/paddle/fluid/operators/tile_op.cu b/paddle/fluid/operators/tile_op.cu deleted file mode 100644 index 5ca82cd6a1f435..00000000000000 --- a/paddle/fluid/operators/tile_op.cu +++ /dev/null @@ -1,31 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include "paddle/fluid/operators/tile_op.h" - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -REGISTER_OP_CUDA_KERNEL( - tile, ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel, - ops::TileKernel); -REGISTER_OP_CUDA_KERNEL( - tile_grad, ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel, - ops::TileGradKernel); diff --git a/paddle/fluid/operators/trace_op.cu b/paddle/fluid/operators/trace_op.cu index ea328361ded75a..2c2745018be402 100644 --- a/paddle/fluid/operators/trace_op.cu +++ b/paddle/fluid/operators/trace_op.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include +#include #include "paddle/fluid/operators/reduce_ops/cub_reduce.h" #include "paddle/fluid/operators/trace_op.h" diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 36f7a695358511..f6712814e1e3b8 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -126,12 +126,12 @@ void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx) { TEST(transpose2, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } TEST(transpose2_grad, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - CompareGrad(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + CompareGrad(&scope, *ctx); } diff --git a/paddle/fluid/operators/truncated_gaussian_random_op.cu b/paddle/fluid/operators/truncated_gaussian_random_op.cu index 798709b1088d3f..1f25a880758923 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op.cu +++ b/paddle/fluid/operators/truncated_gaussian_random_op.cu @@ -12,25 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include #include #include #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/truncated_gaussian_random_op.h" namespace paddle { namespace operators { template -struct TruncatedNormal { +struct GPUTruncatedNormal { T mean, std; T a_normal_cdf; T b_normal_cdf; unsigned int seed; T numeric_min; - __host__ __device__ TruncatedNormal(T mean, T std, T numeric_min, int seed) + __host__ __device__ GPUTruncatedNormal(T mean, T std, T numeric_min, int seed) : mean(mean), std(std), seed(seed), numeric_min(numeric_min) { a_normal_cdf = (1.0 + erff(-2.0 / sqrtf(2.0))) / 2.0; b_normal_cdf = (1.0 + erff(2.0 / sqrtf(2.0))) / 2.0; @@ -110,10 +113,10 @@ class GPUTruncatedGaussianRandomKernel : public framework::OpKernel { TruncatedNormalOffset(mean, std, std::numeric_limits::min(), seed_offset.first, gen_offset)); } else { - thrust::transform( - index_sequence_begin, index_sequence_begin + size, - thrust::device_ptr(data), - TruncatedNormal(mean, std, std::numeric_limits::min(), seed)); + thrust::transform(index_sequence_begin, index_sequence_begin + size, + thrust::device_ptr(data), + GPUTruncatedNormal( + mean, std, std::numeric_limits::min(), seed)); } } }; diff --git a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc index 4253187fdde74d..7f3190d9112c66 100644 --- a/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc +++ b/paddle/fluid/operators/truncated_gaussian_random_op_npu.cc @@ -35,28 +35,24 @@ class TruncatedGaussianRandomNPUKernel : public framework::OpKernel { float mean = ctx.Attr("mean"); Tensor mean_tensor(framework::proto::VarType::FP32); mean_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{mean}, ctx.device_context(), - &mean_tensor); + FillNpuTensorWithConstant(&mean_tensor, mean); float std = ctx.Attr("std"); Tensor 
std_tensor(framework::proto::VarType::FP32); std_tensor.mutable_data({1}, ctx.GetPlace()); - TensorFromVector(std::vector{std}, ctx.device_context(), - &std_tensor); + FillNpuTensorWithConstant(&std_tensor, std); int32_t seed_var = ctx.Attr("seed"); Tensor min_tensor(framework::proto::VarType::FP32); min_tensor.mutable_data({1}, ctx.GetPlace()); float min_value = mean - std * 2.0; - TensorFromVector(std::vector{min_value}, ctx.device_context(), - &min_tensor); + FillNpuTensorWithConstant(&min_tensor, min_value); Tensor max_tensor(framework::proto::VarType::FP32); max_tensor.mutable_data({1}, ctx.GetPlace()); float max_value = mean + std * 2.0; - TensorFromVector(std::vector{max_value}, ctx.device_context(), - &max_tensor); + FillNpuTensorWithConstant(&max_tensor, max_value); auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 6efada4343ca54..007276b16d7f2e 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -18,10 +18,41 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/bfloat16.h" namespace paddle { namespace operators { +namespace { +template +inline void UniformRealDistribution(T *data, const int64_t &size, + const float &min, const float &max, + const unsigned int &seed) { + VLOG(4) << "[CPU] UniformRandomKernel"; + std::uniform_real_distribution dist(static_cast(min), + static_cast(max)); + auto engine = paddle::framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +template <> +inline void UniformRealDistribution(paddle::platform::bfloat16 *data, + const int64_t &size, const float &min, + const float &max, + const unsigned int &seed) { + VLOG(4) << "[CPU] UniformRandomKernel"; + std::uniform_real_distribution dist(min, max); + auto engine = paddle::framework::GetCPURandomEngine(seed); + + for (int64_t i = 0; i < size; ++i) { + data[i] = static_cast(dist(*engine)); + } +} +} // namespace + // It seems that Eigen::Tensor::random in GPU will SEGFAULT. // Use std::random and thrust::random(thrust is a std library in CUDA) to // implement uniform random. 
@@ -61,17 +92,11 @@ class CPUUniformRandomKernel : public framework::OpKernel { framework::ToTypeName(out_var->Type()))); } T *data = tensor->mutable_data(ctx.GetPlace()); - int64_t size = tensor->numel(); - std::uniform_real_distribution dist( - static_cast(ctx.Attr("min")), - static_cast(ctx.Attr("max"))); - unsigned int seed = static_cast(ctx.Attr("seed")); - auto engine = framework::GetCPURandomEngine(seed); - for (int64_t i = 0; i < size; ++i) { - data[i] = dist(*engine); - } + UniformRealDistribution( + data, size, ctx.Attr("min"), ctx.Attr("max"), + static_cast(ctx.Attr("seed"))); unsigned int diag_num = static_cast(ctx.Attr("diag_num")); @@ -257,9 +282,12 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::UniformRandomOpVarTypeInference); -REGISTER_OP_CPU_KERNEL(uniform_random, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); -REGISTER_OP_CPU_KERNEL(uniform_random_batch_size_like, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); +REGISTER_OP_CPU_KERNEL( + uniform_random, paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); +REGISTER_OP_CPU_KERNEL( + uniform_random_batch_size_like, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel, + paddle::operators::CPUUniformRandomKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index 563a6c165b7485..ceb13a3dda41df 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -11,9 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ +#include +#include #include #include - #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/uniform_random_op.h b/paddle/fluid/operators/uniform_random_op.h index 6052e533643f3c..18a4154be30ac7 100644 --- a/paddle/fluid/operators/uniform_random_op.h +++ b/paddle/fluid/operators/uniform_random_op.h @@ -24,9 +24,9 @@ namespace operators { using Tensor = framework::Tensor; inline std::vector GetNewDataFromShapeTensor( - const Tensor *new_data_tensor) { + const Tensor* new_data_tensor) { if (new_data_tensor->type() == framework::proto::VarType::INT64) { - auto *new_data = new_data_tensor->data(); + auto* new_data = new_data_tensor->data(); framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { TensorCopySync(*new_data_tensor, platform::CPUPlace(), @@ -37,7 +37,7 @@ inline std::vector GetNewDataFromShapeTensor( new_data + new_data_tensor->numel()); return vec_new_data; } else if (new_data_tensor->type() == framework::proto::VarType::INT32) { - auto *new_data = new_data_tensor->data(); + auto* new_data = new_data_tensor->data(); std::vector vec_new_data; framework::Tensor cpu_starts_tensor; if (platform::is_gpu_place(new_data_tensor->place())) { @@ -58,7 +58,7 @@ inline std::vector GetNewDataFromShapeTensor( } inline std::vector GetNewDataFromShapeTensorList( - const std::vector &list_new_shape_tensor) { + const std::vector& list_new_shape_tensor) { std::vector vec_new_shape; vec_new_shape.reserve(list_new_shape_tensor.size()); for (size_t i = 0; i < list_new_shape_tensor.size(); ++i) { @@ -97,6 +97,5 @@ inline std::vector GetNewDataFromShapeTensorList( return vec_new_shape; } - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index 9b4485047f05c1..a145c914a8621b 100644 --- 
a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -85,6 +85,6 @@ void Compare(f::Scope* scope, const p::DeviceContext& ctx) { TEST(unsqueeze, NPU_fp32) { f::Scope scope; - p::NPUDeviceContext ctx(p::NPUPlace(0)); - Compare(&scope, ctx); + auto* ctx = p::DeviceContextPool::Instance().Get(p::NPUPlace(0)); + Compare(&scope, *ctx); } diff --git a/paddle/fluid/operators/where_index_op.cu b/paddle/fluid/operators/where_index_op.cu index bb968743585f7d..b1cd172923ee6d 100644 --- a/paddle/fluid/operators/where_index_op.cu +++ b/paddle/fluid/operators/where_index_op.cu @@ -12,7 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +#include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/where_index_op.h" @@ -25,52 +33,124 @@ namespace operators { using CUDADeviceContext = paddle::platform::CUDADeviceContext; template -class CUDAWhereIndexKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* out = context.Output("Out"); - - // TODO(zhoukunsheng): Should optimize to ensure GPU is faster than CPU. 
- framework::Tensor cond_cpu; - framework::TensorCopy(*condition, platform::CPUPlace(), &cond_cpu); - - const T* cond_data = cond_cpu.data(); - int64_t numel = cond_cpu.numel(); - auto dims = cond_cpu.dims(); - int rank = dims.size(); - - thrust::host_vector h_true_index; - for (int64_t i = 0; i < numel; i++) { - if (static_cast(cond_data[i])) { - h_true_index.push_back(i); +__global__ void GetTrueNum(const T *cond_data, const int64_t numel, + int64_t *true_num_array) { + const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { + true_num_array[idx] = + static_cast(static_cast(cond_data[idx])); + } +} + +template +__global__ void SetTrueIndex(int64_t *out_ptr, const T *cond_data, + const int64_t numel, const int64_t *stride_array, + const int64_t rank, + const int64_t *true_num_array) { + const int64_t tid = blockIdx.x * blockDim.x + threadIdx.x; + + for (int64_t idx = tid; idx < numel; idx += gridDim.x * blockDim.x) { + // true_num_array is calculated by cub::InclusiveSum, + // because the first element of true_num_array is 1, + // so we need to subtract 1 to get the true index.
+ const int64_t true_index = true_num_array[idx] - 1; + if (static_cast(cond_data[idx])) { + int64_t rank_index = idx; + for (int j = 0; j < rank; j++) { + const int64_t out_index = rank_index / stride_array[j]; + out_ptr[true_index * rank + j] = out_index; + rank_index -= out_index * stride_array[j]; } } - thrust::device_vector d_true_index = h_true_index; - int64_t* ptr_true_index = thrust::raw_pointer_cast(d_true_index.data()); - - size_t true_num = h_true_index.size(); + } +} +template +class CUDAWhereIndexKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *condition = context.Input("Condition"); + auto *out = context.Output("Out"); + auto &dev_ctx = context.template device_context(); + + const T *cond_data = condition->data(); + const int64_t numel = condition->numel(); + auto dims = condition->dims(); + const int rank = dims.size(); + + auto d_array_mem = memory::Alloc(dev_ctx, (numel + rank) * sizeof(int64_t)); + auto h_array_mem = + memory::Alloc(platform::CPUPlace(), (rank + 1) * sizeof(int64_t)); + + // "stride_array" is an array and len(stride_array)==rank, + // each element is the stride of each dimension -- the length from i to i+1. 
+ int64_t *h_stride_array = reinterpret_cast(h_array_mem->ptr()); + int64_t *d_stride_array = reinterpret_cast(d_array_mem->ptr()); + + // "true_num_array" is an array and len(true_num_array)==numel, + // at the beginning, + // "true_num_array" will set 1 if condition[i] == true else 0, + // then it will be calculated by cub::InclusiveSum, + // so that we can get the true number before i as the out index + int64_t *d_true_num_array = d_stride_array + rank; + + // the total_true_num is the total number of condition[i] == true + int64_t *h_total_true_num = h_stride_array + rank; + + // allocate cub memory + size_t cub_size = 0; + cub::DeviceScan::InclusiveSum(nullptr, cub_size, d_true_num_array, + d_true_num_array, numel, dev_ctx.stream()); + auto cub_mem = memory::Alloc(dev_ctx, cub_size * sizeof(int64_t)); + void *cub_data = cub_mem->ptr(); + + // set d_true_num_array[i]=1 if cond_data[i]==true else 0 + const int threads = std::min(numel, static_cast(128)); + const int64_t need_grids = (numel + threads - 1) / threads; + const int grids = std::min(need_grids, static_cast(256)); + GetTrueNum<<>>(cond_data, numel, + d_true_num_array); + + // calculate the inclusive prefix sum of "true_num_array" + // to get the index of "out" tensor, + // and the total number of cond_data[i]==true.
+ // Example: + // condition: F T T F F F T T + // before: 0 1 1 0 0 0 1 1 + // after: 0 1 2 2 2 2 3 4 + // out: 1 2 6 7 + cub::DeviceScan::InclusiveSum(cub_data, cub_size, d_true_num_array, + d_true_num_array, numel, dev_ctx.stream()); + + // calculate each dimension's stride + h_stride_array[rank - 1] = 1; + for (int i = rank - 2; i >= 0; i--) { + h_stride_array[i] = h_stride_array[i + 1] * dims[i + 1]; + } + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_stride_array, platform::CPUPlace(), h_stride_array, + rank * sizeof(int64_t), dev_ctx.stream()); + + // get total true number and set output size + // the last element of cub::InclusiveSum is the total number + memory::Copy(platform::CPUPlace(), h_total_true_num, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + d_true_num_array + numel - 1, sizeof(int64_t), + dev_ctx.stream()); + dev_ctx.Wait(); + + int64_t true_num = *h_total_true_num; out->Resize(framework::make_ddim({static_cast(true_num), rank})); - auto out_ptr = out->mutable_data(context.GetPlace()); + auto out_data = out->mutable_data(context.GetPlace()); if (true_num == 0) { return; } - thrust::host_vector h_stride(rank, 0); - h_stride[rank - 1] = 1; - for (int i = rank - 2; i >= 0; i--) { - h_stride[i] = h_stride[i + 1] * dims[i + 1]; - } - thrust::device_vector d_stride = h_stride; - int64_t* ptr_stride = thrust::raw_pointer_cast(d_stride.data()); - - auto& dev_ctx = context.template device_context(); - WhereIndexFunctor functor(ptr_true_index, true_num, ptr_stride, - rank, out_ptr); - platform::ForRange for_range(dev_ctx, true_num); - for_range(functor); + // using true_num_array and stride_array to calculate the output index + SetTrueIndex<<>>( + out_data, cond_data, numel, d_stride_array, rank, d_true_num_array); } }; diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 584dbd4756aa09..0827d6a5ae7644 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ 
b/paddle/fluid/platform/CMakeLists.txt @@ -106,11 +106,11 @@ ELSE() ENDIF() IF(WITH_ASCEND_CL) -cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +cc_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) ENDIF() IF(WITH_GPU) - nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) + nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) ENDIF() IF(WITH_ROCM) hip_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) @@ -136,13 +136,18 @@ cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS} ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS}) -cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) +cc_library(collective_helper SRCS collective_helper.cc collective_helper_npu.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce) if(WITH_GPU OR WITH_ROCM) cc_library(cuda_resource_pool SRCS cuda_resource_pool.cc DEPS gpu_info) target_link_libraries(device_context cuda_resource_pool) endif() +if(WITH_ASCEND_CL) + cc_library(npu_resource_pool SRCS npu_resource_pool.cc DEPS npu_info) + target_link_libraries(device_context npu_resource_pool) +endif() + cc_test(init_test SRCS init_test.cc DEPS device_context) if(WITH_GPU) @@ -185,6 +190,7 @@ cc_test(bfloat16_test SRCS bfloat16_test.cc DEPS lod_tensor) IF(WITH_GPU) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) + nv_test(bfloat16_gpu_test SRCS bfloat16_test.cu DEPS lod_tensor) nv_test(test_limit_gpu_memory SRCS test_limit_gpu_memory.cu DEPS gpu_info flags) nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info) ENDIF() diff --git 
a/paddle/fluid/platform/ascend_npu_info.h b/paddle/fluid/platform/ascend_npu_info.h index 7afed121a5acb6..213013f5b12777 100644 --- a/paddle/fluid/platform/ascend_npu_info.h +++ b/paddle/fluid/platform/ascend_npu_info.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL namespace paddle { namespace platform { diff --git a/paddle/fluid/platform/bfloat16.h b/paddle/fluid/platform/bfloat16.h index 6cb4901f1dde32..a362e2903f2456 100644 --- a/paddle/fluid/platform/bfloat16.h +++ b/paddle/fluid/platform/bfloat16.h @@ -21,6 +21,15 @@ #include #include +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#if defined(__CUDACC__) && CUDA_VERSION >= 11000 +#define PADDLE_CUDA_BF16 +#include +#endif + #if !defined(_WIN32) #define PADDLE_ALIGN(x) __attribute__((aligned(x))) #else @@ -44,6 +53,7 @@ struct PADDLE_ALIGN(2) bfloat16 { public: uint16_t x; + // Constructors bfloat16() = default; bfloat16(const bfloat16& o) = default; bfloat16& operator=(const bfloat16& o) = default; @@ -60,15 +70,34 @@ struct PADDLE_ALIGN(2) bfloat16 { tempRes = reinterpret_cast(&val); res = *tempRes; x = res >> 16; +#else +#if defined(PADDLE_CUDA_BF16) + __nv_bfloat16 tmp = __float2bfloat16(val); + x = *reinterpret_cast(&tmp); #else std::memcpy(&x, reinterpret_cast(&val) + 2, 2); #endif +#endif + } + +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline explicit bfloat16(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); } +#endif template HOSTDEVICE inline explicit bfloat16(const T& val) : x(bfloat16(static_cast(val)).x) {} +// Assignment operators +#if defined(PADDLE_CUDA_BF16) + HOSTDEVICE inline bfloat16& operator=(const __nv_bfloat16& val) { + x = *reinterpret_cast(&val); + return *this; + } +#endif + HOSTDEVICE inline bfloat16& operator=(bool b) { x = b ? 
0x3f80 : 0; return *this; @@ -124,13 +153,24 @@ struct PADDLE_ALIGN(2) bfloat16 { return *this; } + // Conversion opertors HOSTDEVICE inline explicit operator float() const { +#ifdef PADDLE_CUDA_BF16 + return __bfloat162float(*reinterpret_cast(&x)); +#else float val = 0.f; uint16_t temp = x; memcpy(reinterpret_cast(&val) + 2, reinterpret_cast(&temp), 2); return val; +#endif + } + +#ifdef PADDLE_CUDA_BF16 + HOSTDEVICE inline explicit operator __nv_bfloat16() const { + return *reinterpret_cast(&x); } +#endif HOSTDEVICE inline explicit operator bool() const { return (x & 0x7fff) != 0; } @@ -223,6 +263,7 @@ HOSTDEVICE inline bfloat16 raw_uint16_to_bfloat16(uint16_t a) { return res; } +// Comparison operators HOSTDEVICE inline bool operator==(const bfloat16& a, const bfloat16& b) { return static_cast(a) == static_cast(b); } diff --git a/paddle/fluid/platform/bfloat16_test.cu b/paddle/fluid/platform/bfloat16_test.cu new file mode 100644 index 00000000000000..dbbb72920a53b0 --- /dev/null +++ b/paddle/fluid/platform/bfloat16_test.cu @@ -0,0 +1,124 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/platform/bfloat16.h" + +#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h +#include +#include +#include +#include "paddle/fluid/framework/lod_tensor.h" + +#if defined(PADDLE_CUDA_BF16) +namespace paddle { +namespace platform { + +TEST(bfloat16, convert_float32_to_bfloat16_on_gpu) { + // Convert float32 to bfloat16 + EXPECT_EQ((bfloat16(1.0f)).x, 0x3f80); + EXPECT_EQ((bfloat16(0.5f)).x, 0x3f00); + EXPECT_EQ((bfloat16(0.33333f)).x, 0x3eab); + EXPECT_EQ((bfloat16(0.0f)).x, 0x0000); + EXPECT_EQ((bfloat16(-0.0f)).x, 0x8000); + EXPECT_EQ((bfloat16(65536.0f)).x, 0x4780); +} + +TEST(bfloat16, assignment_operator_on_gpu) { + // Assignment operator + bfloat16 v_assign; + v_assign = nv_bfloat16(bfloat16(1.0f)); + EXPECT_EQ(v_assign.x, 0x3f80); + v_assign = 0.33333; + EXPECT_EQ(v_assign.x, 0x3eab); +} + +TEST(bfloat16, convert_bfloat16_to_float32_on_gpu) { + // Conversion operator + EXPECT_EQ(static_cast(bfloat16(0.5f)), 0.5f); + EXPECT_NEAR(static_cast(bfloat16(0.33333)), 0.33333, 0.01); + EXPECT_EQ(static_cast(bfloat16(-1)), -1); + EXPECT_EQ(static_cast(bfloat16(true)), true); +} + +TEST(bfloat16, lod_tensor_on_gpu) { + framework::LoDTensor src_tensor; + framework::LoDTensor gpu_tensor; + framework::LoDTensor dst_tensor; + + bfloat16 *src_ptr = src_tensor.mutable_data( + framework::make_ddim({2, 2}), CPUPlace()); + + bfloat16 arr[4] = {bfloat16(1.0f), bfloat16(0.5f), bfloat16(0.33333f), + bfloat16(0.0f)}; + memcpy(src_ptr, arr, 4 * sizeof(bfloat16)); + + // CPU LoDTensor to GPU LoDTensor + CUDAPlace gpu_place(0); + CUDADeviceContext gpu_ctx(gpu_place); + framework::TensorCopy(src_tensor, gpu_place, gpu_ctx, &gpu_tensor); + + // GPU LoDTensor to CPU LoDTensor + framework::TensorCopy(gpu_tensor, CPUPlace(), gpu_ctx, &dst_tensor); + + // Sync before comparing LoDTensors + gpu_ctx.Wait(); + const bfloat16 *dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 4; ++i) { + 
EXPECT_EQ(src_ptr[i].x, dst_ptr[i].x); + } +} + +TEST(bfloat16, isinf) { + bfloat16 a; + a.x = 0x7f80; + bfloat16 b = bfloat16(INFINITY); + bfloat16 c = static_cast(INFINITY); + EXPECT_EQ(std::isinf(a), true); + EXPECT_EQ(std::isinf(b), true); + EXPECT_EQ(std::isinf(c), true); +} + +TEST(bfloat16, isnan) { + bfloat16 a; + a.x = 0x7fff; + bfloat16 b = bfloat16(NAN); + bfloat16 c = static_cast(NAN); + EXPECT_EQ(std::isnan(a), true); + EXPECT_EQ(std::isnan(b), true); + EXPECT_EQ(std::isnan(c), true); +} + +TEST(bfloat16, cast) { + bfloat16 a; + a.x = 0x0070; + auto b = a; + { + // change semantic, keep the same value + bfloat16 c = reinterpret_cast(reinterpret_cast(b)); + EXPECT_EQ(b, c); + } + + { + // use uint32 low 16 bit store float16 + uint32_t c = reinterpret_cast(b); + bfloat16 d; + d.x = c; + EXPECT_EQ(b, d); + } +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/collective_helper.h b/paddle/fluid/platform/collective_helper.h index 197f905ba68a29..b0b857f7ee3f2a 100644 --- a/paddle/fluid/platform/collective_helper.h +++ b/paddle/fluid/platform/collective_helper.h @@ -22,6 +22,7 @@ #include "boost/variant.hpp" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/dynload/hccl.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -126,6 +127,113 @@ class NCCLCommContext { }; #endif +#if defined(PADDLE_WITH_ASCEND_CL) +// In order to apply hierarchical communication with HCCL, we need +// a communication ring contains HCCL communicators associated to a global +// HCCLUniqueId. E.g. for a hierarchical case, +// +// 11 - 12 21 - 22 +// | | | | +// 13 - 14 - 23 - 24 +// | | +// 31 - 32 - 41 - 42 +// | | | | +// 33 - 34 43 - 44 +// +// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24), +// (31,32,33,34), (41,42,43,44) as bottoms respectively. 
+// +// We could also use a single communication ring for the flatten case +// +// The HCCLComm instance is created and reversed in the HCCLCommContext +// singleton with a global user specified group id. +class NPUDeviceContext; + +#define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE" +#define ENV_RANK_ID "PADDLE_TRAINER_ID" + +class HCCLComm { + public: + virtual int ring_id() const = 0; + virtual int nranks() const = 0; + virtual int rank() const = 0; + virtual int device_id() const = 0; + virtual HcclComm comm() const = 0; + virtual aclrtStream stream() const = 0; + virtual NPUDeviceContext* dev_context() const = 0; + virtual ~HCCLComm() = default; +}; + +// A singleton HCCL communicator context reserves communication ring ids +class HCCLCommContext { + public: + static HCCLCommContext& Instance() { + static HCCLCommContext comm_ctx; + return comm_ctx; + } + + HCCLComm* CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, int rank, + int dev_id, int ring_id); + // a latter comm with the same dev_id and the same ring_id + // will override the former + HCCLComm* AssignHCCLComm(HcclComm comm, int nranks, int rank, int dev_id, + int ring_id); + + // retrieve a communicator by the ring id in multiprocessing mode + HCCLComm* Get(int ring_id) const { + PADDLE_ENFORCE_GT( + comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Communicator in ring id %d has not been initialized.", ring_id)); + PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(), 1, + platform::errors::InvalidArgument( + "One device id should be specified to retrieve from " + "multiple communicators.")); + return comm_map_.at(ring_id).begin()->second.get(); + } + + // retrieve a communicator by the ring id and the device id + HCCLComm* Get(int ring_id, int dev_id) const { + PADDLE_ENFORCE_GT( + comm_map_.count(ring_id), 0, + platform::errors::InvalidArgument( + "Communicator of ring id %d has not been initialized.", ring_id)); + PADDLE_ENFORCE_GT( + comm_map_.at(ring_id).count(dev_id), 0, + 
platform::errors::InvalidArgument( + "Communicator at device id %d has not been initialized in ring %d.", + dev_id, ring_id)); + return comm_map_.at(ring_id).at(dev_id).get(); + } + + // retrieve a communicator by the ring id and place + HCCLComm* Get(int ring_id, Place place) const { + return Get(ring_id, BOOST_GET_CONST(NPUPlace, place).device); + } + + private: + // Init global hcom + HCCLCommContext() {} + // we may use group feature in the feature + // HCCLCommContext() { InitHcomWorldGroup(); } + + HcclComm comm_; + + public: + ~HCCLCommContext() {} + + std::once_flag once_flag_; + std::mutex comm_map_mutex_; + // ring id to dev-HCCLComm + std::map>> comm_map_; + + // void InitHcomWorldGroup(); + void ReleaseHCCLComms(); + + DISABLE_COPY_AND_ASSIGN(HCCLCommContext); +}; +#endif + #if defined(PADDLE_WITH_XPU_BKCL) // In order to apply hierarchical communication with BKCL, we need // a communication ring contains BKCL communicators associated to a global diff --git a/paddle/fluid/platform/collective_helper_npu.cc b/paddle/fluid/platform/collective_helper_npu.cc new file mode 100644 index 00000000000000..f30e5fa833d44d --- /dev/null +++ b/paddle/fluid/platform/collective_helper_npu.cc @@ -0,0 +1,145 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include + +namespace paddle { +namespace platform { + +class HCCLCommImpl : public HCCLComm { + public: + void set_ring_id(int ring_id) { ring_id_ = ring_id; } + int ring_id() const override { return ring_id_; } + + void set_nranks(int nranks) { nranks_ = nranks; } + int nranks() const override { return nranks_; } + + void set_rank(int rank) { rank_ = rank; } + int rank() const override { return rank_; } + + int device_id() const override { + return BOOST_GET_CONST(NPUPlace, dev_ctx_->GetPlace()).device; + } + + ~HCCLCommImpl() { + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommDestroy(comm_)); + } + + void set_comm(HcclComm comm) { comm_ = comm; } + HcclComm comm() const override { return comm_; } + + aclrtStream stream() const override { return dev_ctx_->stream(); } + + void set_dev_ctx(std::unique_ptr&& dev_ctx) { + dev_ctx_ = std::move(dev_ctx); + } + NPUDeviceContext* dev_context() const override { return dev_ctx_.get(); } + + private: + int ring_id_; + int nranks_; + int rank_; + HcclComm comm_; + std::unique_ptr dev_ctx_; +}; + +HCCLComm* HCCLCommContext::CreateHCCLComm(HcclRootInfo* hccl_id, int nranks, + int rank, int dev_id, int ring_id) { + PADDLE_ENFORCE_NOT_NULL(hccl_id, + platform::errors::InvalidArgument( + "The hccl unique id should not be null.")); + PADDLE_ENFORCE_GT( + nranks, 1, + platform::errors::InvalidArgument( + "Expected nranks > 1. But received nranks is %d.", nranks)); + PADDLE_ENFORCE_GE(rank, 0, + platform::errors::InvalidArgument( + "Expected rank >= 0. But received rank is %d.", rank)); + PADDLE_ENFORCE_LT( + rank, nranks, + platform::errors::InvalidArgument( + "Expected rank < nranks. But received rank is %d, nranks is %d.", + rank, nranks)); + PADDLE_ENFORCE_GE( + dev_id, 0, + platform::errors::InvalidArgument( + "Expected dev_id >= 0. 
But received dev_id is %d.", dev_id)); + + HcclComm comm; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtSetDevice(dev_id)); + VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks + << ", hccl_id: " << hccl_id << ", rank: " << rank; + PADDLE_ENFORCE_NPU_SUCCESS( + platform::dynload::HcclCommInitRootInfo(nranks, hccl_id, rank, &comm)); + + VLOG(1) << "initialized comm: " << &comm << ", nranks: " << nranks + << ", hccl_id: " << hccl_id << ", rank: " << rank; + + auto* comm_wrapper = AssignHCCLComm(comm, nranks, rank, dev_id, ring_id); + + VLOG(1) << "hccl communicator of rank " << rank << " in ring " << ring_id + << " has been created on device " << dev_id + << ", with comm: " << comm_wrapper->comm(); + + std::call_once(once_flag_, []() { + std::atexit([]() { HCCLCommContext::Instance().ReleaseHCCLComms(); }); + }); + + return comm_wrapper; +} + +HCCLComm* HCCLCommContext::AssignHCCLComm(HcclComm comm, int nranks, int rank, + int dev_id, int ring_id) { + std::unique_ptr dev_ctx( + new NPUDeviceContext(NPUPlace(dev_id))); + + HCCLCommImpl* c = new HCCLCommImpl; + c->set_ring_id(ring_id); + c->set_nranks(nranks); + c->set_rank(rank); + c->set_comm(comm); + c->set_dev_ctx(std::move(dev_ctx)); + + comm_map_mutex_.lock(); + if (comm_map_.count(ring_id) == 0) { + comm_map_.emplace(ring_id, std::map>()); + } + auto& dev2comm = comm_map_[ring_id]; + + dev2comm.emplace(dev_id, std::unique_ptr(c)); + comm_map_mutex_.unlock(); + + if (ring_id == 0) { + auto* dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get( + platform::NPUPlace(dev_id))); + dev_ctx->set_hccl_comm(comm); + } + + return comm_map_[ring_id][dev_id].get(); +} + +void HCCLCommContext::ReleaseHCCLComms() { + for (auto& p : comm_map_) { + for (auto& q : p.second) { + q.second.reset(); + } + } +} + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/denormal.cc b/paddle/fluid/platform/denormal.cc index 35e9804e2a3081..4af156d1577dd9 100644 --- 
a/paddle/fluid/platform/denormal.cc +++ b/paddle/fluid/platform/denormal.cc @@ -28,7 +28,7 @@ #endif #if !defined(GCC_WITHOUT_INTRINSICS) && !defined(PADDLE_WITH_ARM) && \ - !defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS) + !defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS) && !defined(_WIN32) #define DENORM_USE_INTRINSICS #endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index a0ade3898c336b..9a47ac45462ed7 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h" #include "paddle/fluid/platform/cuda_device_guard.h" #endif - #include "glog/logging.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace memory { @@ -254,8 +254,9 @@ NPUDeviceContext::~NPUDeviceContext() { } void NPUDeviceContext::Wait() const { - NPUDeviceGuard guard(place_.device); - PADDLE_ENFORCE_NPU_SUCCESS(aclrtSynchronizeDevice()); + platform::RecordEvent record_event("NPUDeviceContext/wait"); + VLOG(4) << "NPU context(" << this << ") Wait"; + stream_->Wait(); } aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); } @@ -536,6 +537,7 @@ Place CUDAPinnedDeviceContext::GetPlace() const { return place_; } MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) : CPUDeviceContext(place), p_blobmap_() { p_blobmap_.reset(new BlobMap()); + p_exec_items_.reset(new ExecMap()); p_mutex_.reset(new std::mutex()); } @@ -559,7 +561,7 @@ MKLDNNDeviceContextThreadLocals::Body::~Body() { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(cpu_place); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap(exec_ptr_); } void MKLDNNDeviceContextThreadLocals::Body::set_cur_mkldnn_session_id( @@ -606,17 +608,34 @@ mkldnn::stream& 
MKLDNNDeviceContextThreadLocals::Body::get_stream(void) { return cur_stream; } -void MKLDNNDeviceContext::ResetBlobMap() { +void MKLDNNDeviceContext::ResetBlobMap(void* ptr) { std::lock_guard lock(*p_mutex_); if (!block_next_cache_clearing_) { VLOG(3) << "Clearing DNNL cache."; - p_blobmap_->clear(); + // If no specific executor pointer then clear + // everything. For executor pointer then clear only + // objects allocated when using given executor + if (ptr == nullptr) { + p_blobmap_->clear(); + } else { + for (auto& v : (*p_exec_items_)[ptr]) { + (v.first)->erase(v.second); + } + p_exec_items_->erase(ptr); + } } else { VLOG(3) << "Prevented Clearing DNNL cache."; block_next_cache_clearing_ = false; } } +void MKLDNNDeviceContext::LinkEntryWithExecutor(BlobPtr_t pblob, + KeyBlob::iterator it) const { + // Take current executor addess from TLS + // and for this executor's items add the one defined with arguments + (*p_exec_items_)[tls().get_curr_exec()].push_back(std::make_pair(pblob, it)); +} + void MKLDNNDeviceContext::BlockNextCacheClearing() { std::lock_guard lock(*p_mutex_); VLOG(3) << "Next DNNL cache clearing has been blocked."; @@ -681,7 +700,11 @@ void MKLDNNDeviceContext::SetBlob(const std::string& name, // Find Blob via name auto blob_it = pBlob->find(name); if (blob_it == pBlob->end()) { - (*pBlob)[name] = data; + auto el = + pBlob->insert(std::make_pair(name, data)); // (*pBlob)[name] = data; + // Register new element in per executor map + // to have easily erased when executor terminated + LinkEntryWithExecutor(pBlob, el.first); } else { blob_it->second = data; // set data to existing blob } diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 2578c9b6cdea5a..d91e14ec3aa923 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -189,19 +189,35 @@ class NPUDeviceContext : public DeviceContext { /*! \brief Return npu stream in the device context. 
*/ aclrtStream stream() const; -#ifdef PADDLE_WITH_ASCEND_HCCL - /*! \brief Return bkcl context. */ - HCCLContext_t hccl_context() const { return hccl_context_; } + template + void AddStreamCallback(Callback&& callback) const { + return stream_->AddCallback(callback); + } - /*! \brief Set bkcl context. */ - void set_hccl_context(HCCLContext_t context) { hccl_context_ = context; } + void WaitStreamCallback() const { return stream_->WaitCallback(); } + +#if defined(PADDLE_WITH_ASCEND_CL) + /*! \brief Return hccl communicators. */ + HcclComm hccl_comm() const { return hccl_comm_; } + + /*! \brief Set hccl communicators. */ + void set_hccl_comm(HcclComm comm) { hccl_comm_ = comm; } #endif + // template + // void AddStreamCallback(Callback&& callback) const { + // return stream_->AddCallback(callback); + // } + + // void WaitStreamCallback() const { return stream_->WaitCallback(); } + private: NPUPlace place_; aclrtContext context_; -#ifdef PADDLE_WITH_ASCEND_HCCL - HCCLContext_t hccl_context_; + +#ifdef PADDLE_WITH_ASCEND_CL + // HCCLContext_t hccl_context_; + HcclComm hccl_comm_{nullptr}; #endif // Need to be the same with other DeviceContext, @@ -657,6 +673,7 @@ class MKLDNNDeviceContextThreadLocals { mkldnn::stream cur_stream; std::string key_suffix; // Key identifying current Executor bool key_attach_thread_id = true; + void* exec_ptr_ = nullptr; Body(); ~Body(); @@ -673,6 +690,8 @@ class MKLDNNDeviceContextThreadLocals { const std::string& get_key_suffix(void) const { return key_suffix; } void disable_tid_in_key(void) { key_attach_thread_id = false; } bool is_tid_used_in_key(void) const { return key_attach_thread_id; } + void set_curr_exec(void* exec_ptr) { exec_ptr_ = exec_ptr; } + void* get_curr_exec(void) const { return exec_ptr_; } }; MKLDNNDeviceContextThreadLocals() = default; MKLDNNDeviceContextThreadLocals(const MKLDNNDeviceContextThreadLocals& c) = @@ -708,13 +727,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext { using ShapeBlob = 
umap_key_string_t; using BlobMap = umap_value_smart_t; + using ExecMap = std::unordered_map< + void*, std::vector, KeyBlob::iterator>>>; + explicit MKLDNNDeviceContext(CPUPlace place); /* \brief Get the active engine */ const mkldnn::engine& GetEngine() const { return tls().get_engine(); } + // Register object to currently used executor's map + void LinkEntryWithExecutor(BlobPtr_t, KeyBlob::iterator) const; + // Remove all entries from the blob map - void ResetBlobMap(); + void ResetBlobMap(void* ptr); // Prevent next ResetBlobMap() void BlockNextCacheClearing(); @@ -737,6 +762,9 @@ class MKLDNNDeviceContext : public CPUDeviceContext { private: std::shared_ptr p_blobmap_; + // Map key is pointer of executor and value is a data(iterator in map) needed + // to erase + std::shared_ptr p_exec_items_; std::shared_ptr p_mutex_; bool block_next_cache_clearing_ = false; }; diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 717b5ce83c6c98..724a9b8483cdee 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -587,6 +587,8 @@ class DeviceTracerImpl : public DeviceTracer { BOOST_GET_CONST(platform::CUDAPlace, r.place).GetDeviceId()); } else if (platform::is_cuda_pinned_place(r.place)) { event->set_place(proto::MemEvent::CUDAPinnedPlace); + } else if (platform::is_npu_place(r.place)) { + event->set_place(proto::MemEvent::NPUPlace); } else { PADDLE_THROW(platform::errors::Unimplemented( "The current place is not supported.")); diff --git a/paddle/fluid/platform/dynload/CMakeLists.txt b/paddle/fluid/platform/dynload/CMakeLists.txt index e65a38cd323aaf..8bff2ead0a2a3e 100644 --- a/paddle/fluid/platform/dynload/CMakeLists.txt +++ b/paddle/fluid/platform/dynload/CMakeLists.txt @@ -1,6 +1,6 @@ cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) -list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc cusolver.cc nvtx.cc) +list(APPEND CUDA_SRCS cublas.cc cudnn.cc curand.cc 
cusolver.cc nvtx.cc nvjpeg.cc) if (WITH_ROCM) list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc) @@ -9,7 +9,7 @@ endif() # There is no macOS version of NCCL. # Disable nvrtc and cuda_driver api on MacOS and Windows, and only do a early test on Linux. if (NOT APPLE AND NOT WIN32) - list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) + list(APPEND CUDA_SRCS nvrtc.cc cuda_driver.cc) if (WITH_NCCL) list(APPEND CUDA_SRCS nccl.cc) endif() @@ -32,6 +32,8 @@ endif(CUPTI_FOUND) if(WITH_ROCM) hip_library(dynload_cuda SRCS ${HIP_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) +elseif (WITH_ASCEND_CL) + cc_library(dynload_warpctc SRCS warpctc.cc hccl.cc DEPS dynamic_loader warpctc) else() nv_library(dynload_cuda SRCS ${CUDA_SRCS} DEPS dynamic_loader) cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 956acfe2771c50..be9cda4a2e9b6c 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -36,6 +36,13 @@ DEFINE_string(nccl_dir, "", "For instance, /usr/local/cuda/lib64. If default, " "dlopen will search cuda from LD_LIBRARY_PATH"); +DEFINE_string(hccl_dir, "", + "Specify path for loading hccl library, such as libhccl.so. " + "For instance, " + "/usr/local/Ascend/ascend-toolkit/latest/fwkacllib/lib64/. 
If " + "default, " + "dlopen will search hccl from LD_LIBRARY_PATH"); + DEFINE_string(cupti_dir, "", "Specify path for loading cupti.so."); DEFINE_string( @@ -93,6 +100,9 @@ static constexpr char* win_cublas_lib = static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;curand64_" CUDA_VERSION_MAJOR ".dll;curand64_10.dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll;nvjpeg64_10.dll"; static constexpr char* win_cusolver_lib = "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll;cusolver64_10.dll"; @@ -100,6 +110,9 @@ static constexpr char* win_cusolver_lib = static constexpr char* win_curand_lib = "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;curand64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_nvjpeg_lib = + "nvjpeg64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;nvjpeg64_" CUDA_VERSION_MAJOR ".dll"; static constexpr char* win_cusolver_lib = "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll"; @@ -323,6 +336,17 @@ void* GetCurandDsoHandle() { #endif } +void* GetNvjpegDsoHandle() { +#if defined(__APPLE__) || defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.dylib"); +#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_nvjpeg_lib, true, + {cuda_lib_path}); +#else + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libnvjpeg.so"); +#endif +} + void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); @@ -392,6 +416,24 @@ void* GetNCCLDsoHandle() { warning_msg); #endif } +void* GetHCCLDsoHandle() { + std::string warning_msg( + "You may need to install 'hccl2' from Huawei official website: " + "before install PaddlePaddle."); +#if defined(__APPLE__) || 
defined(__OSX__) + return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.dylib", true, {}, + warning_msg); +#elif defined(PADDLE_WITH_HIP) && defined(PADDLE_WITH_RCCL) + return GetDsoHandleFromSearchPath(FLAGS_rccl_dir, "librccl.so", true); + +#elif defined(PADDLE_WITH_ASCEND_CL) + return GetDsoHandleFromSearchPath(FLAGS_hccl_dir, "libhccl.so", true, {}, + warning_msg); +#else + return GetDsoHandleFromSearchPath(FLAGS_nccl_dir, "libnccl.so", true, {}, + warning_msg); +#endif +} void* GetTensorRtDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) diff --git a/paddle/fluid/platform/dynload/dynamic_loader.h b/paddle/fluid/platform/dynload/dynamic_loader.h index c3f5953c785791..9ab6dca0126bcb 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.h +++ b/paddle/fluid/platform/dynload/dynamic_loader.h @@ -29,11 +29,13 @@ void* GetCublasDsoHandle(); void* GetCUDNNDsoHandle(); void* GetCUPTIDsoHandle(); void* GetCurandDsoHandle(); +void* GetNvjpegDsoHandle(); void* GetCusolverDsoHandle(); void* GetNVRTCDsoHandle(); void* GetCUDADsoHandle(); void* GetWarpCTCDsoHandle(); void* GetNCCLDsoHandle(); +void* GetHCCLDsoHandle(); void* GetTensorRtDsoHandle(); void* GetMKLMLDsoHandle(); void* GetOpDsoHandle(const std::string& dso_name); diff --git a/paddle/fluid/platform/dynload/hccl.cc b/paddle/fluid/platform/dynload/hccl.cc new file mode 100644 index 00000000000000..5efac7691eb98b --- /dev/null +++ b/paddle/fluid/platform/dynload/hccl.cc @@ -0,0 +1,41 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL + +#include "paddle/fluid/platform/dynload/hccl.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag hccl_dso_flag; +void *hccl_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +HCCL_RAND_ROUTINE_EACH(DEFINE_WRAP); + +#if HCCL_VERSION_CODE >= 2212 +HCCL_RAND_ROUTINE_EACH_AFTER_2212(DEFINE_WRAP) +#endif + +#if HCCL_VERSION_CODE >= 2703 +HCCL_RAND_ROUTINE_EACH_AFTER_2703(DEFINE_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/dynload/hccl.h b/paddle/fluid/platform/dynload/hccl.h new file mode 100644 index 00000000000000..a56180ce2d4ca5 --- /dev/null +++ b/paddle/fluid/platform/dynload/hccl.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL + +#include +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +#define HCOM_GROUP_PREFIX "HCOM_GROUP_" + +namespace paddle { +namespace platform { +namespace dynload { + +extern std::once_flag hccl_dso_flag; +extern void* hccl_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_HCCL_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using HCCL_func = decltype(&::__name); \ + std::call_once(hccl_dso_flag, []() { \ + hccl_dso_handle = paddle::platform::dynload::GetHCCLDsoHandle(); \ + }); \ + static void* p_##__name = dlsym(hccl_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define HCCL_RAND_ROUTINE_EACH(__macro) \ + __macro(HcclReduceScatter); \ + __macro(HcclCommDestroy); \ + __macro(HcclAllReduce); \ + __macro(HcclCommInitRootInfo); \ + __macro(HcclGetRootInfo); \ + __macro(HcclBroadcast); \ + __macro(HcclCommInitClusterInfo); \ + __macro(HcclAllGather); + +HCCL_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) + +#if HCCL_VERSION_CODE >= 2212 +#define HCCL_RAND_ROUTINE_EACH_AFTER_2212(__macro) __macro(HCCLBroadcast); +HCCL_RAND_ROUTINE_EACH_AFTER_2212(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) +#endif + +#if HCCL_VERSION_CODE >= 2703 +#define HCCL_RAND_ROUTINE_EACH_AFTER_2703(__macro) \ + __macro(HCCLSend); \ + __macro(HCCLRecv); +HCCL_RAND_ROUTINE_EACH_AFTER_2703(DECLARE_DYNAMIC_LOAD_HCCL_WRAP) +#endif + +} // namespace dynload +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/dynload/miopen.h b/paddle/fluid/platform/dynload/miopen.h index 5ff4bff4bff652..77ff3f3ccbbb6e 100644 --- a/paddle/fluid/platform/dynload/miopen.h +++ b/paddle/fluid/platform/dynload/miopen.h @@ -110,6 +110,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(miopenActivationBackward); \ __macro(miopenConvolutionBackwardWeights); \ __macro(miopenConvolutionForward); \ + __macro(miopenConvolutionForwardBias); \ __macro(miopenConvolutionBackwardBias); \ __macro(miopenConvolutionForwardGetWorkSpaceSize); \ __macro(miopenConvolutionBackwardDataGetWorkSpaceSize); \ diff --git a/paddle/fluid/platform/dynload/nvjpeg.cc b/paddle/fluid/platform/dynload/nvjpeg.cc new file mode 100644 index 00000000000000..eb0ad78b9b73cd --- /dev/null +++ 
b/paddle/fluid/platform/dynload/nvjpeg.cc @@ -0,0 +1,27 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/dynload/nvjpeg.h" + +namespace paddle { +namespace platform { +namespace dynload { + +std::once_flag nvjpeg_dso_flag; +void *nvjpeg_dso_handle; + +#define DEFINE_WRAP(__name) DynLoad__##__name __name + +NVJPEG_RAND_ROUTINE_EACH(DEFINE_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/dynload/nvjpeg.h b/paddle/fluid/platform/dynload/nvjpeg.h new file mode 100644 index 00000000000000..ae457b2958f5de --- /dev/null +++ b/paddle/fluid/platform/dynload/nvjpeg.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include +#include // NOLINT + +#include "paddle/fluid/platform/dynload/dynamic_loader.h" +#include "paddle/fluid/platform/port.h" + +namespace paddle { +namespace platform { +namespace dynload { +extern std::once_flag nvjpeg_dso_flag; +extern void *nvjpeg_dso_handle; + +#define DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + nvjpegStatus_t operator()(Args... args) { \ + using nvjpegFunc = decltype(&::__name); \ + std::call_once(nvjpeg_dso_flag, []() { \ + nvjpeg_dso_handle = paddle::platform::dynload::GetNvjpegDsoHandle(); \ + }); \ + static void *p_##__name = dlsym(nvjpeg_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + +#define NVJPEG_RAND_ROUTINE_EACH(__macro) \ + __macro(nvjpegCreateSimple); \ + __macro(nvjpegJpegStateCreate); \ + __macro(nvjpegGetImageInfo); \ + __macro(nvjpegJpegStateDestroy); \ + __macro(nvjpegDecode); + +NVJPEG_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_NVJPEG_WRAP); + +} // namespace dynload +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index e72fbd246cf05d..1d105a1fd86825 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -27,7 +27,8 @@ void* tensorrt_plugin_dso_handle; #define DEFINE_WRAP(__name) DynLoad__##__name __name -TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP); +TENSORRT_RAND_ROUTINE_EACH_POINTER(DEFINE_WRAP); +TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(DEFINE_WRAP); TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DEFINE_WRAP); void* GetDsoHandle(const std::string& dso_name) { diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index e9bea9af9ca6e0..bc29a0472041af 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -37,7 +37,7 @@ void* 
GetTensorRtPluginHandle(); extern std::once_flag tensorrt_plugin_dso_flag; extern void* tensorrt_plugin_dso_handle; -#define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ +#define DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP(__name) \ struct DynLoad__##__name { \ template \ void* operator()(Args... args) { \ @@ -55,6 +55,23 @@ extern void* tensorrt_plugin_dso_handle; }; \ extern DynLoad__##__name __name +#define DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) { \ + std::call_once(tensorrt_dso_flag, []() { \ + tensorrt_dso_handle = paddle::platform::dynload::GetTensorRtHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL(p_##__name, \ + platform::errors::Unavailable( \ + "Load tensorrt api %s failed", #__name)); \ + using tensorrt_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + #define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ struct DynLoad__##__name { \ template \ @@ -76,20 +93,25 @@ extern void* tensorrt_plugin_dso_handle; #ifdef NV_TENSORRT_MAJOR #if (NV_TENSORRT_MAJOR >= 6) -#define TENSORRT_RAND_ROUTINE_EACH(__macro) \ - __macro(createInferBuilder_INTERNAL); \ - __macro(createInferRuntime_INTERNAL); \ +#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \ + __macro(createInferBuilder_INTERNAL); \ + __macro(createInferRuntime_INTERNAL); \ __macro(getPluginRegistry); #else -#define TENSORRT_RAND_ROUTINE_EACH(__macro) \ - __macro(createInferBuilder_INTERNAL); \ +#define TENSORRT_RAND_ROUTINE_EACH_POINTER(__macro) \ + __macro(createInferBuilder_INTERNAL); \ __macro(createInferRuntime_INTERNAL); #endif +#define TENSORRT_RAND_ROUTINE_EACH_NON_POINTER(__macro) \ + __macro(getInferLibVersion); + #define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \ __macro(initLibNvInferPlugins); 
-TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP) +TENSORRT_RAND_ROUTINE_EACH_POINTER(DECLARE_DYNAMIC_LOAD_TENSORRT_POINTER_WRAP) +TENSORRT_RAND_ROUTINE_EACH_NON_POINTER( + DECLARE_DYNAMIC_LOAD_TENSORRT_NON_POINTER_WRAP) TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP) #endif // end of NV_TENSORRT_MAJOR diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index f0809d34d493e9..d42733823e669b 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -47,6 +47,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "acl/acl.h" +#include "hccl/hccl_types.h" #endif // PADDLE_WITH_ASCEND_CL #include @@ -990,6 +991,16 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); } \ } while (0) +#define PADDLE_ENFORCE_CUDA_LAUNCH_SUCCESS(OP) \ + do { \ + auto res = cudaGetLastError(); \ + if (UNLIKELY(res != cudaSuccess)) { \ + auto msg = ::paddle::platform::build_nvidia_error_msg(res); \ + PADDLE_THROW(platform::errors::Fatal("CUDA error after kernel (%s): %s", \ + OP, msg)); \ + } \ + } while (0) + inline void retry_sleep(unsigned milliseconds) { #ifdef _WIN32 Sleep(milliseconds); @@ -1220,6 +1231,7 @@ struct NPUStatusType {}; } DEFINE_NPU_STATUS_TYPE(aclError, ACL_ERROR_NONE); +DEFINE_NPU_STATUS_TYPE(HcclResult, HCCL_SUCCESS); } // namespace details inline std::string build_npu_error_msg(aclError stat) { @@ -1228,6 +1240,12 @@ inline std::string build_npu_error_msg(aclError stat) { return sout.str(); } +inline std::string build_npu_error_msg(HcclResult stat) { + std::ostringstream sout; + sout << " HCCL error, the error code is : " << stat << ". 
"; + return sout.str(); +} + #define PADDLE_ENFORCE_NPU_SUCCESS(COND) \ do { \ auto __cond__ = (COND); \ diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 83b9544d23267b..1d76c2ea584b7e 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -578,6 +578,19 @@ DEFINE_string(tracer_mkldnn_ops_on, "", DEFINE_string(tracer_mkldnn_ops_off, "", "List of OneDNN operation types to be turned off"); +/** + * Debug related FLAG + * Name: check_kernel_launch + * Since Version: 2.1.0 + * Value Range: bool, default=false + * Example: + * Note: Check kernel launch status after every kernel compute. + */ +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DEFINE_bool(check_kernel_launch, false, + "Check kernel launch status after every kernel compute"); +#endif + /** * CUDNN related FLAG * Name: conv2d_disable_cudnn diff --git a/paddle/fluid/platform/hccl_helper.h b/paddle/fluid/platform/hccl_helper.h new file mode 100644 index 00000000000000..692f8dbe0bf1ee --- /dev/null +++ b/paddle/fluid/platform/hccl_helper.h @@ -0,0 +1,355 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#if defined(PADDLE_WITH_HCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_ASCEND_CL) + +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/dynload/hccl.h" +#endif + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" + +#define HCCL_ID_VARNAME "HCCLID" + +namespace paddle { +namespace platform { + +inline HcclDataType ToHCCLDataType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP32) { + return HCCL_DATA_TYPE_FP32; + } else if (type == framework::proto::VarType::FP16) { + return HCCL_DATA_TYPE_FP16; + } else if (type == framework::proto::VarType::INT32) { + return HCCL_DATA_TYPE_INT32; + } else if (type == framework::proto::VarType::INT8) { + return HCCL_DATA_TYPE_INT8; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "This datatype in hccl is not supported.")); + } +} + +// NOTE(minqiyang): according to the ncclGroupEnd documentations: +// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html, +// ncclGroupEnd will wait for all communicators to be initialized, which will +// cause blocking problem when a runtime_error was thrown, so try only guard +// HCCL actions when use it. 
+ +// class HCCLGroupGuard { +// public: +// static std::mutex &HCCLMutex() { +// static std::mutex mtx; +// return mtx; +// } + +// inline HCCLGroupGuard() { +// HCCLMutex().lock(); +// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupStart()); +// } + +// inline ~HCCLGroupGuard() PADDLE_MAY_THROW { +// PADDLE_ENFORCE_CUDA_SUCCESS(dynload::ncclGroupEnd()); +// HCCLMutex().unlock(); +// } +// }; + +struct HCCLContext { + std::unique_ptr ctx_; + HcclComm comm_; + + explicit HCCLContext(int dev_id) + : ctx_(new NPUDeviceContext(NPUPlace(dev_id))), comm_{nullptr} {} + + aclrtStream stream() const { return ctx_->stream(); } + HcclComm comm() const { return comm_; } + + int device_id() const { + return BOOST_GET_CONST(platform::NPUPlace, ctx_->GetPlace()).device; + } +}; + +struct HCCLContextMap { + std::unordered_map contexts_; + std::vector order_; + + explicit HCCLContextMap(const std::vector &places, + HcclRootInfo *hccl_id = nullptr, + size_t num_trainers = 1, size_t trainer_id = 0) { + PADDLE_ENFORCE_EQ(!places.empty(), true, + platform::errors::InvalidArgument( + "The HCCL place should not be empty.")); + order_.reserve(places.size()); + for (auto &p : places) { + int dev_id = BOOST_GET_CONST(NPUPlace, p).device; + order_.emplace_back(dev_id); + contexts_.emplace(dev_id, HCCLContext(dev_id)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + platform::errors::Unavailable("HCCL Context Map does not support " + "contain two or more same device.")); + + std::unique_ptr comms(new HcclComm[order_.size()]); + // if num_trainers == 1, should create a new nccl id for local comms. 
+ if (num_trainers == 1 && hccl_id == nullptr) { + // we do not know how to tackle this situation under hccl + // std::lock_guard guard(HCCLGroupGuard::HCCLMutex()); + // PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::ncclCommInitAll( + // comms.get(), static_cast(order_.size()), order_.data())); + } else { + PADDLE_ENFORCE_NOT_NULL(hccl_id, platform::errors::InvalidArgument( + "The HCCL id should not be null.")); + { + int nranks = num_trainers * order_.size(); + // HCCLGroupGuard gurad; + for (size_t i = 0; i < order_.size(); ++i) { + int gpu_id = order_[i]; + int rank; + if (order_.size() > 1) { + rank = trainer_id * order_.size() + i; + } else { + rank = trainer_id; + } + VLOG(1) << "init hccl rank:" << rank << ", nranks:" << nranks + << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; + aclrtSetDevice(gpu_id); + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclCommInitRootInfo( + nranks, hccl_id, rank, comms.get() + i)); + } + } + } + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } + } + + HCCLContextMap(const HCCLContextMap &other) = delete; + HCCLContextMap &operator=(const HCCLContextMap &other) = delete; + + NPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + + NPUDeviceContext *DevCtx(platform::Place p) const { + return DevCtx(BOOST_GET_CONST(NPUPlace, p).device); + } + + const HCCLContext &at(platform::Place p) const { + return this->at(BOOST_GET_CONST(NPUPlace, p).device); + } + + const HCCLContext &at(int dev_id) const { return contexts_.at(dev_id); } + + void WaitAll() { + for (auto &p : contexts_) { + p.second.ctx_->Wait(); + } + } +}; + +inline std::string GetFlatHCCLVarName(size_t pos) { + if (pos == 0) { + return HCCL_ID_VARNAME; + } + return string::Sprintf("%s_%d", HCCL_ID_VARNAME, static_cast(pos)); +} + +inline std::string GetHierarchicalExterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_exter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} +inline 
std::string GetHierarchicalInterHCCLVarName(size_t pos) { + return string::Sprintf("Hierarchical_inter_%s_%d", HCCL_ID_VARNAME, + static_cast(pos)); +} + +class HCCLCommunicator { + public: + HCCLCommunicator() {} + virtual ~HCCLCommunicator() PADDLE_MAY_THROW {} + + HCCLContextMap *DefaultFlatCtx() const { + if (flat_ctxs_.size() == 0) { + return nullptr; + } + + return flat_ctxs_[0].get(); + } + + std::vector> *GetFlatCtxs() { + return &flat_ctxs_; + } + + HCCLContextMap *GetFlatCtx(size_t run_order) const { + return flat_ctxs_[run_order % flat_ctxs_.size()].get(); + } + + HCCLContextMap *GetRunEnvHCCLCtx(size_t run_order, + bool use_hierarchical_allreduce) const { + if (!use_hierarchical_allreduce) { + return GetFlatCtx(run_order); + } + + return GetHierarchicalInterCtx(run_order); + } + + /* + When nccl inits nccl comm using ncclCommInitAll, it meets error when + allreduce ophandle and sync_batch_norm_op use ncclallreduce parallelly. So + create a new nccl comm for sync_batch_norm_op. And these codes should be + polished with a unified nccl management. 
+ */ + + HCCLContextMap *GetSyncBatchNormCtx( + framework::Scope *scope, const std::vector &places) { + auto *hccl_id_var = scope->FindVar(HCCL_ID_VARNAME); + if (hccl_id_var != nullptr) { + return DefaultFlatCtx(); + } + + if (sync_batch_norm_ctx_.get() == nullptr) { + sync_batch_norm_ctx_.reset(new HCCLContextMap(places)); + } + return sync_batch_norm_ctx_.get(); + } + + void InitFlatCtxs(const std::vector &places, + const std::vector &hccl_ids, + size_t trainers_num, size_t trainer_id) { + if (hccl_ids.size() == 0) { + auto ptr = new platform::HCCLContextMap(places); + VLOG(1) << "init local trainer"; + flat_ctxs_.emplace_back(ptr); + } else { + for (size_t i = 0; i < hccl_ids.size(); i++) { + auto ptr = new platform::HCCLContextMap(places, hccl_ids[i], + trainers_num, trainer_id); + VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; + flat_ctxs_.emplace_back(ptr); + } + } + + // as Executor have no way to use ncclComm created by ParallelExecutor, + // we assign all flatten contexts to HCCLCommContext to fix. 
+ int nranks = static_cast(trainers_num * places.size()); + int nrings = static_cast(flat_ctxs_.size()); + for (int ring_id = 0; ring_id < nrings; ++ring_id) { + for (size_t p = 0; p < places.size(); ++p) { + int rank = trainer_id * places.size() + p; + int dev_id = BOOST_GET_CONST(NPUPlace, places[p]).device; + auto &ctx = flat_ctxs_[ring_id]->contexts_.at(dev_id); + HCCLCommContext::Instance().AssignHCCLComm(ctx.comm_, nranks, rank, + dev_id, ring_id); + } + } + } + + void InitHierarchicalCtxs(const std::vector &places, + const std::vector &inter_hccl_ids, + const std::vector &exter_hccl_ids, + size_t trainers_num, size_t trainer_id, + size_t inter_trainers_num, + size_t exter_trainers_num) { + PADDLE_ENFORCE_EQ( + trainers_num, inter_trainers_num * exter_trainers_num, + platform::errors::InvalidArgument( + "trainers_num:%llu != inter_trainers_num:%llu * " + "exter_trainers_num:%llu", + trainers_num, inter_trainers_num, exter_trainers_num)); + + PADDLE_ENFORCE_GT( + inter_trainers_num, 1, + platform::errors::InvalidArgument( + "The inter_trainers_num:%llu should be larger than 1.", + inter_trainers_num)); + + int inter_trainer_id = trainer_id % inter_trainers_num; + for (size_t i = 0; i < inter_hccl_ids.size(); i++) { + VLOG(1) << "init inter_trainer_id:" << inter_trainer_id + << ", comm no:" << i; + auto local = new HCCLContextMap(places, inter_hccl_ids[i], + inter_trainers_num, inter_trainer_id); + + h_inter_ctxs_.emplace_back(local); + } + + int exter_trainer_id = -1; + if (trainer_id % inter_trainers_num == 0) { + exter_trainer_id = trainer_id / inter_trainers_num; + } + + if (exter_trainer_id >= 0) { + for (size_t i = 0; i < exter_hccl_ids.size(); i++) { + auto ex = new HCCLContextMap(places, exter_hccl_ids[i], + exter_trainers_num, exter_trainer_id); + VLOG(1) << "init exter_trainer_id:" << exter_trainer_id + << ", comm no:" << i; + h_exter_ctxs_.emplace_back(ex); + } + } + } + + bool NeedExterAllReduce() const { return h_exter_ctxs_.size() > 0; } + + 
HCCLContextMap *GetHierarchicalInterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_inter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_inter_ctxs_[run_order % h_inter_ctxs_.size()].get(); + } + + HCCLContextMap *GetHierarchicalExterCtx(size_t run_order) const { + PADDLE_ENFORCE_GT(h_exter_ctxs_.size(), 0, + platform::errors::InvalidArgument( + "Hierarchical ctxs should be initialized firstly!")); + return h_exter_ctxs_[run_order % h_exter_ctxs_.size()].get(); + } + + std::vector> *GetHierarchicalInterCtxs() { + return &h_inter_ctxs_; + } + + std::vector> *GetHierarchicalExterCtxs() { + return &h_exter_ctxs_; + } + + protected: + // Support multi nccl comm on default nccl ring while HCCLContextMap can't. + std::vector> flat_ctxs_; + + // h_inter_ctxs_ and h_exter_ctxs_ are for 2d allreduce. + // And h_exter_ctxs_ can support multi comm too. + std::vector> h_inter_ctxs_; + std::vector> h_exter_ctxs_; + + // just used for sync_batch_norm op. 
+ std::unique_ptr sync_batch_norm_ctx_; +}; + +} // namespace platform +} // namespace paddle +#endif diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 35776b9f1e6b88..0b683a742c9fd8 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -135,13 +135,14 @@ inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector& dims, return mkldnn::memory::desc({dims}, data_type, format); } -inline void ClearMKLDNNCache(const platform::Place& place) { +inline void ClearMKLDNNCache(const platform::Place& place, + void* ptr = nullptr) { // Clear mkl-dnn cache, if (platform::is_cpu_place(place)) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::MKLDNNDeviceContext* dev_ctx = (platform::MKLDNNDeviceContext*)pool.Get(place); - dev_ctx->ResetBlobMap(); + dev_ctx->ResetBlobMap(ptr); platform::MKLDNNDeviceContext::tls().set_cur_paddle_data_layout( paddle::framework::DataLayout::kNCHW); } @@ -452,6 +453,9 @@ inline void AttachPointerHashToMKLDNNKey(void* ptr, paddle::platform::MKLDNNDeviceContext::tls().set_key_suffix( "E" + std::to_string(reinterpret_cast(ptr))); } + // Let's register adress of current executor + paddle::platform::MKLDNNDeviceContext::tls().set_curr_exec(ptr); + // For first thread if (first_thread == ThreadIDasStr()) { paddle::platform::MKLDNNDeviceContext::tls().disable_tid_in_key(); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 0c45da63edd70e..e584b849368e41 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -630,6 +630,67 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { } }; +template +class BroadcastDataMKLDNNHandler + : public platform::MKLDNNHandlerT { + public: + BroadcastDataMKLDNNHandler(const dnnl::algorithm algo, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine engine, + platform::Place cpu_place, const 
Tensor* x, + const Tensor* y, float scale_x, float scale_y, + const std::string& uniq_name, + const std::vector& input_dims) + : platform::MKLDNNHandlerT( + dev_ctx, engine, cpu_place, + platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), + uniq_name)) { + if (!this->isCached()) { + PADDLE_ENFORCE_EQ( + x->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for X tensor.")); + PADDLE_ENFORCE_NE( + x->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for X tensor.")); + + PADDLE_ENFORCE_EQ( + y->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument("Wrong layout set for Y tensor.")); + PADDLE_ENFORCE_NE( + y->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument("Wrong format set for Y tensor.")); + + const auto src0_tz = framework::vectorize(x->dims()); + + const auto src0_md = dnnl::memory::desc( + src0_tz, platform::MKLDNNGetDataType(), x->format()); + const auto src1_md = dnnl::memory::desc( + input_dims, platform::MKLDNNGetDataType(), x->format()); + + dnnl::primitive_attr attributes; + attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x}); + attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y}); + + this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md, + src1_md, src0_md); + } + } + + std::shared_ptr AcquireSrcMemory(framework::Tensor* input) { + T* input_data = input->data(); + memset(input_data, 0, this->fwd_pd_->src_desc().get_size()); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src_desc(), to_void_cast(input_data), "@src0_mem_p"); + } + + std::shared_ptr AcquireSecondSrcMemory( + const framework::Tensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src1_desc(), to_void_cast(input_data), "@src1_mem_p"); + } +}; + template class ReductionMKLDNNHandler : public platform::MKLDNNHandlerT { @@ -639,7 +700,7 @@ class ReductionMKLDNNHandler const mkldnn::engine 
engine, platform::Place cpu_place, const Tensor* x, const Tensor* y, const std::string& uniq_name, - std::vector output_dims) + std::vector y_tz) : platform::MKLDNNHandlerT( dev_ctx, engine, cpu_place, platform::CreateKey(dev_ctx, framework::vectorize(x->dims()), @@ -653,14 +714,14 @@ class ReductionMKLDNNHandler x->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument("Wrong format set for X tensor.")); - const auto src_tz = framework::vectorize(x->dims()); + const auto x_tz = framework::vectorize(x->dims()); - const auto src_md = dnnl::memory::desc( - src_tz, platform::MKLDNNGetDataType(), x->format()); - const auto dst_md = memory::desc( - output_dims, platform::MKLDNNGetDataType(), x->format()); + const auto x_md = dnnl::memory::desc( + x_tz, platform::MKLDNNGetDataType(), x->format()); + const auto y_md = + memory::desc(y_tz, platform::MKLDNNGetDataType(), x->format()); - this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps); + this->AcquireForwardPrimitiveDescriptor(algo, x_md, y_md, p, eps); } } }; diff --git a/paddle/fluid/platform/npu_info.cc b/paddle/fluid/platform/npu_info.cc index 3814faa7662fc5..bb36eedb832381 100644 --- a/paddle/fluid/platform/npu_info.cc +++ b/paddle/fluid/platform/npu_info.cc @@ -190,6 +190,8 @@ void NPUMemcpySync(void *dst, const void *src, size_t count, enum aclrtMemcpyKind kind, size_t dst_max_count) { // NOTE(zhiqiu): The default max_count is count dst_max_count = dst_max_count ? dst_max_count : count; + VLOG(4) << dst << " " << dst_max_count << " " << src << " " << count << " " + << kind; PADDLE_ENFORCE_NPU_SUCCESS(aclrtMemcpy(dst, dst_max_count, src, count, kind)); } diff --git a/paddle/fluid/platform/npu_profiler.h b/paddle/fluid/platform/npu_profiler.h new file mode 100644 index 00000000000000..a7b674d0d0c3fe --- /dev/null +++ b/paddle/fluid/platform/npu_profiler.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include + +#include "acl/acl_prof.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +#ifdef PADDLE_WITH_ASCEND_STRING +// For CANN 20.2+ +// ACL_AICORE_ARITHMETIC_UTILIZATION = 0, record arithmetic stats +// ACL_AICORE_PIPE_UTILIZATION = 1, record pipeline +// ACL_AICORE_MEMORY_BANDWIDTH = 2, record memory +// ACL_AICORE_L0B_AND_WIDTH = 3, recore internal memory +// ACL_AICORE_RESOURCE_CONFLICT_RATIO = 5, record pipeline ratio +constexpr aclprofAicoreMetrics default_metrics = + ACL_AICORE_ARITHMETIC_UTILIZATION; +#else +// For CANN 20.1 +// ACL_AICORE_ARITHMATIC_THROUGHPUT = 0, record arithmetic stats +// ACL_AICORE_PIPELINE = 1, record pipeline +// ACL_AICORE_SYNCHRONIZATION = 2, record sync +// ACL_AICORE_MEMORY = 3, recore memory +// ACL_AICORE_INTERNAL_MEMORY = 4, recore internal memory +// ACL_AICORE_STALL = 5, record pipeline ratio +constexpr aclprofAicoreMetrics default_metrics = + ACL_AICORE_ARITHMATIC_THROUGHPUT; +#endif + +// ACL_PROF_ACL_API, record ACL API stats +// ACL_PROF_TASK_TIME, record AI core stats +// ACL_PROF_AICORE_METRICS, must include +// ACL_PROF_AICPU_TRACE, recore AICPU, not supported yet +constexpr uint64_t default_type = + ACL_PROF_ACL_API | ACL_PROF_AICORE_METRICS | ACL_PROF_TASK_TIME; + +aclprofConfig *NPUProfilerCreateConfig( + std::vector devices = {}, + aclprofAicoreMetrics metrics = default_metrics, uint64_t c = default_type, + 
aclprofAicoreEvents *events = nullptr) { + if (devices.size() == 0) { + int device_id = GetCurrentNPUDeviceId(); + devices.emplace_back(device_id); + } + aclprofConfig *config = + aclprofCreateConfig(devices.data(), devices.size(), metrics, events, c); + PADDLE_ENFORCE_NOT_NULL(config, paddle::platform::errors::External( + "Failed to create prof config for NPU")); + return config; +} + +void NPUProfilerDestroyConfig(const aclprofConfig *config) { + PADDLE_ENFORCE_NPU_SUCCESS(aclprofDestroyConfig(config)); +} + +void NPUProfilerInit(std::string output_path) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclprofInit(output_path.c_str(), output_path.size())); +} + +void NPUProfilerStart(const aclprofConfig *config) { + if (config == nullptr) { + // NOTE(zhiqiu): support single device by default. + int device_id = GetCurrentNPUDeviceId(); + std::vector devices = {static_cast(device_id)}; + config = NPUProfilerCreateConfig(devices); + } + PADDLE_ENFORCE_NPU_SUCCESS(aclprofStart(config)); +} + +void NPUProfilerStop(const aclprofConfig *config) { + PADDLE_ENFORCE_NPU_SUCCESS(aclprofStop(config)); + NPUProfilerDestroyConfig(config); +} + +void NPUProfilerFinalize() { PADDLE_ENFORCE_NPU_SUCCESS(aclprofFinalize()); } + +struct NPUProfConfigWrapper { + aclprofConfig *p_; + explicit NPUProfConfigWrapper(aclprofConfig *p) : p_(p) {} + aclprofConfig *ptr() { return p_; } +}; + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/npu_resource_pool.cc b/paddle/fluid/platform/npu_resource_pool.cc new file mode 100644 index 00000000000000..22b9e8f03971e5 --- /dev/null +++ b/paddle/fluid/platform/npu_resource_pool.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_ASCEND_CL +#include "paddle/fluid/platform/npu_resource_pool.h" +#include "paddle/fluid/platform/npu_info.h" + +namespace paddle { +namespace platform { + +NpuStreamResourcePool::NpuStreamResourcePool() { + int dev_cnt = platform::GetNPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetNPUDeviceId(dev_idx); + aclrtStream stream; + PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateStream(&stream)); + return stream; + }; + + auto deleter = [dev_idx](aclrtStream stream) { + platform::SetNPUDeviceId(dev_idx); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyStream(stream)); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +NpuStreamResourcePool& NpuStreamResourcePool::Instance() { + static NpuStreamResourcePool pool; + return pool; +} + +std::shared_ptr NpuStreamResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +NpuEventResourcePool::NpuEventResourcePool() { + int dev_cnt = platform::GetNPUDeviceCount(); + pool_.reserve(dev_cnt); + for (int dev_idx = 0; dev_idx < dev_cnt; ++dev_idx) { + auto creator = [dev_idx] { + platform::SetNPUDeviceId(dev_idx); + aclrtEvent event; + 
PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); + return event; + }; + + auto deleter = [dev_idx](aclrtEvent event) { + platform::SetNPUDeviceId(dev_idx); + PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyEvent(event)); + }; + + pool_.emplace_back(ResourcePool::Create(creator, deleter)); + } +} + +NpuEventResourcePool& NpuEventResourcePool::Instance() { + static NpuEventResourcePool pool; + return pool; +} + +std::shared_ptr NpuEventResourcePool::New(int dev_idx) { + PADDLE_ENFORCE_GE( + dev_idx, 0, + platform::errors::InvalidArgument( + "The dev_idx should be not less than 0, but got %d.", dev_idx)); + PADDLE_ENFORCE_LT( + dev_idx, pool_.size(), + platform::errors::OutOfRange( + "The dev_idx should be less than device count %d, but got %d.", + pool_.size(), dev_idx)); + return pool_[dev_idx]->New(); +} + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/npu_resource_pool.h b/paddle/fluid/platform/npu_resource_pool.h new file mode 100644 index 00000000000000..bfd6ec7f941120 --- /dev/null +++ b/paddle/fluid/platform/npu_resource_pool.h @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include +#include + +#include "acl/acl.h" +#include "paddle/fluid/platform/resource_pool.h" + +namespace paddle { +namespace platform { + +using NpuStreamObject = std::remove_pointer::type; +using NpuEventObject = std::remove_pointer::type; + +class NpuStreamResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static NpuStreamResourcePool &Instance(); + + private: + NpuStreamResourcePool(); + + DISABLE_COPY_AND_ASSIGN(NpuStreamResourcePool); + + private: + std::vector>> pool_; +}; + +class NpuEventResourcePool { + public: + std::shared_ptr New(int dev_idx); + + static NpuEventResourcePool &Instance(); + + private: + NpuEventResourcePool(); + + DISABLE_COPY_AND_ASSIGN(NpuEventResourcePool); + + private: + std::vector>> pool_; +}; + +} // namespace platform +} // namespace paddle + +#endif diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto index cfa3c6906f83f7..31193534a00be0 100644 --- a/paddle/fluid/platform/profiler.proto +++ b/paddle/fluid/platform/profiler.proto @@ -21,6 +21,7 @@ message Event { enum EventType { CPU = 0; GPUKernel = 1; + NPUKernel = 2; } optional EventType type = 8; optional string name = 1; @@ -39,6 +40,8 @@ message MemEvent { CUDAPlace = 0; CPUPlace = 1; CUDAPinnedPlace = 2; + XPUPlace = 3; + NPUPlace = 4; } optional uint64 start_ns = 1; optional uint64 end_ns = 2; diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc index 287c8fc37e005a..9f4ec9b3ce0d44 100644 --- a/paddle/fluid/platform/stream_callback_manager.cc +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -71,6 +71,8 @@ void StreamCallbackManager::AddCallback( #endif #if PADDLE_WITH_ASCEND_CL + VLOG(3) << "aclrtLaunchCallback at stream: " << stream_; + // TODO(zhiqiu): failed to call aclrtLaunchCallback PADDLE_ENFORCE_NPU_SUCCESS(aclrtLaunchCallback(StreamCallbackFunc, func, ACL_CALLBACK_BLOCK, stream_)); 
#endif diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index b43ad592a3a253..b30214e1d83559 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -61,7 +61,7 @@ set(PYBIND_SRCS if(WITH_ASCEND) set(PYBIND_DEPS ${PYBIND_DEPS} ascend_wrapper) set(PYBIND_SRCS ${PYBIND_SRCS} ascend_wrapper_py.cc) -endif(WITH_ASCEND) +endif() if(WITH_GLOO) set(PYBIND_DEPS ${PYBIND_DEPS} gloo_context) @@ -86,7 +86,11 @@ endif() if(WITH_PYTHON) # generate op pybind functions automatically for dygraph. - set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) + if (WITH_ASCEND_CL) + set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag ascend_wrapper) + else() + set(OP_FUNCTION_GENERETOR_DEPS pybind proto_desc executor layer tracer engine imperative_profiler imperative_flag) + endif() list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OP_LIB}) list(APPEND OP_FUNCTION_GENERETOR_DEPS ${GLOB_OPERATOR_DEPS}) @@ -100,6 +104,7 @@ if(WITH_PYTHON) add_executable(op_function_generator op_function_generator.cc) target_link_libraries(op_function_generator ${OP_FUNCTION_GENERETOR_DEPS}) + get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(op_function_generator ${os_dependency_modules}) if(WITH_ROCM) @@ -153,9 +158,9 @@ if(WITH_PYTHON) ) endif() else(WIN32) - # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, + # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # copy these *.so to current directory and append current directory to - # LD_LIBRARY_PATH. This is different with Windows platformm, which search + # LD_LIBRARY_PATH. This is different with Windows platformm, which search # *.dll in current directory automatically. 
add_custom_command(TARGET op_function_generator POST_BUILD diff --git a/paddle/fluid/pybind/ascend_wrapper_py.cc b/paddle/fluid/pybind/ascend_wrapper_py.cc index 303ab5c0fe8ca4..43725f7dc0f73e 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.cc +++ b/paddle/fluid/pybind/ascend_wrapper_py.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include #ifdef _POSIX_C_SOURCE @@ -108,12 +108,14 @@ enum AttrType { AT_NAMEATTR }; +#ifdef PADDLE_WITH_ASCEND void BindAscendDevice(py::module *m) { py::class_(*m, "NPUDevice") .def_static( "get_device_count", static_cast(&platform::ascend::NPUDevice::GetDeviceCount)); } +#endif void BindAscendGraph(py::module *m) { m->def("ge_initialize", &ge_initialize, "GEInitialize"); diff --git a/paddle/fluid/pybind/ascend_wrapper_py.h b/paddle/fluid/pybind/ascend_wrapper_py.h index e999080544c31b..15fb056c90e020 100644 --- a/paddle/fluid/pybind/ascend_wrapper_py.h +++ b/paddle/fluid/pybind/ascend_wrapper_py.h @@ -14,7 +14,7 @@ #pragma once -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "pybind11/pybind11.h" #include "pybind11/stl.h" diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc index bc8d1e5b40585d..4824a34e843bb1 100644 --- a/paddle/fluid/pybind/global_value_getter_setter.cc +++ b/paddle/fluid/pybind/global_value_getter_setter.cc @@ -41,6 +41,7 @@ DECLARE_int32(multiple_of_cupti_buffer_size); DECLARE_bool(reader_queue_speed_test_mode); DECLARE_int32(call_stack_level); DECLARE_bool(sort_sum_gradient); +DECLARE_bool(check_kernel_launch); // device management DECLARE_int32(paddle_num_threads); // executor @@ -376,7 +377,7 @@ static void RegisterGlobalVarGetterSetter() { FLAGS_fraction_of_gpu_memory_to_use, FLAGS_initial_gpu_memory_in_mb, 
FLAGS_reallocate_gpu_memory_in_mb, FLAGS_enable_cublas_tensor_op_math, FLAGS_selected_gpus, FLAGS_sync_nccl_allreduce, - FLAGS_conv2d_disable_cudnn); + FLAGS_conv2d_disable_cudnn, FLAGS_check_kernel_launch); #endif #ifdef PADDLE_WITH_XPU REGISTER_PUBLIC_GLOBAL_VAR(FLAGS_selected_xpus); diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index 0817dc33671621..450c992d41118a 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -718,7 +718,8 @@ void BindImperative(py::module *m_ptr) { { // Release gil and do tracing py::gil_scoped_release release; - tracer->TraceOp("set_value", ins, outs, std::move(attrs)); + tracer->TraceOp("set_value", ins, outs, std::move(attrs), + {{"Input", "Out"}}); } } else { auto self_numpy = TensorToPyArray(*self_tensor); @@ -745,7 +746,7 @@ void BindImperative(py::module *m_ptr) { // inplace operator for the VarBase self. self->BumpInplaceVersion(); }) - .def("__getitem__", + .def("_getitem_index_not_tensor", [](std::shared_ptr &self, py::handle _index) { std::vector slice_axes, slice_starts, slice_ends, slice_strides, decrease_axis, infer_flags; @@ -783,6 +784,70 @@ void BindImperative(py::module *m_ptr) { return out; } }) + .def( + "_getitem_from_offset", + [](std::shared_ptr &self, const py::args &args) { + const auto &tensor = self->Var().Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "Tensor of %s is Empty, please check if it has no data.", + self->Name())); + + const auto &tensor_dims = tensor.dims(); + + std::vector dims(tensor_dims.size()); + std::vector strides(tensor_dims.size()); + + size_t numel = 1; + for (int i = tensor_dims.size() - 1; i >= 0; --i) { + strides[i] = numel; + dims[i] = static_cast(tensor_dims[i]); + numel *= dims[i]; + } + size_t offset = 0; + if (args.empty()) { + PADDLE_ENFORCE_EQ( + numel, 1, + platform::errors::InvalidArgument( + "only one element tensors can be converted to Python " + 
"scalars when no input coordinates")); + } else if (args.size() == 1) { + offset = args[0].cast(); + PADDLE_ENFORCE_LT( + offset, numel, + platform::errors::InvalidArgument( + "index %d is out of bounds for size %d", offset, numel)); + } else { + PADDLE_ENFORCE_EQ(args.size(), dims.size(), + platform::errors::InvalidArgument( + "incorrect number of indices for Tensor")); + + for (size_t i = 0; i < args.size(); ++i) { + size_t index = args[i].cast(); + PADDLE_ENFORCE_LT( + index, dims[i], + platform::errors::InvalidArgument( + "index %d is out fo bounds for axis %d with size %d", + index, i, dims[i])); + offset += index * strides[i]; + } + } +#define TENSOR_TO_PY_SCALAR(T, proto_type) \ + if (tensor.type() == proto_type) { \ + std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(proto_type); \ + T b = TensorGetElement(tensor, offset); \ + return py::array(py::dtype(py_dtype_str.c_str()), {}, {}, \ + static_cast(&b)); \ + } + + _ForEachDataType_(TENSOR_TO_PY_SCALAR); +#undef TENSOR_TO_PY_SCALAR + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported tensor data type: %s", + framework::DataTypeToString(tensor.type()))); + }, + py::return_value_policy::copy) .def("_inplace_version", [](imperative::VarBase &self) -> uint32_t { const auto &var = self.MutableVar(); @@ -1487,7 +1552,7 @@ void BindImperative(py::module *m_ptr) { allow_ops); imperative::AmpOperators::Instance().GetMutableBlockOps()->swap( block_ops); - VLOG(4) << "AMP operators changed, " + VLOG(5) << "AMP operators changed, " << imperative::AmpOperators::Instance(); }) .def("_get_amp_op_list", diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index dd9cb65142a3de..8a5ad5852aedf5 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -467,7 +467,10 @@ void BindAnalysisConfig(py::module *m) { .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) 
.def("enable_xpu", &AnalysisConfig::EnableXpu, - py::arg("l3_workspace_size")) + py::arg("l3_workspace_size") = 16 * 1024 * 1024, + py::arg("locked") = false, py::arg("autotune") = true, + py::arg("autotune_file") = "", py::arg("precision") = "int16", + py::arg("adaptive_seqlen") = false) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("use_gpu", &AnalysisConfig::use_gpu) .def("use_xpu", &AnalysisConfig::use_xpu) @@ -512,6 +515,8 @@ void BindAnalysisConfig(py::module *m) { py::arg("dla_core") = 0) .def("tensorrt_dla_enabled", &AnalysisConfig::tensorrt_dla_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) + .def("enable_dlnne", &AnalysisConfig::EnableDlnne, + py::arg("min_subgraph_size") = 3) .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, py::arg("zero_copy") = false, diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 2c1927f49f6b70..bf3c77843219c7 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -26,7 +26,7 @@ #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/pybind/pybind.h" #include "paddle/fluid/string/string_helper.h" -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/framework/fleet/ascend_wrapper.h" #endif @@ -123,14 +123,11 @@ std::map> op_passing_outs_map = { {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, {"fill_constant", {"Out"}}, + {"recv_v2", {"Out"}}, {"matmul", {"Out"}}, {"c_broadcast", {"Out"}}, {"c_sync_calc_stream", {"Out"}}, {"c_sync_comm_stream", {"Out"}}, - {"c_allreduce_sum", {"Out"}}, - {"c_allreduce_max", {"Out"}}, - {"c_allreduce_min", {"Out"}}, - {"c_allreduce_prod", {"Out"}}, {"c_reduce_sum", {"Out"}}, {"c_reduce_max", {"Out"}}, {"c_reduce_min", {"Out"}}, @@ -182,16 +179,16 @@ const char* 
OUT_DUPLICABLE_INITIALIZER_TEMPLATE = R"({"%s", ConstructDuplicableO const char* INPUT_INITIALIZER_TEMPLATE = R"({"%s", {%s}})"; const char* INPUT_LIST_INITIALIZER_TEMPLATE = R"({"%s", %s})"; -const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( - if (%s != nullptr) { - ins["%s"] = {%s}; - } +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( + if (%s != nullptr) { + ins["%s"] = {%s}; + } )"; -const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( +const char* INPUT_INITIALIZER_TEMPLATE_WITH_NULL_LIST = R"( if (%s.size() != 0) { - ins["%s"] = %s; - } + ins["%s"] = %s; + } )"; const char* OUTPUT_INITIALIZER_TEMPLATE_WITH_NULL = R"( @@ -264,8 +261,8 @@ R"( imperative::NameVarBaseMap ins = %s; %s tracer->TraceOp("%s", ins, outs, attrs, {%s}); - return %s; - } + return %s; + } })"; const char* PYBIND_ITEM_TEMPLATE = R"( %s.def("%s", &%s);)"; @@ -350,7 +347,7 @@ std::string GenerateOpFunctionsBody( } ins_initializer += "}"; - if (input_args.back() == ',') { + if (!input_args.empty() && input_args.back() == ',') { input_args.pop_back(); } @@ -364,6 +361,7 @@ std::string GenerateOpFunctionsBody( int outs_num = 0; for (auto& output : op_proto->outputs()) { auto& out_name = output.name(); + // skip those dispensable oututs if (output.dispensable() && !FindOutsMap(op_type, out_name)) { continue; @@ -459,7 +457,7 @@ std::string GenerateOpFunctionsBody( return_str.pop_back(); } outs_initializer += "}"; - if (inplace_mapping_str.back() == ',') { + if (!inplace_mapping_str.empty() && inplace_mapping_str.back() == ',') { inplace_mapping_str.pop_back(); } if (!use_inplace_strategy && FindViewOpMap(op_type)) { @@ -567,7 +565,7 @@ int main(int argc, char* argv[]) { return -1; } -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL auto ascend_ptr = paddle::framework::AscendInstance::GetInstance(); ascend_ptr->InitGEForUT(); #endif @@ -602,8 +600,9 @@ int main(int argc, char* argv[]) { out.close(); -#ifdef PADDLE_WITH_ASCEND +#ifdef PADDLE_WITH_ASCEND_CL 
ge::GEFinalize(); #endif + return 0; } diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 06b3f10fefafa8..6fa49a85423c58 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -235,6 +235,7 @@ void BindOpDesc(pybind11::module *m) { const std::vector &vec_var_name) { self.SetOutput(name, vec_var_name); }) + .def("remove_output", &pd::OpDesc::RemoveOutput) .def("input_arg_names", &pd::OpDesc::InputArgumentNames) .def("output_arg_names", &pd::OpDesc::OutputArgumentNames) .def("_rename_input", &pd::OpDesc::RenameInput) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 39a78d86976ae9..560d8c892b09f9 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -109,6 +109,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/npu_info.h" +#include "paddle/fluid/platform/npu_profiler.h" #endif #ifdef PADDLE_WITH_XPU @@ -267,11 +268,6 @@ bool IsCompiledWithBrpc() { #ifndef PADDLE_WITH_DISTRIBUTE return false; #endif - -#ifdef PADDLE_WITH_GRPC - return false; -#endif - return true; } @@ -500,7 +496,56 @@ PYBIND11_MODULE(core_noavx, m) { #endif return tensor; }); - + m.def("_save_lod_tensor", [](const LoDTensor &tensor, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fout), true, + platform::errors::Unavailable( + "Cannot open %s to save variables.", str_file_name)); + SerializeToStream(fout, tensor); + + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + m.def("_load_lod_tensor", [](LoDTensor &tensor, + const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ(static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load variables.", str_file_name)); + + DeserializeFromStream(fin, &tensor); + int64_t tellg = fin.tellg(); + fin.close(); + return 
tellg; + }); + m.def("_save_selected_rows", [](const SelectedRows &selected_rows, + const std::string &str_file_name) { + std::ofstream fout(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fout), true, + platform::errors::Unavailable("Cannot open %s to save SelectedRows.", + str_file_name)); + + SerializeToStream(fout, selected_rows); + int64_t tellp = fout.tellp(); + fout.close(); + return tellp; + }); + m.def("_load_selected_rows", + [](SelectedRows &selected_rows, const std::string &str_file_name) { + std::ifstream fin(str_file_name, std::ios::binary); + PADDLE_ENFORCE_EQ( + static_cast(fin), true, + platform::errors::Unavailable( + "Cannot open %s to load SelectedRows.", str_file_name)); + + DeserializeFromStream(fin, &selected_rows); + int64_t tellg = fin.tellg(); + fin.close(); + return tellg; + }); m.def("_save_static_dict", [](const std::string &str_file_name, const py::handle &vec_var_list, const Scope &scope) { @@ -581,11 +626,6 @@ PYBIND11_MODULE(core_noavx, m) { make_ddim(x_dim), make_ddim(y_dim), -1)); }); -#ifdef PADDLE_WITH_ASCEND_CL - m.def("_npu_finalize", - []() { platform::AclInstance::Instance().Finalize(); }); -#endif - m.def( "_append_python_callable_object_and_return_id", [](py::object py_obj) -> size_t { @@ -1744,7 +1784,7 @@ All parameter, weight, gradient are variables in Paddle. "Cannot use NPU because you have installed CPU/GPU version " "PaddlePaddle.\n" "If you want to use NPU, please try to install NPU version " - "PaddlePaddle by: pip install paddlepaddle-xpu\n" + "PaddlePaddle by: pip install paddlepaddle-npu\n" "If you only have CPU, please change NPUPlace(%d) to be " "CPUPlace().\n", dev_id); @@ -2180,6 +2220,29 @@ All parameter, weight, gradient are variables in Paddle. 
#endif #endif +#ifdef PADDLE_WITH_ASCEND_CL + m.def("get_npu_device_count", platform::GetNPUDeviceCount); + m.def("npu_finalize", []() { platform::AclInstance::Instance().Finalize(); }); + + py::class_(m, "NPUProfConfigWrapper"); + + m.def("npu_prof_init", platform::NPUProfilerInit); + m.def("npu_prof_start", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerStart(c.ptr()); + }); + m.def("npu_prof_stop", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerStop(c.ptr()); + }); + m.def("npu_prof_finalize", platform::NPUProfilerFinalize); + m.def("npu_prof_create_config", []() { + return platform::NPUProfConfigWrapper(platform::NPUProfilerCreateConfig()); + }); + + m.def("npu_prof_destropy_config", [](platform::NPUProfConfigWrapper c) { + platform::NPUProfilerDestroyConfig(c.ptr()); + }); +#endif + py::enum_(m, "TracerOption", py::arithmetic()) .value("kDefault", platform::TracerOption::kDefault) .value("kOpDetail", platform::TracerOption::kOpDetail) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index ab1dd8a180b5b6..416361d06a996e 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -663,6 +663,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, } bool is_gpu_tensor = platform::is_gpu_place(tensor.place()); bool is_xpu_tensor = platform::is_xpu_place(tensor.place()); + bool is_npu_tensor = platform::is_npu_place(tensor.place()); const auto &tensor_dims = tensor.dims(); auto tensor_dtype = tensor.type(); size_t sizeof_dtype = framework::SizeOfType(tensor_dtype); @@ -681,7 +682,7 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, std::string py_dtype_str = details::TensorDTypeToPyDTypeStr(tensor.type()); - if (!is_gpu_tensor && !is_xpu_tensor) { + if (!is_gpu_tensor && !is_xpu_tensor && !is_npu_tensor) { if (!need_deep_copy) { auto base = py::cast(std::move(tensor)); return py::array(py::dtype(py_dtype_str.c_str()), py_dims, py_strides, @@ -749,6 
+750,34 @@ inline py::array TensorToPyArray(const framework::Tensor &tensor, PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use CUDAPlace in CPU only version, " "Please recompile or reinstall Paddle with CUDA support.")); +#endif + } else if (is_npu_tensor) { +#ifdef PADDLE_WITH_ASCEND_CL + py::array py_arr(py::dtype(py_dtype_str.c_str()), py_dims, py_strides); + PADDLE_ENFORCE_EQ(py_arr.writeable(), true, + platform::errors::InvalidArgument( + "PyArray is not writable, in which case memory leak " + "or double free would occur")); + PADDLE_ENFORCE_EQ( + py_arr.owndata(), true, + platform::errors::InvalidArgument( + "PyArray does not own data, in which case memory leak " + "or double free would occur")); + + size_t copy_bytes = sizeof_dtype * numel; + auto p = BOOST_GET_CONST(platform::NPUPlace, tensor.place()); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &ctx = *pool.Get(tensor.place()); + paddle::memory::Copy( + platform::CPUPlace(), py_arr.mutable_data(), p, tensor_buf_ptr, + copy_bytes, + reinterpret_cast(ctx).stream()); + ctx.Wait(); + return py_arr; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use NPUPlace in CPU/GPU/XPU version, " + "Please recompile or reinstall Paddle with NPU support.")); #endif } PADDLE_THROW(platform::errors::Unimplemented("Place is not supported")); diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 20c8794ba634c7..e53828ff10be60 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -26,22 +26,42 @@ set cache_dir=%work_dir:Paddle=cache% if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools ) -taskkill /f /im op_function_generator.exe -wmic process where name="op_function_generator.exe" call terminate +taskkill /f /im op_function_generator.exe 2>NUL +taskkill /f /im cmake.exe 2>NUL +taskkill /f /im MSBuild.exe 2>NUL +taskkill /f /im 
CL.exe 2>NUL +taskkill /f /im Lib.exe 2>NUL +taskkill /f /im link.exe 2>NUL +taskkill /f /im vctip.exe 2>NUL +taskkill /f /im cvtres.exe 2>NUL +taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL taskkill /f /im python.exe 2>NUL - +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL +wmic process where name="python.exe" call terminate 2>NUL rem ------initialize common variable------ if not defined GENERATOR set GENERATOR="Visual Studio 15 2017 Win64" if not defined BRANCH set BRANCH=develop -if not defined WITH_TENSORRT set WITH_TENSORRT=ON +if not defined WITH_TENSORRT set WITH_TENSORRT=ON if not defined TENSORRT_ROOT set TENSORRT_ROOT=D:/TensorRT if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto if not defined WITH_GPU set WITH_GPU=ON if not defined WITH_MKL set WITH_MKL=ON if not defined WITH_AVX set WITH_AVX=ON if not defined WITH_TESTING set WITH_TESTING=ON -if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=OFF +if not defined MSVC_STATIC_CRT set MSVC_STATIC_CRT=ON if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON @@ -54,6 +74,8 @@ if not defined INFERENCE_DEMO_INSTALL_DIR set INFERENCE_DEMO_INSTALL_DIR=%cache_ if not defined LOG_LEVEL set LOG_LEVEL=normal if not defined PRECISION_TEST set PRECISION_TEST=OFF if not defined NIGHTLY_MODE set PRECISION_TEST=OFF +if not defined retry_times set retry_times=2 +if not defined 
PYTHON_ROOT set PYTHON_ROOT=C:\Python37 rem -------set cache build directory----------- rmdir build\python /s/q @@ -62,9 +84,6 @@ rmdir build\paddle_inference_install_dir /s/q rmdir build\paddle_inference_c_install_dir /s/q del build\CMakeCache.txt -: set CI_SKIP_CPP_TEST if only *.py changed -git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON - if "%WITH_CACHE%"=="OFF" ( rmdir build /s/q goto :mkbuild @@ -114,72 +133,21 @@ dir . dir %cache_dir% dir paddle\fluid\pybind\Release -rem ------initialize the python environment------ -if not defined PYTHON_ROOT set PYTHON_ROOT=C:\Python37 -set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe -set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% - -rem ToDo: virtual environment can't be deleted safely, some process not exit when task is canceled -rem Now use system python environment temporarily -rem %PYTHON_EXECUTABLE% -m pip install virtualenv -rem %PYTHON_EXECUTABLE% -m virtualenv paddle_winci -rem call paddle_winci\Scripts\activate.bat - -rem ------pre install python requirement---------- -where python -where pip -pip install wheel --user -pip install -r %work_dir%\python\unittest_py\requirements.txt --user -pip install -r %work_dir%\python\requirements.txt --user - -if %ERRORLEVEL% NEQ 0 ( - echo pip install requirements.txt failed! - exit /b 7 -) - -rem ------pre install clcache and init config---------- -rem pip install clcache --user -pip uninstall -y clcache -:: set USE_CLCACHE to enable clcache -rem set USE_CLCACHE=1 -:: In some scenarios, CLCACHE_HARDLINK can save one file copy. -rem set CLCACHE_HARDLINK=1 -:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported -rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 -:: set maximum cache size to 20G -rem clcache.exe -M 21474836480 - -:: install ninja if GENERATOR is Ninja -if %GENERATOR% == "Ninja" ( - pip install ninja - if %errorlevel% NEQ 0 ( - echo pip install ninja failed! 
- exit /b 7 - ) -) - -rem ------show summary of current environment---------- -cmake --version -if "%WITH_GPU%"=="ON" ( - nvcc --version - where nvidia-smi - nvidia-smi -) -python %work_dir%\tools\summary_env.py -%cache_dir%\tools\busybox64.exe bash %work_dir%\tools\get_cpu_info.sh - goto :CASE_%1 echo "Usage: paddle_build.bat [OPTION]" echo "OPTION:" echo "wincheck_mkl: run Windows MKL/GPU/UnitTest CI tasks on Windows" echo "wincheck_openbals: run Windows OPENBLAS/CPU CI tasks on Windows" +echo "build_avx_whl: build Windows avx whl package on Windows" +echo "build_no_avx_whl: build Windows no avx whl package on Windows" exit /b 1 rem ------PR CI windows check for MKL/GPU---------- :CASE_wincheck_mkl set WITH_MKL=ON set WITH_GPU=ON +set WITH_AVX=ON set MSVC_STATIC_CRT=OFF call :cmake || goto cmake_error @@ -192,9 +160,11 @@ goto:success rem ------PR CI windows check for OPENBLAS/CPU------ :CASE_wincheck_openblas -set WITH_MKL=ON +set WITH_MKL=OFF set WITH_GPU=OFF +set WITH_AVX=OFF set MSVC_STATIC_CRT=ON +set retry_times=1 call :cmake || goto cmake_error call :build || goto build_error @@ -209,6 +179,7 @@ rem ------Build windows avx whl package------ set WITH_AVX=ON set ON_INFER=OFF set CUDA_ARCH_NAME=All +set retry_times=4 call :cmake || goto cmake_error call :build || goto build_error @@ -220,6 +191,7 @@ rem ------Build windows no-avx whl package------ set WITH_AVX=OFF set ON_INFER=OFF set CUDA_ARCH_NAME=All +set retry_times=4 call :cmake || goto cmake_error call :build || goto build_error @@ -240,8 +212,10 @@ rem "Other configurations are added here" rem :CASE_wincheck_others rem call ... + rem --------------------------------------------------------------------------------------------- :cmake +@ECHO OFF echo ======================================== echo Step 1. Cmake ... echo ======================================== @@ -249,16 +223,58 @@ echo ======================================== rem Configure the environment for 64-bit builds. 
'DISTUTILS_USE_SDK' indicates that the user has selected the compiler. call "C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvars64.bat" set DISTUTILS_USE_SDK=1 +rem Windows 10 Kit bin dir +set PATH=C:\Program Files (x86)\Windows Kits\10\bin\10.0.17763.0\x64;%PATH% for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% -@ECHO ON -if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.0 +if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2 set PATH=%TENSORRT_ROOT:/=\%\lib;%CUDA_TOOLKIT_ROOT_DIR%\bin;%CUDA_TOOLKIT_ROOT_DIR%\libnvvp;%PATH% -rem ------set third_party cache dir------ +rem install ninja if GENERATOR is Ninja +if %GENERATOR% == "Ninja" ( + pip install ninja + if %errorlevel% NEQ 0 ( + echo pip install ninja failed! + exit /b 7 + ) +) + +rem ------show summary of current GPU environment---------- +cmake --version +if "%WITH_GPU%"=="ON" ( + nvcc --version + nvidia-smi +) + +rem ------initialize the python environment------ +set PYTHON_EXECUTABLE=%PYTHON_ROOT%\python.exe +set PATH=%PYTHON_ROOT%;%PYTHON_ROOT%\Scripts;%PATH% +if %WITH_PYTHON% == "OFF" ( + where python + where pip + pip install wheel --user + pip install -r %work_dir%\python\requirements.txt --user + if %ERRORLEVEL% NEQ 0 ( + echo pip install requirements.txt failed! + exit /b 7 + ) +) +rem ------pre install clcache and init config---------- +rem pip install clcache --user +pip uninstall -y clcache +:: set USE_CLCACHE to enable clcache +rem set USE_CLCACHE=1 +:: In some scenarios, CLCACHE_HARDLINK can save one file copy. 
+rem set CLCACHE_HARDLINK=1 +:: If it takes more than 1000s to obtain the right to use the cache, an error will be reported +rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 +:: set maximum cache size to 20G +rem clcache.exe -M 21474836480 + +rem ------set third_party cache dir------ : clear third party cache every once in a while for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# set day_now=%datetime:~6,2% @@ -342,7 +358,7 @@ if %GENERATOR% == "Ninja" ( ) if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR 2 ( + if %build_times% GTR %retry_times% ( exit /b 7 ) else ( echo Build third_party failed, will retry! @@ -356,6 +372,28 @@ set build_times=1 :: reset clcache zero stats for collect PR's actual hit rate rem clcache.exe -z +rem -------clean up environment again----------- +taskkill /f /im MSBuild.exe 2>NUL +taskkill /f /im cl.exe 2>NUL +taskkill /f /im lib.exe 2>NUL +taskkill /f /im link.exe 2>NUL +taskkill /f /im vctip.exe 2>NUL +taskkill /f /im cvtres.exe 2>NUL +taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL +wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL + echo Build Paddle the %build_times% time: if %GENERATOR% == "Ninja" ( ninja -j %PARALLEL_PROJECT_COUNT% @@ -369,7 +407,7 @@ if %GENERATOR% == "Ninja" ( if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 - if %build_times% GTR 1 ( + if %build_times% GTR %retry_times% ( exit /b 7 ) else ( echo Build 
Paddle failed, will retry! @@ -450,6 +488,16 @@ echo ======================================== echo Step 4. Running unit tests ... echo ======================================== + +: set CI_SKIP_CPP_TEST if only *.py changed +git diff --name-only %BRANCH% | findstr /V "\.py" || set CI_SKIP_CPP_TEST=ON + +pip install -r %work_dir%\python\unittest_py\requirements.txt --user +if %ERRORLEVEL% NEQ 0 ( + echo pip install unittest requirements.txt failed! + exit /b 7 +) + for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -706,9 +754,21 @@ taskkill /f /im git-remote-https.exe 2>NUL taskkill /f /im vctip.exe 2>NUL taskkill /f /im cvtres.exe 2>NUL taskkill /f /im rc.exe 2>NUL +taskkill /f /im mspdbsrv.exe 2>NUL +taskkill /f /im csc.exe 2>NUL +taskkill /f /im python.exe 2>NUL +taskkill /f /im nvcc.exe 2>NUL +taskkill /f /im cicc.exe 2>NUL +taskkill /f /im ptxas.exe 2>NUL +taskkill /f /im test_api_impl.exe 2>NUL +taskkill /f /im op_function_generator.exe 2>NUL wmic process where name="op_function_generator.exe" call terminate 2>NUL +wmic process where name="test_api_impl.exe" call terminate 2>NUL +wmic process where name="cvtres.exe" call terminate 2>NUL +wmic process where name="rc.exe" call terminate 2>NUL +wmic process where name="CL.exe" call terminate 2>NUL +wmic process where name="Lib.exe" call terminate 2>NUL wmic process where name="python.exe" call terminate 2>NUL -taskkill /f /im python.exe 2>NUL echo Windows CI run successfully! 
exit /b 0 diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index d834d1f87a273c..0865d48c0d3432 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -145,6 +145,18 @@ function cmake_base() { else exit 1 fi + elif [ "$1" == "cp39-cp39" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.9" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.9/include/python3.9/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/libpython3.9.dylib" + pip3.9 install --user -r ${PADDLE_ROOT}/python/requirements.txt + else + exit 1 + fi fi else if [ "$1" != "" ]; then @@ -205,6 +217,13 @@ function cmake_base() { -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.8.0/include/python3.8 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.8.0/lib/libpython3.so" pip3.8 install -r ${PADDLE_ROOT}/python/requirements.txt + elif [ "$1" == "cp39-cp39" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.9.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.9.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.9.0/bin/python3.9 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.9.0/include/python3.9 + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.9.0/lib/libpython3.so" + pip3.9 install -r ${PADDLE_ROOT}/python/requirements.txt elif [ "$1" == "conda-python3.7" ]; then export LD_LIBRARY_PATH=/opt/conda/lib/:${LD_LIBRARY_PATH} export PATH=/opt/conda/bin/:${PATH} @@ -227,7 +246,6 @@ function cmake_base() { fi distibuted_flag=${WITH_DISTRIBUTE:-OFF} - 
grpc_flag="OFF" gloo_flag=${distibuted_flag} cat <> ${PADDLE_ROOT}/build/build_summary.txt paddle version # Recovery proxy to avoid failure in later steps - set +x export http_proxy=$my_proxy export https_proxy=$my_proxy if [ "$mactest_error" != 0 ];then show_ut_retry_result fi - set -x fi } function get_precision_ut_mac() { on_precision=0 - set -x UT_list=$(ctest -N | awk -F ': ' '{print $2}' | sed '/^$/d' | sed '$d') precison_cases="" if [ ${PRECISION_TEST:-OFF} == "ON" ]; then python3.7 $PADDLE_ROOT/tools/get_pr_ut.py if [[ -f "ut_list" ]]; then - set +x echo "PREC length: "`wc -l ut_list` precision_cases=`cat ut_list` - set -x fi fi if [ ${PRECISION_TEST:-OFF} == "ON" ] && [[ "$precision_cases" != "" ]];then @@ -803,7 +818,7 @@ function generate_api_spec() { awk -F '(' '{print $NF}' $spec_path >${spec_path}.doc awk -F '(' '{$NF="";print $0}' $spec_path >${spec_path}.api - if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ] || [ "$1" == "cp38-cp38" || [ "$1" == "cp39-cp39" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' $spec_path sed -i "s/\(.*Transpiler.*\).__init__ (ArgSpec(args=\['self'].*/\1.__init__ /g" $spec_path @@ -848,7 +863,7 @@ function check_approvals_of_unittest() { echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. 
\n" echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n" echo "************************************" - exit 1 + exit 6 fi fi fi @@ -1435,6 +1450,7 @@ function parallel_test() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build pip install ${PADDLE_ROOT}/build/python/dist/*whl + cp ${PADDLE_ROOT}/build/python/paddle/fluid/tests/unittests/op_test.py ${PADDLE_ROOT}/build/python if [ "$WITH_GPU" == "ON" ] || [ "$WITH_ROCM" == "ON" ];then parallel_test_base_gpu else @@ -1540,12 +1556,14 @@ EOF ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp39-cp39-linux_x86_64.whl if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_MKL} == "ON" && ${WITH_GPU} == "ON" ]]; then ref_paddle2=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl @@ -1553,11 +1571,13 @@ EOF ref_paddle36=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl 
ref_paddle38=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl + ref_paddle39=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl ref_paddle2_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl ref_paddle35_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl + ref_paddle39_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp39-cp39-linux_x86_64.whl fi #ref_paddle2_mv1="" @@ -1678,6 +1698,22 @@ EOF apt-get clean -y && \ rm -f ${ref_paddle38} && \ ldconfig +EOF + cat >> ${PADDLE_ROOT}/build/Dockerfile < /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.9.0.tgz + RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \ + wget ${ref_web}/${ref_paddle39} && pip3.9 install ${ref_paddle39_whl}; apt-get install -f -y && \ + apt-get clean -y && \ + rm -f ${ref_paddle39} && \ + ldconfig EOF cat >> ${PADDLE_ROOT}/build/Dockerfile < envs; std::vector undefok; -#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_GRPC) && \ - !defined(PADDLE_WITH_PSLIB) +#if defined(PADDLE_WITH_DISTRIBUTE) && !defined(PADDLE_WITH_PSLIB) std::string str_max_body_size; if (::GFLAGS_NAMESPACE::GetCommandLineOption("max_body_size", &str_max_body_size)) { diff --git a/patches/eigen/TensorReductionGpu.h b/patches/eigen/TensorReductionGpu.h new file mode 100644 index 00000000000000..696078e54881af --- 
/dev/null +++ b/patches/eigen/TensorReductionGpu.h @@ -0,0 +1,996 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2014 Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +// clang-format off +#ifndef EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H +#define EIGEN_CXX11_TENSOR_TENSOR_REDUCTION_GPU_H + +namespace Eigen { +namespace internal { + +#if defined(EIGEN_USE_GPU) && defined(EIGEN_GPUCC) +// Full reducers for GPU, don't vectorize for now + +// Reducer function that enables multiple gpu thread to safely accumulate at the same +// output address. It basically reads the current value of the output variable, and +// attempts to update it with the new value. If in the meantime another gpu thread +// updated the content of the output address it will try again. 
+template +__device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) { +#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) + if (sizeof(T) == 4) + { + unsigned int oldval = *reinterpret_cast(output); + unsigned int newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else if (sizeof(T) == 8) { + unsigned long long oldval = *reinterpret_cast(output); + unsigned long long newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + unsigned long long readback; + while ((readback = atomicCAS((unsigned long long*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reduce(accum, reinterpret_cast(&newval)); + if (newval == oldval) { + return; + } + } + } + else { + gpu_assert(0 && "Wordsize not supported"); + } +#else // EIGEN_CUDA_ARCH >= 300 + gpu_assert(0 && "Shouldn't be called on unsupported device"); +#endif // EIGEN_CUDA_ARCH >= 300 +} + +// We extend atomicExch to support extra data types +template +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_GPU_FP16 +template