diff --git a/AUTHORS.md b/AUTHORS.md
index 1eaaff29771436..60f5b424abb7ae 100644
--- a/AUTHORS.md
+++ b/AUTHORS.md
@@ -3,7 +3,7 @@
 | abhinavarora | Abhinav Arora |
 | andreazanetti | Andrea Zanetti |
 | arlesniak | Artur Lesniak |
-| arogowie-intel | Adam Osewski |
+| [arogowie-intel](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Adam Osewski |
 | backyes | Yan-Fei Wang |
 | baiyfbupt | Yi-Fan Bai |
 | beckett1124 | Bin Qi |
@@ -25,8 +25,8 @@
 | hedaoyuan | Dao-Yuan He |
 | helinwang | He-Lin Wang |
 | jacquesqiao | Long-Fei Qiao |
-| jakpiase | Jakub Piasecki |
-| [jczaja](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Jacek Czaja |
+| [jakpiase](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jakub Piasecki |
+| [jczaja](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Jacek Czaja |
 | JiayiFeng | Jia-Yi Feng |
 | kbinias | Krzysztof Binias |
 | kexinzhao | Ke-Xin Zhao |
@@ -47,7 +47,8 @@
 | pakchoi | Chuan-Jiang Song |
 | panyx0718 | Xin Pan |
 | pengli09 | Peng Li |
-| pmajchrzak |Piotr Majchrzak |
+| [piotrekobiIntel](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Piotr Paturej |
+| [pmajchrzak](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Piotr Majchrzak |
 | pkuyym | Ya-Ming Yang |
 | pzelazko-intel | Pawel Zelazko |
 | [pawelpiotrowicz](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg) | Pawel Piotrowicz |
@@ -55,12 +56,13 @@
 | qingqing01 | Qing-Qing Dang |
 | reyoung | Yang Yu |
 | [Sand3r-](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Michal Gallus |
-| [sfraczek](https://raw.githubusercontent.com/jczaja/Paddle/paddle-poland-team/doc/images/paddle_poland_team.jpg)| Sylwester Fraczek |
+| [sfraczek](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Sylwester Fraczek |
 | sneaxiy | Jin-Le Zeng |
 | Superjom | Chun-Wei Yan |
 | tensor-tang | Jian Tang |
 | tianbingsz | Tian-Bing Xu |
 | tpatejko | Tomasz Patejko |
+| [tsocha](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg) | Tomasz Socha |
 | typhoonzero | Yi Wu |
 | velconia | Qi-Yang Min |
 | wanghaoshuang | Hao-Shuang Wang |
@@ -68,7 +70,7 @@
 | wangzhen-nlp | Zhen Wang |
 | wen-bo-yang | Wen-Bo Yang |
 | wojtuss | Wojciech Uss |
-| wozna | Joanna Wozna |
+| [wozna](https://raw.githubusercontent.com/jakpiase/Paddle/new_paddle_intel_authors/img/img.jpg)| Joanna Wozna |
 | wwhu | Wei-Wei Hu |
 | xinghai-sun | Xing-Hai Sun |
 | Xreki | Yi-Qun Liu |
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 98772e96781531..334a6cfcd0ee14 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -214,6 +214,7 @@ option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VER
 option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE})
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
+option(WITH_CINN "Compile PaddlePaddle with CINN" OFF)
 option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
 option(WITH_RCCL "Compile PaddlePaddle with RCCL support" ON)
 option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
diff --git a/cmake/external/ascend.cmake b/cmake/external/ascend.cmake
index 414b2a54be0342..03bc7784e9288d 100644
--- a/cmake/external/ascend.cmake
+++ b/cmake/external/ascend.cmake
@@ -85,5 +85,39 @@ if(WITH_ASCEND_CL)
   ADD_LIBRARY(acl_op_compiler SHARED IMPORTED GLOBAL)
   SET_PROPERTY(TARGET acl_op_compiler PROPERTY IMPORTED_LOCATION ${acl_op_compiler_lib})
   add_custom_target(extern_ascend_cl DEPENDS ascendcl acl_op_compiler)
+endif()
+
+if (WITH_ASCEND_CL)
+macro(find_ascend_toolkit_version ascend_toolkit_version_info)
+  file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
+  string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
+  string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
+  string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
+  add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
+  if(NOT ASCEND_TOOLKIT_VERSION)
+    set(ASCEND_TOOLKIT_VERSION "???")
+  else()
+    message(STATUS "Current Ascend Toolkit version is ${ASCEND_TOOLKIT_VERSION}")
+  endif()
+endmacro()
+
+macro(find_ascend_driver_version ascend_driver_version_info)
+  file(READ ${ascend_driver_version_info} ASCEND_DRIVER_VERSION_CONTENTS)
+  string(REGEX MATCH "Version=([0-9]+\.[0-9]+\.[0-9]+)" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION_CONTENTS}")
+  string(REGEX REPLACE "Version=([0-9]+\.[0-9]+\.[0-9]+)" "\\1" ASCEND_DRIVER_VERSION "${ASCEND_DRIVER_VERSION}")
+  if(NOT ASCEND_DRIVER_VERSION)
+    set(ASCEND_DRIVER_VERSION "???")
+  else()
+    message(STATUS "Current Ascend Driver version is ${ASCEND_DRIVER_VERSION}")
+  endif()
+endmacro()
+
+if (WITH_ARM)
+  set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/arm64-linux)
+else()
+  set(ASCEND_TOOLKIT_DIR ${ASCEND_DIR}/ascend-toolkit/latest/x86_64-linux)
+endif()
+find_ascend_toolkit_version(${ASCEND_TOOLKIT_DIR}/ascend_toolkit_install.info)
+find_ascend_driver_version(${ASCEND_DIR}/driver/version.info)
 endif()
diff --git a/cmake/external/cinn.cmake b/cmake/external/cinn.cmake
new file mode 100644
index 00000000000000..ee5aea9f8b2942
--- /dev/null
+++ b/cmake/external/cinn.cmake
@@ -0,0 +1,82 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if (NOT WITH_CINN)
+  return()
+endif()
+
+# TODO(zhhsplendid): CINN has lots of warnings during early development.
+# They will be treated as errors under paddle. We set no-error now and we will
+# clean the code in the future.
+add_definitions(-w) + +###################################### +# Build CINN from Git External Project +###################################### +include(ExternalProject) +set(CINN_SOURCE_DIR ${THIRD_PARTY_PATH}/CINN) +# TODO(zhhsplendid): Modify git tag after we have release tag +set(CINN_GIT_TAG e422c01b7875301996a2baf67a14ba61b0e6192a) +set(CINN_OPTIONAL_ARGS -DPY_VERSION=${PY_VERSION} -DWITH_CUDA=${WITH_GPU} -DWITH_CUDNN=${WITH_GPU} -DPUBLISH_LIBS=ON -DWITH_TESTING=ON) +set(CINN_BUILD_COMMAND $(MAKE) cinnapi -j) +ExternalProject_Add( + external_cinn + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/CINN.git" + GIT_TAG ${CINN_GIT_TAG} + PREFIX ${CINN_SOURCE_DIR} + BUILD_COMMAND ${CINN_BUILD_COMMAND} + INSTALL_COMMAND "" + CMAKE_ARGS ${CINN_OPTIONAL_ARGS}) + + + +ExternalProject_Get_property(external_cinn BINARY_DIR) +ExternalProject_Get_property(external_cinn SOURCE_DIR) +set(CINN_BINARY_DIR ${BINARY_DIR}) +set(CINN_SOURCE_DIR ${SOURCE_DIR}) + +message(STATUS "CINN BINARY_DIR: ${CINN_BINARY_DIR}") +message(STATUS "CINN SOURCE_DIR: ${CINN_SOURCE_DIR}") + + +###################################### +# Add CINN's dependencies header files +###################################### + +# Add absl +set(ABSL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/absl/include") +include_directories(${ABSL_INCLUDE_DIR}) + +# Add isl +set(ISL_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/isl/include") +include_directories(${ISL_INCLUDE_DIR}) + +# Add LLVM +set(LLVM_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/third_party/llvm/include") +include_directories(${LLVM_INCLUDE_DIR}) + +###################################################### +# Put external_cinn and dependencies together as a lib +###################################################### + +set(CINN_LIB_NAME "libcinnapi.so") +set(CINN_LIB_LOCATION "${CINN_BINARY_DIR}/dist/cinn/lib") +set(CINN_INCLUDE_DIR "${CINN_BINARY_DIR}/dist/cinn/include") + +add_library(cinn SHARED IMPORTED GLOBAL) +set_target_properties(cinn PROPERTIES IMPORTED_LOCATION "${CINN_LIB_LOCATION}/${CINN_LIB_NAME}") +include_directories(${CINN_INCLUDE_DIR}) +add_dependencies(cinn external_cinn) + diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index 87db181d953afb..43ffde75992266 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -18,7 +18,7 @@ set(DLPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/dlpack) set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack) set(DLPACK_REPOSITORY ${GIT_URL}/dmlc/dlpack.git) -set(DLPACK_TAG v0.2) +set(DLPACK_TAG v0.4) cache_third_party(extern_dlpack REPOSITORY ${DLPACK_REPOSITORY} diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index e344ebaa2477ea..097ca38be070ab 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -134,7 +134,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_SOURCES_DIR} UPDATE_COMMAND "" - PATCH_COMMAND sed -i "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?" 
${LITE_SOURCES_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py && sed -i "/general::ssa::ConvertToSSA(cpp_prog)$/d" ${LITE_SOURCES_DIR}/src/extern_lite/lite/model_parser/model_parser.cc + PATCH_COMMAND sed -i "s?NNadapter_bridges_path = os.path.abspath('..')+\"\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?NNadapter_bridges_path = os.path.abspath(\'..\')+\"\/extern_lite\/lite\/kernels\/nnadapter\/bridges\/paddle_use_bridges.h\"?" ${LITE_SOURCES_DIR}/src/extern_lite//lite/tools/cmake_tools/record_supported_kernel_op.py BUILD_COMMAND ${LITE_BUILD_COMMAND} INSTALL_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/utf8proc.cmake b/cmake/external/utf8proc.cmake new file mode 100644 index 00000000000000..a5de5c15c3b510 --- /dev/null +++ b/cmake/external/utf8proc.cmake @@ -0,0 +1,51 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +INCLUDE(ExternalProject) + +SET(UTF8PROC_PREFIX_DIR ${THIRD_PARTY_PATH}/utf8proc) +SET(UTF8PROC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/utf8proc) +# As we add extra features for utf8proc, we use the non-official repo +SET(UTF8PROC_REPOSITORY ${GIT_URL}/JuliaStrings/utf8proc.git) +SET(UTF8PROC_TAG v2.6.1) + +IF(WIN32) + SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/utf8proc_static.lib") + add_definitions(-DUTF8PROC_STATIC) +ELSE(WIN32) + SET(UTF8PROC_LIBRARIES "${UTF8PROC_INSTALL_DIR}/lib/libutf8proc.a") +ENDIF(WIN32) + +INCLUDE_DIRECTORIES(${UTF8PROC_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_utf8proc + ${EXTERNAL_PROJECT_LOG_ARGS} + ${SHALLOW_CLONE} + GIT_REPOSITORY ${UTF8PROC_REPOSITORY} + GIT_TAG ${UTF8PROC_TAG} + PREFIX ${UTF8PROC_PREFIX_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DBUILD_SHARED=ON + -DBUILD_STATIC=ON + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_INSTALL_PREFIX:PATH=${UTF8PROC_INSTALL_DIR} + -DCMAKE_BUILD_TYPE:STRING=${CMAKE_BUILD_TYPE} + BUILD_BYPRODUCTS ${UTF8PROC_LIBRARIES} +) + +ADD_LIBRARY(utf8proc STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET utf8proc PROPERTY IMPORTED_LOCATION ${UTF8PROC_LIBRARIES}) +ADD_DEPENDENCIES(utf8proc extern_utf8proc) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 70bdc67980c038..11a7adbbeb9a81 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -35,7 +35,7 @@ ELSE () ENDIF() SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210921") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20211020") SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index cb2ed614d3d7ca..dfd93f49e73404 100644 --- 
a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -124,6 +124,11 @@ function(copy_part_of_thrid_party TARGET DST) SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES} DSTS ${dst_dir} ${dst_dir}/lib) + set(dst_dir "${DST}/third_party/install/utf8proc") + copy(${TARGET} + SRCS ${UTF8PROC_INSTALL_DIR}/include ${UTF8PROC_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib) + if (WITH_CRYPTO) set(dst_dir "${DST}/third_party/install/cryptopp") copy(${TARGET} @@ -353,7 +358,9 @@ function(version version_file) "WITH_MKL: ${WITH_MKL}\n" "WITH_MKLDNN: ${WITH_MKLDNN}\n" "WITH_GPU: ${WITH_GPU}\n" - "WITH_ROCM: ${WITH_ROCM}\n") + "WITH_ROCM: ${WITH_ROCM}\n" + "WITH_ASCEND_CL: ${WITH_ASCEND_CL}\n" + "WITH_ASCEND_CXX11: ${WITH_ASCEND_CXX11}\n") if(WITH_GPU) file(APPEND ${version_file} "CUDA version: ${CUDA_VERSION}\n" @@ -364,6 +371,11 @@ function(version version_file) "HIP version: ${HIP_VERSION}\n" "MIOpen version: v${MIOPEN_MAJOR_VERSION}.${MIOPEN_MINOR_VERSION}\n") endif() + if(WITH_ASCEND_CL) + file(APPEND ${version_file} + "Ascend Toolkit version: ${ASCEND_TOOLKIT_VERSION}\n" + "Ascend Driver version: ${ASCEND_DRIVER_VERSION}\n") + endif() file(APPEND ${version_file} "CXX compiler version: ${CMAKE_CXX_COMPILER_VERSION}\n") if(TENSORRT_FOUND) file(APPEND ${version_file} diff --git a/cmake/miopen.cmake b/cmake/miopen.cmake index f482f423dc5c12..493c37955f7258 100644 --- a/cmake/miopen.cmake +++ b/cmake/miopen.cmake @@ -15,8 +15,6 @@ find_path(MIOPEN_INCLUDE_DIR "miopen/miopen.h" NO_DEFAULT_PATH ) -get_filename_component(__libpath_hist ${CUDA_CUDART_LIBRARY} PATH) - find_library(MIOPEN_LIBRARY NAMES "libMIOpen.so" PATHS ${MIOPEN_ROOT} ${MIOPEN_ROOT}/lib ${MIOPEN_ROOT}/lib64 ${__libpath_hist} $ENV{MIOPEN_ROOT} $ENV{MIOPEN_ROOT}/lib $ENV{MIOPEN_ROOT}/lib64 diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2c010a1e6297f0..a537719cc75829 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -185,6 +185,8 @@ function(op_library TARGET) list(REMOVE_ITEM hip_srcs "cholesky_op.cu") list(REMOVE_ITEM hip_srcs "matrix_rank_op.cu") list(REMOVE_ITEM hip_srcs "svd_op.cu") + list(REMOVE_ITEM hip_srcs "eigvalsh_op.cu") + list(REMOVE_ITEM hip_srcs "qr_op.cu") list(REMOVE_ITEM hip_srcs "eigh_op.cu") list(REMOVE_ITEM hip_srcs "multinomial_op.cu") list(REMOVE_ITEM hip_srcs "decode_jpeg_op.cu") @@ -214,9 +216,10 @@ function(op_library TARGET) foreach(manual_pybind_op "compare_all_op" "compare_op" "logical_op" "bitwise_op" "nccl_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" -"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" +"sync_batch_norm_op" "sparse_attention_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op" -"fused_bn_add_activation_op") +"fused_bn_add_activation_op" "fused_attention_op" "resnet_unit_op" "fused_feedforward_op") + if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -297,7 +300,7 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() - if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0) + if (WITH_XPU AND ${pybind_flag} EQUAL 0 AND ${xpu_cc_srcs_len} GREATER 0) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n") endif() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 44463f29923b2e..7cdbee1746a8ff 100644 --- 
a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -210,6 +210,10 @@ include(external/threadpool)# download threadpool include(external/dlpack) # download dlpack include(external/xxhash) # download, build, install xxhash include(external/warpctc) # download, build, install warpctc +include(external/utf8proc) # download, build, install utf8proc + +list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) +list(APPEND third_party_deps extern_zlib extern_dlpack extern_warpctc extern_threadpool extern_utf8proc) include(external/lapack) # download, build, install lapack list(APPEND third_party_deps extern_eigen3 extern_gflags extern_glog extern_boost extern_xxhash) @@ -251,8 +255,8 @@ if(WITH_GPU) include(external/cub) # download cub list(APPEND third_party_deps extern_cub) endif() - set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg.tar.gz" CACHE STRING "" FORCE) - file_download_and_uncompress(${URL} "externalError" MD5 c0749523ebb536eb7382487d645d9cd4) # download file externalErrorMsg.tar.gz + set(URL "https://paddlepaddledeps.bj.bcebos.com/externalErrorMsg_20210928.tar.gz" CACHE STRING "" FORCE) + file_download_and_uncompress(${URL} "externalError" MD5 a712a49384e77ca216ad866712f7cafa) # download file externalErrorMsg.tar.gz if(WITH_TESTING) # copy externalErrorMsg.pb, just for unittest can get error message correctly. set(SRC_DIR ${THIRD_PARTY_PATH}/externalError/data) @@ -356,6 +360,12 @@ if (WITH_LITE) include(external/lite) endif (WITH_LITE) +if (WITH_CINN) + message(STATUS "Compile Paddle with CINN.") + include(external/cinn) + add_definitions(-DPADDLE_WITH_CINN) +endif (WITH_CINN) + if (WITH_CRYPTO) include(external/cryptopp) # download, build, install cryptopp list(APPEND third_party_deps extern_cryptopp) diff --git a/log b/log deleted file mode 100644 index c02e10686b5fbc..00000000000000 Binary files a/log and /dev/null differ diff --git a/paddle/fluid/distributed/common/local_random.h b/paddle/fluid/distributed/common/local_random.h new file mode 100644 index 00000000000000..96b8d2d21a5605 --- /dev/null +++ b/paddle/fluid/distributed/common/local_random.h @@ -0,0 +1,65 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include + +namespace paddle { +namespace distributed { + +// Get time in seconds. 
+inline double current_realtime() { + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; +} + +inline std::default_random_engine& local_random_engine() { + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); // NOLINT + std::seed_seq sseq = { + x++, x++, x++, (unsigned long)(current_realtime() * 1000)}; // NOLINT + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; +} + +template +std::uniform_real_distribution& local_uniform_real_distribution() { + thread_local std::uniform_real_distribution distr; + assert(distr.a() == 0.0 && distr.b() == 1.0); + return distr; +} + +template +T uniform_real() { + return local_uniform_real_distribution()(local_random_engine()); +} + +template +T uniform_real(T a, T b) { + if (a == b) { + return a; + } + return (T)(a + uniform_real() * (b - a)); +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/ps.proto b/paddle/fluid/distributed/ps.proto index 862ae4a504d9b4..4483f960eb1371 100644 --- a/paddle/fluid/distributed/ps.proto +++ b/paddle/fluid/distributed/ps.proto @@ -122,7 +122,36 @@ message TableAccessorParameter { optional uint32 fea_dim = 4 [ default = 11 ]; optional uint32 embedx_dim = 5 [ default = 8 ]; optional uint32 embedx_threshold = 6 [ default = 10 ]; + optional CtrAccessorParameter ctr_accessor_param = 7; repeated TableAccessorSaveParameter table_accessor_save_param = 8; + optional SparseCommonSGDRuleParameter embed_sgd_param = 10; + optional SparseCommonSGDRuleParameter embedx_sgd_param = 11; +} + +message CtrAccessorParameter { + optional float nonclk_coeff = 1 + [ default = 0.1 ]; // to calculate show_click_score + optional float click_coeff = 2 + [ default = 1 ]; // to calculate show_click_score + optional float base_threshold = 3 [ + default = 1.5 + ]; // show_click_score > base_threshold, this feature can be saved + optional float delta_threshold = 4 + [ default = + 0.25 ]; // delta_score > delta_threshold, this feature can be saved + optional float delta_keep_days = 5 + [ default = + 16 ]; // unseen_day < delta_keep_days, this feature can be saved + optional float show_click_decay_rate = 6 [ + default = 0.98 + ]; // show/click will update to show/click * show_click_decay_rate after a day + optional float delete_threshold = 7 + [ default = 0.8 ]; // threshold to shrink a feasign + optional float delete_after_unseen_days = 8 + [ default = 30 ]; // unseen_day > delete_after_unseen_days, this feature + // will be delete in shrink_model + optional int32 ssd_unseenday_threshold = 9 + [ default = 1 ]; // threshold to save ssd } message TensorAccessorParameter { @@ -150,3 +179,33 @@ message TableAccessorSaveParameter { optional string converter = 2; optional string deconverter = 3; } + +message SparseCommonSGDRuleParameter { + optional string name = 1; + optional SparseNaiveSGDRuleParameter naive = 2; + optional SparseAdagradSGDRuleParameter adagrad = 3; + optional SparseAdamSGDParameter adam = 4; +} + +message SparseNaiveSGDRuleParameter { // SparseNaiveSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + repeated float weight_bounds = 3; +} + +message + SparseAdagradSGDRuleParameter { // SparseAdaGradSGDRule|StdAdaGradSGDRule + optional double learning_rate = 1 [ default = 0.05 ]; + optional double initial_g2sum = 2 [ default = 3.0 ]; + optional double initial_range = 3 [ default = 0.0001 ]; + 
repeated float weight_bounds = 4; +} + +message SparseAdamSGDParameter { // SparseAdamSGDRule + optional double learning_rate = 1 [ default = 0.001 ]; + optional double initial_range = 2 [ default = 0.0001 ]; + optional double beta1_decay_rate = 3 [ default = 0.9 ]; + optional double beta2_decay_rate = 4 [ default = 0.999 ]; + optional double ada_epsilon = 5 [ default = 1e-08 ]; + repeated float weight_bounds = 6; +} diff --git a/paddle/fluid/distributed/service/brpc_utils.cc b/paddle/fluid/distributed/service/brpc_utils.cc index a356b77e73733e..92dcde99cccb0b 100644 --- a/paddle/fluid/distributed/service/brpc_utils.cc +++ b/paddle/fluid/distributed/service/brpc_utils.cc @@ -138,23 +138,11 @@ void SerializeSelectedRows(framework::Variable* var, var_data->clear(); var_data->resize(rows->size() * sizeof(int64_t)); char* data_ptr = const_cast(var_data->data()); - - if (platform::is_cpu_place(tensor->place())) { - memcpy(data_ptr, &(*rows)[0], rows->size() * sizeof(int64_t)); - } else { -#ifdef PADDLE_WITH_CUDA - auto stream = - reinterpret_cast(ctx).stream(); - memory::Copy(platform::CPUPlace(), data_ptr, - BOOST_GET_CONST(platform::CUDAPlace, tensor->place()), - &(*rows)[0], rows->size() * sizeof(int64_t), stream); -#endif - } + memcpy(data_ptr, &((*rows)[0]), rows->size() * sizeof(int64_t)); var_msg->set_data_type(static_cast(tensor->type())); for (auto& dim : framework::vectorize(tensor->dims())) { var_msg->add_dims(dim); } - // IO Buffer if (platform::is_cpu_place(tensor->place())) { auto data_len = tensor->numel() * framework::SizeOfType(tensor->type()); @@ -273,8 +261,8 @@ void DeserializeSelectedRows(framework::Variable* var, const VarMsg& msg, auto* slr = var->GetMutable(); framework::Tensor* tensor = slr->mutable_value(); slr->set_height(msg.slr_height()); - std::vector tmp_rows(msg.slr_height()); - memcpy(&tmp_rows[0], msg.data().data(), msg.slr_height() * sizeof(int64_t)); + std::vector tmp_rows(msg.dims()[0]); + memcpy(tmp_rows.data(), msg.data().data(), msg.dims()[0] * sizeof(int64_t)); slr->set_rows(tmp_rows); std::vector vec_dim; for (auto& x : msg.dims()) { diff --git a/paddle/fluid/distributed/service/graph_brpc_client.cc b/paddle/fluid/distributed/service/graph_brpc_client.cc index 68d9c9669b6972..9f65a66708def0 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.cc +++ b/paddle/fluid/distributed/service/graph_brpc_client.cc @@ -304,7 +304,63 @@ std::future GraphBrpcClient::remove_graph_node( // char* &buffer,int &actual_size std::future GraphBrpcClient::batch_sample_neighboors( uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>> &res) { + std::vector>> &res, + int server_index) { + if (server_index != -1) { + res.resize(node_ids.size()); + DownpourBrpcClosure *closure = new DownpourBrpcClosure(1, [&](void *done) { + int ret = 0; + auto *closure = (DownpourBrpcClosure *)done; + if (closure->check_response(0, PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER) != + 0) { + ret = -1; + } else { + auto &res_io_buffer = closure->cntl(0)->response_attachment(); + butil::IOBufBytesIterator io_buffer_itr(res_io_buffer); + size_t bytes_size = io_buffer_itr.bytes_left(); + std::unique_ptr buffer_wrapper(new char[bytes_size]); + char *buffer = buffer_wrapper.get(); + io_buffer_itr.copy_and_forward((void *)(buffer), bytes_size); + + size_t node_num = *(size_t *)buffer; + int *actual_sizes = (int *)(buffer + sizeof(size_t)); + char *node_buffer = buffer + sizeof(size_t) + sizeof(int) * node_num; + + int offset = 0; + for (size_t node_idx = 0; node_idx < node_num; 
++node_idx) { + int actual_size = actual_sizes[node_idx]; + int start = 0; + while (start < actual_size) { + res[node_idx].push_back( + {*(uint64_t *)(node_buffer + offset + start), + *(float *)(node_buffer + offset + start + + GraphNode::id_size)}); + start += GraphNode::id_size + GraphNode::weight_size; + } + offset += actual_size; + } + } + closure->set_promise_value(ret); + }); + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + ; + closure->request(0)->set_cmd_id(PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER); + closure->request(0)->set_table_id(table_id); + closure->request(0)->set_client_id(_client_id); + closure->request(0)->add_params((char *)node_ids.data(), + sizeof(uint64_t) * node_ids.size()); + closure->request(0)->add_params((char *)&sample_size, sizeof(int)); + ; + // PsService_Stub rpc_stub(get_cmd_channel(server_index)); + GraphPsService_Stub rpc_stub = + getServiceStub(get_cmd_channel(server_index)); + closure->cntl(0)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(0), closure->request(0), + closure->response(0), closure); + return fut; + } std::vector request2server; std::vector server2request(server_size, -1); res.clear(); diff --git a/paddle/fluid/distributed/service/graph_brpc_client.h b/paddle/fluid/distributed/service/graph_brpc_client.h index 8acb2047b8e972..1fbb3fa9b0550e 100644 --- a/paddle/fluid/distributed/service/graph_brpc_client.h +++ b/paddle/fluid/distributed/service/graph_brpc_client.h @@ -64,7 +64,8 @@ class GraphBrpcClient : public BrpcPsClient { // given a batch of nodes, sample graph_neighboors for each of them virtual std::future batch_sample_neighboors( uint32_t table_id, std::vector node_ids, int sample_size, - std::vector>>& res); + std::vector>>& res, + int server_index = -1); virtual std::future pull_graph_list(uint32_t table_id, int server_index, int start, diff --git a/paddle/fluid/distributed/service/graph_brpc_server.cc b/paddle/fluid/distributed/service/graph_brpc_server.cc index 110d4406fc5569..b404082f7c4102 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.cc +++ b/paddle/fluid/distributed/service/graph_brpc_server.cc @@ -61,6 +61,10 @@ int32_t GraphBrpcServer::initialize() { return 0; } +brpc::Channel *GraphBrpcServer::get_cmd_channel(size_t server_index) { + return _pserver_channels[server_index].get(); +} + uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { std::unique_lock lock(mutex_); @@ -80,6 +84,42 @@ uint64_t GraphBrpcServer::start(const std::string &ip, uint32_t port) { return 0; } +int32_t GraphBrpcServer::build_peer2peer_connection(int rank) { + this->rank = rank; + auto _env = environment(); + brpc::ChannelOptions options; + options.protocol = "baidu_std"; + options.timeout_ms = 500000; + options.connection_type = "pooled"; + options.connect_timeout_ms = 10000; + options.max_retry = 3; + + std::vector server_list = _env->get_ps_servers(); + _pserver_channels.resize(server_list.size()); + std::ostringstream os; + std::string server_ip_port; + for (size_t i = 0; i < server_list.size(); ++i) { + server_ip_port.assign(server_list[i].ip.c_str()); + server_ip_port.append(":"); + server_ip_port.append(std::to_string(server_list[i].port)); + _pserver_channels[i].reset(new brpc::Channel()); + if (_pserver_channels[i]->Init(server_ip_port.c_str(), "", &options) != 0) { + VLOG(0) << "GraphServer connect to Server:" << server_ip_port + << " Failed! 
Try again."; + std::string int_ip_port = + GetIntTypeEndpoint(server_list[i].ip, server_list[i].port); + if (_pserver_channels[i]->Init(int_ip_port.c_str(), "", &options) != 0) { + LOG(ERROR) << "GraphServer connect to Server:" << int_ip_port + << " Failed!"; + return -1; + } + } + os << server_ip_port << ","; + } + LOG(INFO) << "servers peer2peer connection success:" << os.str(); + return 0; +} + int32_t GraphBrpcService::clear_nodes(Table *table, const PsRequestMessage &request, PsResponseMessage &response, @@ -160,6 +200,9 @@ int32_t GraphBrpcService::initialize() { &GraphBrpcService::remove_graph_node; _service_handler_map[PS_GRAPH_SET_NODE_FEAT] = &GraphBrpcService::graph_set_node_feat; + _service_handler_map[PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER] = + &GraphBrpcService::sample_neighboors_across_multi_servers; + // shard初始化,server启动后才可从env获取到server_list的shard信息 initialize_shard_info(); @@ -172,10 +215,10 @@ int32_t GraphBrpcService::initialize_shard_info() { if (_is_initialize_shard_info) { return 0; } - size_t shard_num = _server->environment()->get_ps_servers().size(); + server_size = _server->environment()->get_ps_servers().size(); auto &table_map = *(_server->table()); for (auto itr : table_map) { - itr.second->set_shard(_rank, shard_num); + itr.second->set_shard(_rank, server_size); } _is_initialize_shard_info = true; } @@ -209,7 +252,9 @@ void GraphBrpcService::service(google::protobuf::RpcController *cntl_base, int service_ret = (this->*handler_func)(table, *request, *response, cntl); if (service_ret != 0) { response->set_err_code(service_ret); - response->set_err_msg("server internal error"); + if (!response->has_err_msg()) { + response->set_err_msg("server internal error"); + } } } @@ -403,7 +448,156 @@ int32_t GraphBrpcService::graph_get_node_feat(Table *table, return 0; } - +int32_t GraphBrpcService::sample_neighboors_across_multi_servers( + Table *table, const PsRequestMessage &request, PsResponseMessage &response, + brpc::Controller *cntl) { + // sleep(5); + CHECK_TABLE_EXIST(table, request, response) + if (request.params_size() < 2) { + set_response_code( + response, -1, + "graph_random_sample request requires at least 2 arguments"); + return 0; + } + size_t node_num = request.params(0).size() / sizeof(uint64_t), + size_of_size_t = sizeof(size_t); + uint64_t *node_data = (uint64_t *)(request.params(0).c_str()); + int sample_size = *(uint64_t *)(request.params(1).c_str()); + // std::vector res = ((GraphTable + // *)table).filter_out_non_exist_nodes(node_data, sample_size); + std::vector request2server; + std::vector server2request(server_size, -1); + std::vector local_id; + std::vector local_query_idx; + size_t rank = get_rank(); + for (int query_idx = 0; query_idx < node_num; ++query_idx) { + int server_index = + ((GraphTable *)table)->get_server_index_by_id(node_data[query_idx]); + if (server2request[server_index] == -1) { + server2request[server_index] = request2server.size(); + request2server.push_back(server_index); + } + } + if (server2request[rank] != -1) { + auto pos = server2request[rank]; + std::swap(request2server[pos], + request2server[(int)request2server.size() - 1]); + server2request[request2server[pos]] = pos; + server2request[request2server[(int)request2server.size() - 1]] = + request2server.size() - 1; + } + size_t request_call_num = request2server.size(); + std::vector> local_buffers; + std::vector local_actual_sizes; + std::vector seq; + std::vector> node_id_buckets(request_call_num); + std::vector> query_idx_buckets(request_call_num); + for (int 
query_idx = 0; query_idx < node_num; ++query_idx) { + int server_index = + ((GraphTable *)table)->get_server_index_by_id(node_data[query_idx]); + int request_idx = server2request[server_index]; + node_id_buckets[request_idx].push_back(node_data[query_idx]); + query_idx_buckets[request_idx].push_back(query_idx); + seq.push_back(request_idx); + } + size_t remote_call_num = request_call_num; + if (request2server.size() != 0 && request2server.back() == rank) { + remote_call_num--; + local_buffers.resize(node_id_buckets.back().size()); + local_actual_sizes.resize(node_id_buckets.back().size()); + } + cntl->response_attachment().append(&node_num, sizeof(size_t)); + auto local_promise = std::make_shared>(); + std::future local_fut = local_promise->get_future(); + std::vector failed(server_size, false); + std::function func = [&, node_id_buckets, query_idx_buckets, + request_call_num](void *done) { + local_fut.get(); + std::vector actual_size; + auto *closure = (DownpourBrpcClosure *)done; + std::vector> res( + remote_call_num); + size_t fail_num = 0; + for (size_t request_idx = 0; request_idx < remote_call_num; ++request_idx) { + if (closure->check_response(request_idx, PS_GRAPH_SAMPLE_NEIGHBOORS) != + 0) { + ++fail_num; + failed[request2server[request_idx]] = true; + } else { + auto &res_io_buffer = closure->cntl(request_idx)->response_attachment(); + size_t node_size; + res[request_idx].reset(new butil::IOBufBytesIterator(res_io_buffer)); + size_t num; + res[request_idx]->copy_and_forward(&num, sizeof(size_t)); + } + } + int size; + int local_index = 0; + for (size_t i = 0; i < node_num; i++) { + if (fail_num > 0 && failed[seq[i]]) { + size = 0; + } else if (request2server[seq[i]] != rank) { + res[seq[i]]->copy_and_forward(&size, sizeof(int)); + } else { + size = local_actual_sizes[local_index++]; + } + actual_size.push_back(size); + } + cntl->response_attachment().append(actual_size.data(), + actual_size.size() * sizeof(int)); + + local_index = 0; + for (size_t i = 0; i < node_num; i++) { + if (fail_num > 0 && failed[seq[i]]) { + continue; + } else if (request2server[seq[i]] != rank) { + char temp[actual_size[i] + 1]; + res[seq[i]]->copy_and_forward(temp, actual_size[i]); + cntl->response_attachment().append(temp, actual_size[i]); + } else { + char *temp = local_buffers[local_index++].get(); + cntl->response_attachment().append(temp, actual_size[i]); + } + } + closure->set_promise_value(0); + }; + + DownpourBrpcClosure *closure = new DownpourBrpcClosure(remote_call_num, func); + + auto promise = std::make_shared>(); + closure->add_promise(promise); + std::future fut = promise->get_future(); + + for (int request_idx = 0; request_idx < remote_call_num; ++request_idx) { + int server_index = request2server[request_idx]; + closure->request(request_idx)->set_cmd_id(PS_GRAPH_SAMPLE_NEIGHBOORS); + closure->request(request_idx)->set_table_id(request.table_id()); + closure->request(request_idx)->set_client_id(rank); + size_t node_num = node_id_buckets[request_idx].size(); + + closure->request(request_idx) + ->add_params((char *)node_id_buckets[request_idx].data(), + sizeof(uint64_t) * node_num); + closure->request(request_idx) + ->add_params((char *)&sample_size, sizeof(int)); + PsService_Stub rpc_stub( + ((GraphBrpcServer *)get_server())->get_cmd_channel(server_index)); + // GraphPsService_Stub rpc_stub = + // getServiceStub(get_cmd_channel(server_index)); + closure->cntl(request_idx)->set_log_id(butil::gettimeofday_ms()); + rpc_stub.service(closure->cntl(request_idx), closure->request(request_idx), + 
closure->response(request_idx), closure); + } + if (server2request[rank] != -1) { + ((GraphTable *)table) + ->random_sample_neighboors(node_id_buckets.back().data(), sample_size, + local_buffers, local_actual_sizes); + } + local_promise.get()->set_value(0); + if (remote_call_num == 0) func(closure); + fut.get(); + return 0; +} int32_t GraphBrpcService::graph_set_node_feat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, @@ -412,7 +606,7 @@ int32_t GraphBrpcService::graph_set_node_feat(Table *table, if (request.params_size() < 3) { set_response_code( response, -1, - "graph_set_node_feat request requires at least 2 arguments"); + "graph_set_node_feat request requires at least 3 arguments"); return 0; } size_t node_num = request.params(0).size() / sizeof(uint64_t); diff --git a/paddle/fluid/distributed/service/graph_brpc_server.h b/paddle/fluid/distributed/service/graph_brpc_server.h index 6b4853fa679923..817fe08331165d 100644 --- a/paddle/fluid/distributed/service/graph_brpc_server.h +++ b/paddle/fluid/distributed/service/graph_brpc_server.h @@ -32,6 +32,8 @@ class GraphBrpcServer : public PSServer { virtual ~GraphBrpcServer() {} PsBaseService *get_service() { return _service.get(); } virtual uint64_t start(const std::string &ip, uint32_t port); + virtual int32_t build_peer2peer_connection(int rank); + virtual brpc::Channel *get_cmd_channel(size_t server_index); virtual int32_t stop() { std::unique_lock lock(mutex_); if (stoped_) return 0; @@ -50,6 +52,7 @@ class GraphBrpcServer : public PSServer { mutable std::mutex mutex_; std::condition_variable cv_; bool stoped_ = false; + int rank; brpc::Server _server; std::shared_ptr _service; std::vector> _pserver_channels; @@ -113,12 +116,18 @@ class GraphBrpcService : public PsBaseService { int32_t print_table_stat(Table *table, const PsRequestMessage &request, PsResponseMessage &response, brpc::Controller *cntl); + int32_t sample_neighboors_across_multi_servers( + Table *table, const PsRequestMessage &request, + PsResponseMessage &response, brpc::Controller *cntl); + private: bool _is_initialize_shard_info; std::mutex _initialize_shard_mutex; std::unordered_map _msg_handler_map; std::vector _ori_values; const int sample_nodes_ranges = 23; + size_t server_size; + std::shared_ptr<::ThreadPool> task_pool; }; } // namespace distributed diff --git a/paddle/fluid/distributed/service/graph_py_service.cc b/paddle/fluid/distributed/service/graph_py_service.cc index b4159627013174..498805136417f2 100644 --- a/paddle/fluid/distributed/service/graph_py_service.cc +++ b/paddle/fluid/distributed/service/graph_py_service.cc @@ -107,6 +107,7 @@ void GraphPyServer::start_server(bool block) { empty_vec.push_back(empty_prog); pserver_ptr->configure(server_proto, _ps_env, rank, empty_vec); pserver_ptr->start(ip, port); + pserver_ptr->build_peer2peer_connection(rank); std::condition_variable* cv_ = pserver_ptr->export_cv(); if (block) { std::mutex mutex_; diff --git a/paddle/fluid/distributed/service/sendrecv.proto b/paddle/fluid/distributed/service/sendrecv.proto index 696c950d9b33ba..42e25258ec3fe1 100644 --- a/paddle/fluid/distributed/service/sendrecv.proto +++ b/paddle/fluid/distributed/service/sendrecv.proto @@ -56,6 +56,7 @@ enum PsCmdID { PS_GRAPH_ADD_GRAPH_NODE = 35; PS_GRAPH_REMOVE_GRAPH_NODE = 36; PS_GRAPH_SET_NODE_FEAT = 37; + PS_GRAPH_SAMPLE_NODES_FROM_ONE_SERVER = 38; } message PsRequestMessage { diff --git a/paddle/fluid/distributed/service/server.h b/paddle/fluid/distributed/service/server.h index 
89b089386f5018..dffe19545ce52b 100644 --- a/paddle/fluid/distributed/service/server.h +++ b/paddle/fluid/distributed/service/server.h @@ -147,7 +147,7 @@ class PsBaseService : public PsService { public: PsBaseService() : _rank(0), _server(NULL), _config(NULL) {} virtual ~PsBaseService() {} - + virtual size_t get_rank() { return _rank; } virtual int32_t configure(PSServer *server) { _server = server; _rank = _server->rank(); @@ -167,6 +167,7 @@ class PsBaseService : public PsService { } virtual int32_t initialize() = 0; + PSServer *get_server() { return _server; } protected: size_t _rank; diff --git a/paddle/fluid/distributed/table/CMakeLists.txt b/paddle/fluid/distributed/table/CMakeLists.txt index c928ebe90ceb9e..7ec7041b63ba1f 100644 --- a/paddle/fluid/distributed/table/CMakeLists.txt +++ b/paddle/fluid/distributed/table/CMakeLists.txt @@ -35,4 +35,9 @@ cc_library(tensor_accessor SRCS tensor_accessor.cc DEPS ${TABLE_DEPS} eigen3 ps_ cc_library(tensor_table SRCS tensor_table.cc DEPS eigen3 ps_framework_proto executor scope device_context tensor ${TABLE_DEPS}) set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) -cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost) +set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto) +cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule) + +cc_library(table SRCS table.cc DEPS common_table tensor_accessor tensor_table ps_framework_proto string_helper device_context gflags glog boost ctr_accessor) diff --git a/paddle/fluid/distributed/table/common_graph_table.cc b/paddle/fluid/distributed/table/common_graph_table.cc index 41f4b0dac4d96e..2c20e79b3b2d34 100644 --- a/paddle/fluid/distributed/table/common_graph_table.cc +++ b/paddle/fluid/distributed/table/common_graph_table.cc @@ -305,12 +305,12 @@ Node *GraphTable::find_node(uint64_t id) { return node; } uint32_t GraphTable::get_thread_pool_index(uint64_t node_id) { - return node_id % shard_num % shard_num_per_table % task_pool_size_; + return node_id % shard_num % shard_num_per_server % task_pool_size_; } uint32_t GraphTable::get_thread_pool_index_by_shard_index( uint64_t shard_index) { - return shard_index % shard_num_per_table % task_pool_size_; + return shard_index % shard_num_per_server % task_pool_size_; } int32_t GraphTable::clear_nodes() { @@ -575,6 +575,11 @@ int32_t GraphTable::pull_graph_list(int start, int total_size, actual_size = size; return 0; } + +int32_t GraphTable::get_server_index_by_id(uint64_t id) { + return id % shard_num / shard_num_per_server; +} + int32_t GraphTable::initialize() { _shards_task_pool.resize(task_pool_size_); for (size_t i = 0; i < _shards_task_pool.size(); ++i) { @@ -611,13 +616,12 @@ int32_t GraphTable::initialize() { shard_num = _config.shard_num(); VLOG(0) << "in init graph table shard num = " << shard_num << " shard_idx" << _shard_idx; - shard_num_per_table = sparse_local_shard_num(shard_num, server_num); - shard_start = _shard_idx * shard_num_per_table; - shard_end = shard_start + shard_num_per_table; + shard_num_per_server = sparse_local_shard_num(shard_num, server_num); + shard_start = _shard_idx * shard_num_per_server; + shard_end = shard_start + 
shard_num_per_server; VLOG(0) << "in init graph table shard idx = " << _shard_idx << " shard_start " << shard_start << " shard_end " << shard_end; - // shards.resize(shard_num_per_table); - shards = std::vector(shard_num_per_table, GraphShard(shard_num)); + shards = std::vector(shard_num_per_server, GraphShard(shard_num)); return 0; } } // namespace distributed diff --git a/paddle/fluid/distributed/table/common_graph_table.h b/paddle/fluid/distributed/table/common_graph_table.h index f643337a80f7c2..d681262c664807 100644 --- a/paddle/fluid/distributed/table/common_graph_table.h +++ b/paddle/fluid/distributed/table/common_graph_table.h @@ -94,6 +94,7 @@ class GraphTable : public SparseTable { int32_t remove_graph_node(std::vector &id_list); + int32_t get_server_index_by_id(uint64_t id); Node *find_node(uint64_t id); virtual int32_t pull_sparse(float *values, @@ -128,9 +129,11 @@ class GraphTable : public SparseTable { const std::vector &feature_names, const std::vector> &res); + size_t get_server_num() { return server_num; } + protected: std::vector shards; - size_t shard_start, shard_end, server_num, shard_num_per_table, shard_num; + size_t shard_start, shard_end, server_num, shard_num_per_server, shard_num; const int task_pool_size_ = 24; const int random_sample_nodes_ranges = 3; diff --git a/paddle/fluid/distributed/table/ctr_accessor.cc b/paddle/fluid/distributed/table/ctr_accessor.cc new file mode 100644 index 00000000000000..1ef8c9e152733f --- /dev/null +++ b/paddle/fluid/distributed/table/ctr_accessor.cc @@ -0,0 +1,329 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include +#include "glog/logging.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace distributed { + +int CtrCommonAccessor::initialize() { + auto name = _config.embed_sgd_param().name(); + _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); + _embed_sgd_rule->load_config(_config.embed_sgd_param(), 1); + + name = _config.embedx_sgd_param().name(); + _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name); + _embedx_sgd_rule->load_config(_config.embedx_sgd_param(), + _config.embedx_dim()); + + common_feature_value.embed_sgd_dim = _embed_sgd_rule->dim(); + common_feature_value.embedx_dim = _config.embedx_dim(); + common_feature_value.embedx_sgd_dim = _embedx_sgd_rule->dim(); + _show_click_decay_rate = _config.ctr_accessor_param().show_click_decay_rate(); + + return 0; +} + +size_t CtrCommonAccessor::dim() { return common_feature_value.dim(); } + +size_t CtrCommonAccessor::dim_size(size_t dim) { + auto embedx_dim = _config.embedx_dim(); + return common_feature_value.dim_size(dim, embedx_dim); +} + +size_t CtrCommonAccessor::size() { return common_feature_value.size(); } + +size_t CtrCommonAccessor::mf_size() { + return (_config.embedx_dim() + common_feature_value.embedx_sgd_dim) * + sizeof(float); // embedx embedx_g2sum +} + +// pull value +size_t CtrCommonAccessor::select_dim() { + auto embedx_dim = _config.embedx_dim(); + return 1 + embedx_dim; +} + +size_t CtrCommonAccessor::select_dim_size(size_t dim) { return sizeof(float); } + +size_t CtrCommonAccessor::select_size() { return select_dim() * sizeof(float); } + +// push value +size_t CtrCommonAccessor::update_dim() { + auto embedx_dim = _config.embedx_dim(); + return 4 + embedx_dim; +} + +size_t CtrCommonAccessor::update_dim_size(size_t dim) { return sizeof(float); } + +size_t CtrCommonAccessor::update_size() { return update_dim() * sizeof(float); } + +bool CtrCommonAccessor::shrink(float* value) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delete_after_unseen_days = + _config.ctr_accessor_param().delete_after_unseen_days(); + auto delete_threshold = _config.ctr_accessor_param().delete_threshold(); + + // time_decay first + common_feature_value.show(value) *= _show_click_decay_rate; + common_feature_value.click(value) *= _show_click_decay_rate; + + // shrink after + auto score = show_click_score(common_feature_value.show(value), + common_feature_value.click(value)); + auto unseen_days = common_feature_value.unseen_days(value); + if (score < delete_threshold || unseen_days > delete_after_unseen_days) { + return true; + } + return false; +} + +bool CtrCommonAccessor::save(float* value, int param) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (param == 2) { + delta_threshold = 0; + } + switch (param) { + // save all + case 0: { + return true; + } + // save xbox delta + case 1: + // save xbox base + case 2: { + if (show_click_score(common_feature_value.show(value), + common_feature_value.click(value)) >= + base_threshold && + common_feature_value.delta_score(value) >= delta_threshold && + common_feature_value.unseen_days(value) <= delta_keep_days) { + // do this after save, because it must not be modified when retry + if (param == 2) { + 
common_feature_value.delta_score(value) = 0; + } + return true; + } else { + return false; + } + } + // already decayed in shrink + case 3: { + // do this after save, because it must not be modified when retry + // common_feature_value.unseen_days(value)++; + return true; + } + // save revert batch_model + case 5: { + return true; + } + default: + return true; + } +} + +void CtrCommonAccessor::update_stat_after_save(float* value, int param) { + auto base_threshold = _config.ctr_accessor_param().base_threshold(); + auto delta_threshold = _config.ctr_accessor_param().delta_threshold(); + auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days(); + if (param == 2) { + delta_threshold = 0; + } + switch (param) { + case 1: { + if (show_click_score(common_feature_value.show(value), + common_feature_value.click(value)) >= + base_threshold && + common_feature_value.delta_score(value) >= delta_threshold && + common_feature_value.unseen_days(value) <= delta_keep_days) { + common_feature_value.delta_score(value) = 0; + } + } + return; + case 3: { + common_feature_value.unseen_days(value)++; + } + return; + default: + return; + } +} + +int32_t CtrCommonAccessor::create(float** values, size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* value = values[value_item]; + value[common_feature_value.unseen_days_index()] = 0; + value[common_feature_value.delta_score_index()] = 0; + value[common_feature_value.show_index()] = 0; + value[common_feature_value.click_index()] = 0; + value[common_feature_value.slot_index()] = -1; + _embed_sgd_rule->init_value( + value + common_feature_value.embed_w_index(), + value + common_feature_value.embed_g2sum_index()); + _embedx_sgd_rule->init_value( + value + common_feature_value.embedx_w_index(), + value + common_feature_value.embedx_g2sum_index(), false); + } + return 0; +} + +bool CtrCommonAccessor::need_extend_mf(float* value) { + float show = value[common_feature_value.show_index()]; + float click = value[common_feature_value.click_index()]; + float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() + + click * _config.ctr_accessor_param().click_coeff(); + return score >= _config.embedx_threshold(); +} + +bool CtrCommonAccessor::has_mf(size_t size) { + return size > common_feature_value.embedx_g2sum_index(); +} + +// from CommonFeatureValue to CtrCommonPullValue +int32_t CtrCommonAccessor::select(float** select_values, const float** values, + size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* select_value = select_values[value_item]; + const float* value = values[value_item]; + select_value[CtrCommonPullValue::embed_w_index()] = + value[common_feature_value.embed_w_index()]; + memcpy(select_value + CtrCommonPullValue::embedx_w_index(), + value + common_feature_value.embedx_w_index(), + embedx_dim * sizeof(float)); + } + return 0; +} + +// from CtrCommonPushValue to CtrCommonPushValue +// first dim: item +// second dim: field num +int32_t CtrCommonAccessor::merge(float** update_values, + const float** other_update_values, + size_t num) { + auto embedx_dim = _config.embedx_dim(); + size_t total_dim = CtrCommonPushValue::dim(embedx_dim); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* update_value = update_values[value_item]; + const float* other_update_value = other_update_values[value_item]; + for (auto i = 0u; i < total_dim; ++i) { + if (i != 
CtrCommonPushValue::slot_index()) { + update_value[i] += other_update_value[i]; + } + } + } + return 0; +} + +// from CtrCommonPushValue to CommonFeatureValue +// first dim: item +// second dim: field num +int32_t CtrCommonAccessor::update(float** update_values, + const float** push_values, size_t num) { + auto embedx_dim = _config.embedx_dim(); + for (size_t value_item = 0; value_item < num; ++value_item) { + float* update_value = update_values[value_item]; + const float* push_value = push_values[value_item]; + float push_show = push_value[CtrCommonPushValue::show_index()]; + float push_click = push_value[CtrCommonPushValue::click_index()]; + float slot = push_value[CtrCommonPushValue::slot_index()]; + update_value[common_feature_value.show_index()] += push_show; + update_value[common_feature_value.click_index()] += push_click; + update_value[common_feature_value.slot_index()] = slot; + update_value[common_feature_value.delta_score_index()] += + (push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() + + push_click * _config.ctr_accessor_param().click_coeff(); + update_value[common_feature_value.unseen_days_index()] = 0; + _embed_sgd_rule->update_value( + update_value + common_feature_value.embed_w_index(), + update_value + common_feature_value.embed_g2sum_index(), + push_value + CtrCommonPushValue::embed_g_index()); + _embedx_sgd_rule->update_value( + update_value + common_feature_value.embedx_w_index(), + update_value + common_feature_value.embedx_g2sum_index(), + push_value + CtrCommonPushValue::embedx_g_index()); + } + return 0; +} + +bool CtrCommonAccessor::create_value(int stage, const float* value) { + // stage == 0, pull + // stage == 1, push + if (stage == 0) { + return true; + } else if (stage == 1) { + // operation + auto show = CtrCommonPushValue::show_const(value); + auto click = CtrCommonPushValue::click_const(value); + auto score = show_click_score(show, click); + if (score <= 0) { + return false; + } + if (score >= 1) { + return true; + } + return local_uniform_real_distribution()(local_random_engine()) < + score; + } else { + return true; + } +} + +float CtrCommonAccessor::show_click_score(float show, float click) { + auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff(); + auto click_coeff = _config.ctr_accessor_param().click_coeff(); + return (show - click) * nonclk_coeff + click * click_coeff; +} + +std::string CtrCommonAccessor::parse_to_string(const float* v, int param) { + thread_local std::ostringstream os; + os.clear(); + os.str(""); + os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " " + << v[5]; + for (int i = common_feature_value.embed_g2sum_index(); + i < common_feature_value.embedx_w_index(); i++) { + os << " " << v[i]; + } + auto show = common_feature_value.show_const(v); + auto click = common_feature_value.click_const(v); + auto score = show_click_score(show, click); + if (score >= _config.embedx_threshold()) { + for (auto i = common_feature_value.embedx_w_index(); + i < common_feature_value.dim(); ++i) { + os << " " << v[i]; + } + } + return os.str(); +} + +int CtrCommonAccessor::parse_from_string(const std::string& str, float* value) { + int embedx_dim = _config.embedx_dim(); + + _embedx_sgd_rule->init_value( + value + common_feature_value.embedx_w_index(), + value + common_feature_value.embedx_g2sum_index()); + auto ret = paddle::string::str_to_float(str.data(), value); + CHECK(ret >= 6) << "expect more than 6 real:" << ret; + return ret; +} + +} // namespace distributed +} // namespace paddle diff --git 
a/paddle/fluid/distributed/table/ctr_accessor.h b/paddle/fluid/distributed/table/ctr_accessor.h new file mode 100644 index 00000000000000..3c2ac7189f7772 --- /dev/null +++ b/paddle/fluid/distributed/table/ctr_accessor.h @@ -0,0 +1,223 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/table/accessor.h" +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" + +namespace paddle { +namespace distributed { + +class CtrCommonAccessor : public ValueAccessor { + public: + struct CtrCommonFeatureValue { + /* + float slot; + float unseen_days; + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + std::vector embedx_w; + std::float embedx_g2sum; + */ + + int dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; } + int dim_size(size_t dim, int embedx_dim) { return sizeof(float); } + int size() { return dim() * sizeof(float); } + int slot_index() { return 0; } + int unseen_days_index() { return slot_index() + 1; } + int delta_score_index() { return unseen_days_index() + 1; } + int show_index() { return delta_score_index() + 1; } + int click_index() { return show_index() + 1; } + int embed_w_index() { return click_index() + 1; } + int embed_g2sum_index() { return embed_w_index() + 1; } + int embedx_w_index() { return embed_g2sum_index() + embed_sgd_dim; } + int embedx_g2sum_index() { return embedx_w_index() + embedx_dim; } + + float& unseen_days(float* val) { return val[unseen_days_index()]; } + float& delta_score(float* val) { return val[delta_score_index()]; } + float& show(float* val) { return val[show_index()]; } + float& click(float* val) { return val[click_index()]; } + float& slot(float* val) { return val[slot_index()]; } + float& embed_w(float* val) { return val[embed_w_index()]; } + float& embed_g2sum(float* val) { return val[embed_g2sum_index()]; } + float& embedx_w(float* val) { return val[embedx_w_index()]; } + float& embedx_g2sum(float* val) { return val[embedx_g2sum_index()]; } + float show_const(const float* val) { + float s = val[show_index()]; + return s; + } + float click_const(const float* val) { + float c = val[click_index()]; + return c; + } + int embed_sgd_dim; + int embedx_dim; + int embedx_sgd_dim; + }; + + struct CtrCommonPushValue { + /* + float slot; + float show; + float click; + float embed_g; + std::vector embedx_g; + */ + + static int dim(int embedx_dim) { return 4 + embedx_dim; } + + static int dim_size(int dim, int embedx_dim) { return sizeof(float); } + static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } + static int slot_index() { return 0; } + static int show_index() { return CtrCommonPushValue::slot_index() + 1; } + static int click_index() { return CtrCommonPushValue::show_index() + 1; } + static int embed_g_index() { return 
CtrCommonPushValue::click_index() + 1; } + static int embedx_g_index() { + return CtrCommonPushValue::embed_g_index() + 1; + } + static float& slot(float* val) { + return val[CtrCommonPushValue::slot_index()]; + } + static float& show(float* val) { + return val[CtrCommonPushValue::show_index()]; + } + static float& click(float* val) { + return val[CtrCommonPushValue::click_index()]; + } + static float show_const(const float* val) { + float s = val[show_index()]; + return s; + } + static float click_const(const float* val) { + float c = val[click_index()]; + return c; + } + static float& embed_g(float* val) { + return val[CtrCommonPushValue::embed_g_index()]; + } + static float* embedx_g(float* val) { + return val + CtrCommonPushValue::embedx_g_index(); + } + };
+ + struct CtrCommonPullValue { + /* + float embed_w; + std::vector embedx_w; + */ + + static int dim(int embedx_dim) { return 1 + embedx_dim; } + static int dim_size(size_t dim) { return sizeof(float); } + static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); } + static int embed_w_index() { return 0; } + static int embedx_w_index() { return 1; } + static float& embed_w(float* val) { + return val[CtrCommonPullValue::embed_w_index()]; + } + static float* embedx_w(float* val) { + return val + CtrCommonPullValue::embedx_w_index(); + } + };
+ CtrCommonAccessor() {} + virtual int initialize(); + virtual ~CtrCommonAccessor() {} + + // number of dims in value + virtual size_t dim(); + // size of each dim in value + virtual size_t dim_size(size_t dim); + // total size of all dims in value + virtual size_t size(); + // total size of the variable-length mf part of value; only effective for sparse + virtual size_t mf_size(); + // number of dims in pull value + virtual size_t select_dim(); + // size of each dim in pull value + virtual size_t select_dim_size(size_t dim); + // total size of all dims in pull value + virtual size_t select_size(); + // number of dims in push value + virtual size_t update_dim(); + // size of each dim in push value + virtual size_t update_dim_size(size_t dim); + // total size of all dims in push value + virtual size_t update_size(); + // whether this value should be shrunk + virtual bool shrink(float* value); + // whether this value should be saved to ssd + // virtual bool save_ssd(float* value); + virtual bool need_extend_mf(float* value); + virtual bool has_mf(size_t size); + // whether this value should be dumped in the save stage; + // param identifies the save stage, e.g. downpour's xbox vs. batch_model + // param = 0, save all feature + // param = 1, save delta feature + // param = 2, save xbox base feature + bool save(float* value, int param) override; + // update delta_score and unseen_days after save + void update_stat_after_save(float* value, int param) override; + // generate random values for keys that do not exist yet; + // the caller must have already allocated the memory of value + virtual int32_t create(float** value, size_t num); + // select from values into select_values + virtual int32_t select(float** select_values, const float** values, + size_t num); + // merge update_values together + virtual int32_t merge(float** update_values, + const float** other_update_values, size_t num); + // merge update_values together; it.next decides whether to move on to the next key + // virtual int32_t merge(float** update_values, iterator it); + // apply update_values to values + virtual int32_t update(float** values, const float** update_values, + size_t num); + + std::string parse_to_string(const float* value, int param) override; + int32_t parse_from_string(const std::string& str, float* v) override; + virtual bool create_value(int type, const float* value); + + // this interface is currently only used to fetch show + float get_field(float* value, const std::string& name) override { + // CHECK(name == "show"); + if (name == "show") { + return common_feature_value.show(value); + } + return 0.0; + } + + private: + // float 
show_click_score(float show, float click); + + // SparseValueSGDRule* _embed_sgd_rule; + // SparseValueSGDRule* _embedx_sgd_rule; + // CtrCommonFeatureValue common_feature_value; + float _show_click_decay_rate; + int32_t _ssd_unseenday_threshold; + + public: // TODO(zhaocaibei123): it should be private, but we make it public + // for unit test + CtrCommonFeatureValue common_feature_value; + float show_click_score(float show, float click); + SparseValueSGDRule* _embed_sgd_rule; + SparseValueSGDRule* _embedx_sgd_rule; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/depends/feature_value.h b/paddle/fluid/distributed/table/depends/feature_value.h new file mode 100644 index 00000000000000..ad037a86bce80c --- /dev/null +++ b/paddle/fluid/distributed/table/depends/feature_value.h @@ -0,0 +1,167 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include // NOLINT +#include +#include +#include // NOLINT +#include +#include +#include +#include +#include "gflags/gflags.h" + +#include "butil/object_pool.h" +#include "paddle/fluid/distributed/common/utils.h" +#include "paddle/fluid/distributed/table/depends/initializers.h" +#include "paddle/fluid/distributed/thirdparty/round_robin.h" +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/rw_lock.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/platform/port.h" +#include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/string_helper.h" + +namespace paddle { +namespace distributed { + +static const int CTR_SPARSE_SHARD_BUCKET_NUM_BITS = 6; +static const size_t CTR_SPARSE_SHARD_BUCKET_NUM = + static_cast(1) << CTR_SPARSE_SHARD_BUCKET_NUM_BITS; + +class FixedFeatureValue { + public: + FixedFeatureValue() {} + ~FixedFeatureValue() {} + float *data() { return data_.data(); } + size_t size() { return data_.size(); } + void resize(size_t size) { data_.resize(size); } + void shrink_to_fit() { data_.shrink_to_fit(); } + + private: + std::vector data_; +}; + +class SparseTableShard { + public: + typedef typename robin_hood::unordered_map + map_type; + SparseTableShard() {} + ~SparseTableShard() {} + + FixedFeatureValue *Init(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + FixedFeatureValue *value = nullptr; + value = butil::get_object(); + table[id] = value; + return value; + } + + // dont judge if (has(id)) + float *Get(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = 
values_[bucket]; + + // auto &value = table.at(id); + // return value->data_.data(); + auto res = table.find(id); + FixedFeatureValue *value = res->second; + return value->data(); + } + + // for load, to reset count, unseen_days + FixedFeatureValue *GetValue(const uint64_t &id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + + auto &table = values_[bucket]; + auto res = table.find(id); + return res->second; + } + + void erase(uint64_t feasign) { + size_t hash = hasher_(feasign); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto iter = table.find(feasign); + if (iter != table.end()) { + butil::return_object(iter->second); + iter = table.erase(iter); + } + } + + void clear() {} + + size_t compute_bucket(size_t hash) { + if (CTR_SPARSE_SHARD_BUCKET_NUM == 1) { + return 0; + } else { + return hash >> (sizeof(size_t) * 8 - CTR_SPARSE_SHARD_BUCKET_NUM_BITS); + } + } + + map_type::iterator end() { + return values_[CTR_SPARSE_SHARD_BUCKET_NUM - 1].end(); + } + + map_type::iterator Find(uint64_t id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { + return end(); + } else { + return got; + } + } + + private: + bool Has(const uint64_t id) { + size_t hash = hasher_(id); + size_t bucket = compute_bucket(hash); + auto &table = values_[bucket]; + + auto got = table.find(id); + if (got == table.end()) { + return false; + } else { + return true; + } + } + + public: + map_type values_[CTR_SPARSE_SHARD_BUCKET_NUM]; + std::hash hasher_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/depends/sparse_utils.h b/paddle/fluid/distributed/table/depends/sparse_utils.h index c185dd17d792e4..708f7786bf3b09 100644 --- a/paddle/fluid/distributed/table/depends/sparse_utils.h +++ b/paddle/fluid/distributed/table/depends/sparse_utils.h @@ -31,8 +31,9 @@ struct PullSparseValue { feasigns_(nullptr), frequencies_(nullptr) {} - explicit PullSparseValue(std::vector feasigns, - std::vector frequencies, int dim) { + explicit PullSparseValue(std::vector& feasigns, // NOLINT + std::vector& frequencies, // NOLINT + int dim) { numel_ = feasigns.size(); dim_ = dim; is_training_ = true; diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.cc b/paddle/fluid/distributed/table/sparse_sgd_rule.cc new file mode 100644 index 00000000000000..614656a5a85d30 --- /dev/null +++ b/paddle/fluid/distributed/table/sparse_sgd_rule.cc @@ -0,0 +1,243 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
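The rules implemented in this file all follow the same contract: load_config() reads the hyper-parameters out of SparseCommonSGDRuleParameter, init_value() fills the embedding slice (plus whatever optimizer state the rule keeps), and update_value() applies one gradient step with every weight clamped into the configured bounds by bound_value(). As orientation, here is a minimal standalone sketch of the naive variant of that contract; the names are illustrative and this is not code from the patch, only the step w[i] -= lr * g[i] followed by clamping is taken from SparseNaiveSGDRule below.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Naive-rule sketch: one SGD step per weight, then clamp into [min_bound, max_bound].
struct NaiveRuleSketch {
  float lr = 0.1f;
  float min_bound = -10.0f;
  float max_bound = 10.0f;
  void update(float* w, const float* g, std::size_t dim) const {
    for (std::size_t i = 0; i < dim; ++i) {
      w[i] -= lr * g[i];
      w[i] = std::min(std::max(w[i], min_bound), max_bound);
    }
  }
};

int main() {
  NaiveRuleSketch rule;
  std::vector<float> w(10, 0.0f);
  std::vector<float> g(10);
  for (std::size_t i = 0; i < g.size(); ++i) g[i] = (i + 1) * 1.0f;
  rule.update(w.data(), g.data(), w.size());
  for (float v : w) std::printf("%f\n", v);  // -0.1, -0.2, ..., -1.0
  return 0;
}

With lr = 0.1 and zero-initialized weights this reproduces the label values asserted for the naive rule in sparse_sgd_rule_test.cc further down.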
+ +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include +#include "glog/logging.h" + +DEFINE_bool(enable_show_scale_gradient, true, "enable show scale gradient"); + +namespace paddle { +namespace distributed { + +void SparseNaiveSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto naive_param = param.naive(); + learning_rate_ = naive_param.learning_rate(); + _initial_range = naive_param.initial_range(); + if (naive_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(naive_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << naive_param.weight_bounds_size(); + _min_bound = naive_param.weight_bounds(0); + _max_bound = naive_param.weight_bounds(1); + } +} + +void SparseNaiveSGDRule::update_value_work(float* w, float* sgd, + const float* push_value, + float scale) { + for (size_t i = 0; i < _embedding_dim; ++i) { + w[i] -= learning_rate_ * push_value[i]; + bound_value(w[i]); + } +} + +void SparseNaiveSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + if (zero_init) { + for (size_t i = 0; i < _embedding_dim; ++i) { + value[i] = 0; + } + } else { + for (size_t i = 0; i < _embedding_dim; ++i) { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } +} +void SparseAdaGradSGDRule::load_config( + const SparseCommonSGDRuleParameter& param, size_t emb_dim) { + _embedding_dim = emb_dim; + auto adagrad_param = param.adagrad(); + learning_rate_ = adagrad_param.learning_rate(); + _initial_g2sum = adagrad_param.initial_g2sum(); + _initial_range = adagrad_param.initial_range(); + + if (adagrad_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adagrad_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adagrad_param.weight_bounds_size(); + _min_bound = adagrad_param.weight_bounds(0); + _max_bound = adagrad_param.weight_bounds(1); + } +} + +void SparseAdaGradSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + float& g2sum = sgd[g2sum_index()]; + double add_g2sum = 0; + + for (int i = 0; i < _embedding_dim; i++) { + double scaled_grad = grad[i] / scale; + w[i] -= learning_rate_ * scaled_grad * + sqrt(_initial_g2sum / (_initial_g2sum + g2sum)); + bound_value(w[i]); + add_g2sum += scaled_grad * scaled_grad; + } + + g2sum += add_g2sum / _embedding_dim; +} + +void SparseAdaGradSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } + sgd[g2sum_index()] = 0; +} + +void StdAdaGradSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto adagrad_param = param.adagrad(); + learning_rate_ = adagrad_param.learning_rate(); + _initial_g2sum = adagrad_param.initial_g2sum(); + _initial_range = adagrad_param.initial_range(); + + if (adagrad_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adagrad_param.weight_bounds_size() >= 2) + << "invalid 
repeated size for weight_bounds:" + << adagrad_param.weight_bounds_size(); + _min_bound = adagrad_param.weight_bounds(0); + _max_bound = adagrad_param.weight_bounds(1); + } +} + +void StdAdaGradSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + for (int i = 0; i < _embedding_dim; i++) { + float& g2sum = sgd[g2sum_index() + i]; + double scaled_grad = grad[i] / scale; + w[i] -= learning_rate_ * scaled_grad * + sqrt(_initial_g2sum / (_initial_g2sum + g2sum)); + bound_value(w[i]); + g2sum += scaled_grad * scaled_grad; + } +} + +void StdAdaGradSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + sgd[g2sum_index() + i] = 0; + } +} + +void SparseAdamSGDRule::load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + auto adam_param = param.adam(); + learning_rate_ = adam_param.learning_rate(); + _initial_range = adam_param.initial_range(); + _beta1_decay_rate = adam_param.beta1_decay_rate(); + _beta2_decay_rate = adam_param.beta2_decay_rate(); + _ada_epsilon = adam_param.ada_epsilon(); + if (adam_param.weight_bounds_size() == 0) { + _min_bound = -std::numeric_limits::max(); + _max_bound = std::numeric_limits::max(); + } else { + CHECK(adam_param.weight_bounds_size() >= 2) + << "invalid repeated size for weight_bounds:" + << adam_param.weight_bounds_size(); + _min_bound = adam_param.weight_bounds(0); + _max_bound = adam_param.weight_bounds(1); + } +} + +void SparseAdamSGDRule::update_value_work(float* w, float* sgd, + const float* grad, float scale) { + float* gsum = sgd + gsum_index(); + float* g2sum = sgd + g2sum_index(); + float* beta1_pow = sgd + beta1_pow_index(); + float* beta2_pow = sgd + beta2_pow_index(); + const float* g = grad; + + float lr = learning_rate_; + float beta1_pow_ = *beta1_pow; + float beta2_pow_ = *beta2_pow; + + // lr not change in one update + lr *= sqrt(1 - beta2_pow_) / (1 - beta1_pow_); + for (int i = 0; i < _embedding_dim; i++) { + // Calculation + gsum[i] = _beta1_decay_rate * gsum[i] + (1 - _beta1_decay_rate) * g[i]; + g2sum[i] = + _beta2_decay_rate * g2sum[i] + (1 - _beta2_decay_rate) * g[i] * g[i]; + w[i] = w[i] - lr * (gsum[i] / (sqrt(g2sum[i]) + _ada_epsilon)); + bound_value(w[i]); + } + // update beta_pow_decay + (*beta1_pow) *= _beta1_decay_rate; + (*beta2_pow) *= _beta2_decay_rate; +} + +void SparseAdamSGDRule::init_value_work(float* value, float* sgd, + bool zero_init) { + for (int i = 0; i < _embedding_dim; ++i) { + if (zero_init) { + value[i] = 0.0; + bound_value(value[i]); + } else { + value[i] = + (local_uniform_real_distribution()(local_random_engine()) * + 2 - + 1) * + _initial_range; + bound_value(value[i]); + } + } + // init rule gsum and g2sum + for (int i = gsum_index(); i < beta1_pow_index(); i++) { + sgd[i] = 0.0; + } + // init beta1_pow and beta2_pow + *(sgd + beta1_pow_index()) = _beta1_decay_rate; + *(sgd + beta2_pow_index()) = _beta2_decay_rate; +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/table/sparse_sgd_rule.h b/paddle/fluid/distributed/table/sparse_sgd_rule.h new file mode 100644 index 00000000000000..ba2baa42f742ab --- /dev/null +++ b/paddle/fluid/distributed/table/sparse_sgd_rule.h @@ -0,0 +1,134 @@ +// Copyright (c) 2021 PaddlePaddle 
Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "glog/logging.h" // for CHECK +#include "paddle/fluid/distributed/common/local_random.h" // for local_uniform_real_distribution +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" + +namespace paddle { +namespace distributed { + +class SparseValueSGDRule { + public: + SparseValueSGDRule() {} + virtual ~SparseValueSGDRule() {} + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim) { + _embedding_dim = emb_dim; + _name = param.name(); + } + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale) = 0; + virtual void init_value_work(float* value, float* sgd, bool zero_init) = 0; + virtual size_t dim() = 0; + const std::string& get_name() const { return _name; } + void init_value(float* value, float* sgd, bool zero_init = true) { + init_value_work(value, sgd, zero_init); + } + void update_value(float* w, float* sgd, const float* push_value, + float scale = 1) { + update_value_work(w, sgd, push_value, scale); + } + template + void bound_value(T& w) { // NOLINT + if (!(w >= _min_bound)) { + w = (T)_min_bound; + } else if (!(w <= _max_bound)) { + w = (T)_max_bound; + } + } + float& min_bound() { return _min_bound; } + float& max_bound() { return _max_bound; } + + protected: + float _min_bound; + float _max_bound; + float _initial_range; + size_t _embedding_dim; + + private: + std::string _name; +}; + +REGISTER_PSCORE_REGISTERER(SparseValueSGDRule); + +class SparseNaiveSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return 0; } + + private: + float learning_rate_; +}; + +class SparseAdaGradSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return 1; } + size_t g2sum_index() { return 0; } + + private: + float learning_rate_; + float _initial_g2sum; +}; + +class StdAdaGradSGDRule : public SparseValueSGDRule { + public: + virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return _embedding_dim; } + size_t g2sum_index() { return 0; } + + private: + float learning_rate_; + float _initial_g2sum; +}; + +class SparseAdamSGDRule : public SparseValueSGDRule { + public: + 
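// The sgd buffer handed to this rule holds [ gsum (emb_dim) | g2sum (emb_dim) | beta1_pow | beta2_pow ], + // i.e. dim() == 2 * emb_dim + 2; see the index helpers below. + 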
virtual void load_config(const SparseCommonSGDRuleParameter& param, + size_t emb_dim); + virtual void update_value_work(float* w, float* sgd, const float* push_value, + float scale); + virtual void init_value_work(float* value, float* sgd, bool zero_init); + virtual size_t dim() { return _embedding_dim * 2 + 2; } + size_t gsum_index() { return 0; } + size_t g2sum_index() { return gsum_index() + _embedding_dim; } + size_t beta1_pow_index() { return g2sum_index() + _embedding_dim; } + size_t beta2_pow_index() { return beta1_pow_index() + 1; } + + protected: + float learning_rate_; + float _beta1_decay_rate; + float _beta2_decay_rate; + float _ada_epsilon; +}; +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/CMakeLists.txt b/paddle/fluid/distributed/test/CMakeLists.txt index af87e1b6cc61d1..f8cd9af4774ec5 100644 --- a/paddle/fluid/distributed/test/CMakeLists.txt +++ b/paddle/fluid/distributed/test/CMakeLists.txt @@ -20,3 +20,12 @@ cc_test(brpc_utils_test SRCS brpc_utils_test.cc DEPS brpc_utils scope math_funct set_source_files_properties(graph_node_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(graph_node_test SRCS graph_node_test.cc DEPS graph_py_service scope server client communicator ps_service boost table ps_framework_proto ${COMMON_DEPS}) + +set_source_files_properties(feature_value_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(feature_value_test SRCS feature_value_test.cc DEPS ${COMMON_DEPS} boost table) + +set_source_files_properties(sparse_sgd_rule_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(sparse_sgd_rule_test SRCS sparse_sgd_rule_test.cc DEPS ${COMMON_DEPS} boost table) + +set_source_files_properties(ctr_accessor_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) +cc_test(ctr_accessor_test SRCS ctr_accessor_test.cc DEPS ${COMMON_DEPS} boost table) diff --git a/paddle/fluid/distributed/test/ctr_accessor_test.cc b/paddle/fluid/distributed/test/ctr_accessor_test.cc new file mode 100644 index 00000000000000..8c667cad605fcc --- /dev/null +++ b/paddle/fluid/distributed/test/ctr_accessor_test.cc @@ -0,0 +1,304 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/distributed/table/ctr_accessor.h" +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/common/registerer.h" +#include "paddle/fluid/distributed/ps.pb.h" +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" + +namespace paddle { +namespace distributed { +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule); +REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule); + +TableAccessorParameter gen_param() { + TableAccessorParameter param; + param.set_accessor_class("CtrCommonAccessor"); + param.set_fea_dim(11); + param.set_embedx_dim(8); + param.mutable_ctr_accessor_param()->set_nonclk_coeff(0.2); + param.mutable_ctr_accessor_param()->set_click_coeff(1); + param.mutable_ctr_accessor_param()->set_base_threshold(0.5); + param.mutable_ctr_accessor_param()->set_delta_threshold(0.2); + param.mutable_ctr_accessor_param()->set_delta_keep_days(16); + param.mutable_ctr_accessor_param()->set_show_click_decay_rate(0.99); + /* + param.mutable_embed_sgd_param()->set_name("naive"); + auto* naive_param = param.mutable_embed_sgd_param()->mutable_naive(); + naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + */ + param.mutable_embed_sgd_param()->set_name("StdAdaGradSGDRule"); + auto* adagrad_param = param.mutable_embed_sgd_param()->mutable_adagrad(); + adagrad_param->set_learning_rate(0.1); + adagrad_param->set_initial_range(0.3); + adagrad_param->set_initial_g2sum(0.0); + adagrad_param->add_weight_bounds(-10.0); + adagrad_param->add_weight_bounds(10.0); + + param.mutable_embedx_sgd_param()->set_name("SparseNaiveSGDRule"); + auto* naive_param = param.mutable_embedx_sgd_param()->mutable_naive(); + naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + + return std::move(param); +} + +TEST(downpour_feature_value_accessor_test, test_shrink) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + VLOG(3) << "size of struct: " << acc->common_feature_value.embed_sgd_dim + << " " << acc->common_feature_value.embedx_dim << " " + << acc->common_feature_value.embedx_sgd_dim << " " + << acc->common_feature_value.dim() << "\n"; + + float* value = new float[acc->dim()]; + for (auto i = 0u; i < acc->dim(); ++i) { + value[i] = i * 1.0; + } + ASSERT_TRUE(!acc->shrink(value)); + + // set unseen_days too long + value[1] = 1000; + // set delta score too small + value[2] = 0.001; + ASSERT_TRUE(acc->shrink(value)); +} + +TEST(downpour_feature_value_accessor_test, test_save) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + float* value = new float[acc->dim()]; + for (auto i = 0u; i < acc->dim(); ++i) { + value[i] = i * 1.0; + } + + // save all feature + ASSERT_TRUE(acc->save(value, 0)); + + // save delta feature + ASSERT_TRUE(acc->save(value, 1)); + + // save base feature with time decay + ASSERT_TRUE(acc->save(value, 2)); + + VLOG(3) << "test_save:"; + for (auto i = 0u; i < acc->dim(); ++i) { + VLOG(3) << value[i]; + } +} + 
+TEST(downpour_feature_value_accessor_test, test_create) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + const int field_size = 7 + 8; + const int item_size = 10; + + float** value = new float*[item_size]; + for (auto i = 0u; i < item_size; ++i) { + value[i] = new float[field_size]; + } + ASSERT_EQ(acc->create(value, item_size), 0); + + for (auto i = 0u; i < item_size; ++i) { + for (auto j = 0u; j < field_size; ++j) { + VLOG(3) << value[i][j] << " "; + // ASSERT_FLOAT_EQ(value[i][j], 0); + } + VLOG(3) << "\n"; + } +} + +TEST(downpour_feature_value_accessor_test, test_update) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + VLOG(3) << "dim: " << acc->common_feature_value.dim() << "\n"; + VLOG(3) << "update_dim: " << acc->update_dim() << "\n"; + + const int field_size = 7 + 8; + const int item_size = 10; + + float** value = new float*[item_size]; + for (auto i = 0u; i < item_size; ++i) { + value[i] = new float[field_size]; + + for (auto j = 0u; j < field_size; ++j) { + value[i][j] = 0; + } + } + + typedef const float* const_float_ptr; + const_float_ptr* grad = new const_float_ptr[item_size]; + for (auto i = 0u; i < item_size; ++i) { + float* p = new float[acc->update_dim()]; + for (auto j = 0u; j < acc->update_dim(); ++j) { + p[j] = i; + } + grad[i] = p; + } + + struct DownpourSparseValueTest { + float slot; + float unseen_days; + float delta_score; + float show; + float click; + float embed_w; + std::vector embed_g2sum; + std::vector embedx_w; + std::vector embedx_g2sum; + + void to_array(float* ptr, size_t dim) { + ptr[0] = slot; + ptr[1] = unseen_days; + ptr[2] = delta_score; + ptr[3] = show; + ptr[4] = click; + ptr[5] = embed_w; + int idx = 6; + for (auto j = 0u; j < 1; ++j) { + ptr[idx + j] = embed_g2sum[j]; + } + idx += 1; + for (auto j = 0u; j < 8; ++j) { + ptr[idx + j] = embedx_w[j]; + } + idx += 8; + for (auto j = 0u; j < 0; ++j) { + ptr[idx + j] = embedx_g2sum[j]; + } + } + }; + struct DownpourSparsePushValueTest { + float slot; + float show; + float click; + float embed_g; + std::vector embedx_g; + }; + std::vector exp_value; + for (auto i = 0u; i < item_size; ++i) { + DownpourSparseValueTest v; + v.slot = value[i][0]; + v.unseen_days = value[i][1]; + v.delta_score = value[i][2]; + v.show = value[i][3]; + v.click = value[i][4]; + v.embed_w = value[i][5]; + + int idx = 6; + for (auto j = 0u; j < acc->common_feature_value.embed_sgd_dim; ++j) { + v.embed_g2sum.push_back(value[i][idx + j]); + } + idx += acc->common_feature_value.embed_sgd_dim; + for (auto j = 0u; j < acc->common_feature_value.embedx_dim; ++j) { + v.embedx_w.push_back(value[i][idx + j]); + } + idx += acc->common_feature_value.embedx_dim; + for (auto j = 0u; j < acc->common_feature_value.embedx_sgd_dim; ++j) { + v.embedx_g2sum.push_back(value[i][idx + j]); + } + + DownpourSparsePushValueTest push_v; + push_v.slot = grad[i][0]; + push_v.show = grad[i][1]; + push_v.click = grad[i][2]; + push_v.embed_g = grad[i][3]; + for (auto j = 0; j < parameter.embedx_dim(); ++j) { + push_v.embedx_g.push_back(grad[i][4 + j]); + } + + v.slot = push_v.slot; + v.unseen_days = 0; + v.show += push_v.show; + v.click += push_v.click; + v.delta_score += acc->show_click_score(push_v.show, push_v.click); + + acc->_embed_sgd_rule->update_value(&v.embed_w, &v.embed_g2sum[0], + 
&push_v.embed_g); + acc->_embedx_sgd_rule->update_value(&v.embedx_w[0], &v.embedx_g2sum[0], + &push_v.embedx_g[0]); + + float* ptr = new float[acc->dim()]; + v.to_array(ptr, parameter.embedx_dim()); + exp_value.push_back(ptr); + } + acc->update(value, grad, item_size); + + for (auto i = 0u; i < item_size; ++i) { + for (auto j = 0u; j < acc->dim(); ++j) { + VLOG(3) << value[i][j] << ":" << exp_value[i][j] << " "; + ASSERT_FLOAT_EQ(value[i][j], exp_value[i][j]); + } + } +} + +TEST(downpour_feature_value_accessor_test, test_show_click_score) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + float show = 10; + float click = 6; + ASSERT_FLOAT_EQ(acc->show_click_score(show, click), 6.8); +} + +TEST(downpour_feature_value_accessor_test, test_string_related) { + TableAccessorParameter parameter = gen_param(); + CtrCommonAccessor* acc = new CtrCommonAccessor(); + ASSERT_EQ(acc->configure(parameter), 0); + ASSERT_EQ(acc->initialize(), 0); + + const int field_size = 15; + float* value = new float[field_size]; + for (auto i = 0u; i < field_size; ++i) { + value[i] = i; + } + + auto str = acc->parse_to_string(value, 0); + + VLOG(3) << str << std::endl; + + str = "0 1 2 3 4 5 6"; + ASSERT_NE(acc->parse_from_string(str, value), 0); + // make sure init_zero=true + + for (auto i = 7; i < 15; ++i) { + ASSERT_FLOAT_EQ(value[i], 0); + } +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/feature_value_test.cc b/paddle/fluid/distributed/test/feature_value_test.cc new file mode 100644 index 00000000000000..9c9f0ffcac321d --- /dev/null +++ b/paddle/fluid/distributed/test/feature_value_test.cc @@ -0,0 +1,55 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include +#include // NOLINT +#include + +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/table/depends/feature_value.h" + +namespace paddle { +namespace distributed { + +TEST(BENCHMARK, LargeScaleKV) { + std::shared_ptr shard = + std::make_shared(); + uint64_t key = 1; + auto itr = shard->Find(key); + ASSERT_TRUE(itr == shard->end()); + + std::vector vec = {0.0, 0.1, 0.2, 0.3}; + + auto* feature_value = shard->Init(key); + feature_value->resize(vec.size()); + memcpy(feature_value->data(), vec.data(), vec.size() * sizeof(float)); + + itr = shard->Find(key); + ASSERT_TRUE(itr != shard->end()); + + feature_value = itr->second; + float* value_data = feature_value->data(); + + ASSERT_FLOAT_EQ(value_data[0], 0.0); + ASSERT_FLOAT_EQ(value_data[1], 0.1); + ASSERT_FLOAT_EQ(value_data[2], 0.2); + ASSERT_FLOAT_EQ(value_data[3], 0.3); +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 810530cdbec94d..613770220f9d79 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -138,6 +138,10 @@ void testSingleSampleNeighboor( for (auto g : s) { ASSERT_EQ(true, s1.find(g) != s1.end()); } + vs.clear(); + pull_status = worker_ptr_->batch_sample_neighboors(0, {96, 37}, 4, vs, 0); + pull_status.wait(); + ASSERT_EQ(vs.size(), 2); } void testAddNode( @@ -356,6 +360,7 @@ void RunServer() { pserver_ptr_->configure(server_proto, _ps_env, 0, empty_vec); LOG(INFO) << "first server, run start(ip,port)"; pserver_ptr_->start(ip_, port_); + pserver_ptr_->build_peer2peer_connection(0); LOG(INFO) << "init first server Done"; } @@ -373,6 +378,7 @@ void RunServer2() { empty_vec2.push_back(empty_prog2); pserver_ptr2->configure(server_proto2, _ps_env2, 1, empty_vec2); pserver_ptr2->start(ip2, port2); + pserver_ptr2->build_peer2peer_connection(1); } void RunClient( diff --git a/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc new file mode 100644 index 00000000000000..e86234f1bd9c76 --- /dev/null +++ b/paddle/fluid/distributed/test/sparse_sgd_rule_test.cc @@ -0,0 +1,191 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/distributed/table/sparse_sgd_rule.h" +#include +#include +#include "gtest/gtest.h" +#include "paddle/fluid/distributed/ps.pb.h" + +namespace paddle { +namespace distributed { + +TEST(sparse_value_naive_sgd_test, init_and_update) { + SparseNaiveSGDRule rule; + SparseCommonSGDRuleParameter param; + param.set_name("naive"); + auto* naive_param = param.mutable_naive(); + naive_param->set_learning_rate(0.1); + naive_param->set_initial_range(0.3); + naive_param->add_weight_bounds(-10.0); + naive_param->add_weight_bounds(10.0); + + rule.load_config(param, 10); + + // check init_value for zero + const int kItemSize = 10; + float w[kItemSize]; + float grad[kItemSize]; + rule.init_value(w, w + 9, true); + + for (auto i = 0u; i < kItemSize; ++i) { + ASSERT_FLOAT_EQ(w[i], 0); + } + + // check init_value for random + rule.init_value(w, w + 9, false); + for (auto i = 0u; i < kItemSize; ++i) { + ASSERT_TRUE(w[i] >= rule.min_bound() && w[i] <= rule.max_bound()); + } + + // check update_value for one field + for (auto i = 0u; i < kItemSize; ++i) { + w[i] = 0; + } + for (auto i = 0u; i < kItemSize; ++i) { + grad[i] = (i + 1) * 1.0; + } + float label[] = {-0.100000, -0.200000, -0.300000, -0.400000, -0.500000, + -0.600000, -0.700000, -0.800000, -0.900000, -1.000000}; + const float* ptr_grad = grad; + rule.update_value(w, w + 9, ptr_grad); + + for (auto i = 0u; i < kItemSize; ++i) { + VLOG(3) << w[i] << "\n"; + ASSERT_FLOAT_EQ(w[i], label[i]); + } +} + +TEST(downpour_sparse_adagrad_test, test_init_and_update) { + SparseAdaGradSGDRule rule; + SparseCommonSGDRuleParameter param; + param.set_name("adagrad"); + auto* adagrad_param = param.mutable_adagrad(); + adagrad_param->set_learning_rate(0.1); + adagrad_param->set_initial_g2sum(0.2); + adagrad_param->set_initial_range(0.3); + adagrad_param->add_weight_bounds(-10.0); + adagrad_param->add_weight_bounds(10.0); + + rule.load_config(param, 10); + + // check init_value for zero + const int kValueSize = 11; + int kEmbSize = 10; + float w[kValueSize]; + + rule.init_value(w, w + 10, true); + + for (auto i = 0u; i < kEmbSize; ++i) { + ASSERT_FLOAT_EQ(w[i], 0); + } + ASSERT_FLOAT_EQ(w[kEmbSize], 0); + + // check init_value for random + rule.init_value(w, w + 10, false); + for (auto i = 0u; i < kEmbSize; ++i) { + ASSERT_TRUE(w[i] >= rule.min_bound() && w[i] <= rule.max_bound()); + } + ASSERT_FLOAT_EQ(w[kEmbSize], 0); + + // check update_value for one field + for (auto i = 0u; i < kEmbSize; ++i) { + w[i] = 0; + } + w[kEmbSize] = 0; + float grad[kEmbSize]; + for (auto i = 0u; i < kEmbSize; ++i) { + grad[i] = (i + 1) * 1.0; + } + + const float* ptr_grad = grad; + rule.update_value(w, w + 10, ptr_grad); + float label[] = {-0.100000, -0.200000, -0.300000, -0.400000, + -0.500000, -0.600000, -0.700000, -0.800000, + -0.900000, -1.000000, 38.500000}; + for (auto i = 0u; i < kValueSize; ++i) { + ASSERT_FLOAT_EQ(w[i], label[i]); + } +} + +TEST(downpour_sparse_adam_test, test_init_and_update) { + const int embed_dim = 10; // dims of parameters + SparseCommonSGDRuleParameter param; + param.set_name("adam"); + auto* adam_param = param.mutable_adam(); + adam_param->set_learning_rate(0.1); + adam_param->set_initial_range(0.3); + adam_param->set_beta1_decay_rate(0.9); + adam_param->set_beta2_decay_rate(0.999); + adam_param->set_ada_epsilon(1e-08); + adam_param->add_weight_bounds(-10.0); + adam_param->add_weight_bounds(10.0); + + ASSERT_FLOAT_EQ(param.adam().learning_rate(), 0.1); + ASSERT_FLOAT_EQ(param.adam().initial_range(), 0.3); + 
ASSERT_FLOAT_EQ(param.adam().beta1_decay_rate(), 0.9); + ASSERT_FLOAT_EQ(param.adam().beta2_decay_rate(), 0.999); + ASSERT_FLOAT_EQ(param.adam().ada_epsilon(), 1e-08); + + SparseAdamSGDRule rule; + + rule.load_config(param, embed_dim); + + // check init_value for zero + const int rule_dim = + rule.dim(); // dims of gsum + g2sum + beta1_pow + beta2_pow in adam + const int value_dim = embed_dim + rule_dim; // total dims of w + rule + float* value = new float[value_dim]; + rule.init_value(value, value + embed_dim, true); + for (auto i = 0u; i < rule.beta1_pow_index(); ++i) { + ASSERT_FLOAT_EQ(value[i], 0); + } + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta1_pow_index()), 0.9); + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta2_pow_index()), 0.999); + + // check init_value for random + rule.init_value(value, value + embed_dim, false); + for (auto i = 0u; i < embed_dim; ++i) { + ASSERT_TRUE(value[i] >= rule.min_bound() && value[i] <= rule.max_bound()); + } + for (auto i = rule.gsum_index(); i < rule.beta1_pow_index(); ++i) { + ASSERT_FLOAT_EQ(value[i + embed_dim], 0); + } + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta1_pow_index()), 0.9); + ASSERT_FLOAT_EQ(*(value + embed_dim + rule.beta2_pow_index()), 0.999); + + // check update_value + rule.init_value(value, value + embed_dim, true); + float* grad = new float[embed_dim]; + for (auto i = 0u; i < embed_dim; ++i) { + grad[i] = (i + 1) * 1.0; + } + + float label[] = {-0.0999999642, -0.099999994, -0.099999994, -0.099999994, + -0.099999994, -0.099999994, -0.099999994, -0.100000001, + -0.100000009, -0.100000001, 0.100000024, 0.200000048, + 0.300000072, 0.400000095, 0.500000119, 0.600000143, + 0.700000167, 0.800000191, 0.900000215, 1.00000024, + 0.000999987125, 0.0039999485, 0.00899988413, 0.015999794, + 0.0249996781, 0.0359995365, 0.0489993691, 0.063999176, + 0.0809989572, 0.0999987125, 0.809999943, 0.998001039}; + + rule.update_value(value, value + embed_dim, grad); + + for (auto i = 0u; i < value_dim; ++i) { // check update + ASSERT_FLOAT_EQ(value[i], label[i]) << "i is " << i; + } +} +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index de19c7a0e773e3..edb43b8d38c276 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -26,6 +26,9 @@ add_subdirectory(details) add_subdirectory(fleet) add_subdirectory(io) add_subdirectory(new_executor) +if (WITH_CINN) + add_subdirectory(paddle2cinn) +endif() #ddim lib proto_library(framework_proto SRCS framework.proto) proto_library(pass_desc_proto SRCS pass_desc.proto DEPS framework_proto) @@ -50,6 +53,8 @@ proto_library(data_feed_proto SRCS data_feed.proto) proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto data_feed_proto) +cc_library(string_array SRCS string_array.cc DEPS utf8proc) + cc_library(ddim SRCS ddim.cc DEPS eigen3 boost enforce) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) if(WITH_GPU) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index 503f1513aad20c..80fee94f1c85d9 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -157,7 +157,19 @@ class ChannelObject { p.resize(finished); return finished; } + // read once only + size_t ReadOnce(std::vector& p, size_t size) { // NOLINT + if (size == 0) { + return 0; + } + std::unique_lock lock(mutex_); + p.resize(size); + size_t finished = Read(size, &p[0], lock, true); + p.resize(finished); + Notify(); + return finished; + 
} size_t ReadAll(std::vector& p) { // NOLINT p.clear(); size_t finished = 0; @@ -241,17 +253,21 @@ class ChannelObject { return !closed_; } - size_t Read(size_t n, T* p, std::unique_lock& lock) { // NOLINT + size_t Read(size_t n, T* p, std::unique_lock& lock, // NOLINT + bool once = false) { // NOLINT size_t finished = 0; CHECK(n <= MaxCapacity() - reading_count_); reading_count_ += n; while (finished < n && WaitForRead(lock)) { - size_t m = std::min(n - finished, data_.size()); + size_t m = (std::min)(n - finished, data_.size()); for (size_t i = 0; i < m; i++) { p[finished++] = std::move(data_.front()); data_.pop_front(); } reading_count_ -= m; + if (once && m > 0) { + break; + } } reading_count_ -= n - finished; return finished; diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index fdb24ee18eca7d..2d089b4721b82c 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -28,6 +28,7 @@ limitations under the License. */ #include "paddle/fluid/platform/timer.h" USE_INT_STAT(STAT_total_feasign_num_in_mem); +DECLARE_bool(enable_ins_parser_file); namespace paddle { namespace framework { @@ -36,6 +37,107 @@ DLManager& global_dlmanager_pool() { return manager; } +class BufferedLineFileReader { + typedef std::function SampleFunc; + static const int MAX_FILE_BUFF_SIZE = 4 * 1024 * 1024; + class FILEReader { + public: + explicit FILEReader(FILE* fp) : fp_(fp) {} + int read(char* buf, int len) { return fread(buf, sizeof(char), len, fp_); } + + private: + FILE* fp_; + }; + + public: + typedef std::function LineFunc; + + private: + template + int read_lines(T* reader, LineFunc func, int skip_lines) { + int lines = 0; + size_t ret = 0; + char* ptr = NULL; + char* eol = NULL; + total_len_ = 0; + error_line_ = 0; + + SampleFunc spfunc = get_sample_func(); + std::string x; + while (!is_error() && (ret = reader->read(buff_, MAX_FILE_BUFF_SIZE)) > 0) { + total_len_ += ret; + ptr = buff_; + eol = reinterpret_cast(memchr(ptr, '\n', ret)); + while (eol != NULL) { + int size = static_cast((eol - ptr) + 1); + x.append(ptr, size - 1); + ++lines; + if (lines > skip_lines && spfunc()) { + if (!func(x)) { + ++error_line_; + } + } + + x.clear(); + ptr += size; + ret -= size; + eol = reinterpret_cast(memchr(ptr, '\n', ret)); + } + if (ret > 0) { + x.append(ptr, ret); + } + } + if (!is_error() && !x.empty()) { + ++lines; + if (lines > skip_lines && spfunc()) { + if (!func(x)) { + ++error_line_; + } + } + } + return lines; + } + + public: + BufferedLineFileReader() + : random_engine_(std::random_device()()), + uniform_distribution_(0.0f, 1.0f) { + total_len_ = 0; + sample_line_ = 0; + buff_ = + reinterpret_cast(calloc(MAX_FILE_BUFF_SIZE + 1, sizeof(char))); + } + ~BufferedLineFileReader() { free(buff_); } + + int read_file(FILE* fp, LineFunc func, int skip_lines) { + FILEReader reader(fp); + return read_lines(&reader, func, skip_lines); + } + uint64_t file_size(void) { return total_len_; } + void set_sample_rate(float r) { sample_rate_ = r; } + size_t get_sample_line() { return sample_line_; } + bool is_error(void) { return (error_line_ > 10); } + + private: + SampleFunc get_sample_func() { + if (std::abs(sample_rate_ - 1.0f) < 1e-5f) { + return [this](void) { return true; }; + } + return [this](void) { + return (uniform_distribution_(random_engine_) < sample_rate_); + }; + } + + private: + char* buff_ = nullptr; + uint64_t total_len_ = 0; + + std::default_random_engine random_engine_; + std::uniform_real_distribution uniform_distribution_; + 
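// Sampling and error bookkeeping: get_sample_func() keeps each line with probability + // sample_rate_, and is_error() reports failure once more than 10 lines have been rejected. + 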
float sample_rate_ = 1.0f; + size_t sample_line_ = 0; + size_t error_line_ = 0; +}; void RecordCandidateList::ReSize(size_t length) { mutex_.lock(); capacity_ = length; @@ -301,7 +403,7 @@ int InMemoryDataFeed::Next() { << ", thread_id=" << thread_id_; } } else { - VLOG(3) << "enable heter NEXT: " << offset_index_ + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size(); if (offset_index_ >= batch_offsets_.size()) { VLOG(3) << "offset_index: " << offset_index_ @@ -318,14 +420,7 @@ int InMemoryDataFeed::Next() { VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" << thread_id_; } - /* - if (offset_index_ == batch_offsets_.size() - 1) { - std::vector data; - output_channel_->ReadAll(data); - consume_channel_->Write(std::move(data)); - } - */ - VLOG(3) << "#15 enable heter NEXT: " << offset_index_ + VLOG(3) << "enable heter next: " << offset_index_ << " batch_offsets: " << batch_offsets_.size() << " baych_size: " << this->batch_size_; } @@ -1835,5 +1930,646 @@ void PaddleBoxDataFeed::PutToFeedVec(const std::vector& ins_vec) { #endif } +template class InMemoryDataFeed; +void SlotRecordInMemoryDataFeed::Init(const DataFeedDesc& data_feed_desc) { + finish_init_ = false; + finish_set_filelist_ = false; + finish_start_ = false; + PADDLE_ENFORCE(data_feed_desc.has_multi_slot_desc(), + platform::errors::PreconditionNotMet( + "Multi_slot_desc has not been set in data_feed_desc")); + paddle::framework::MultiSlotDesc multi_slot_desc = + data_feed_desc.multi_slot_desc(); + SetBatchSize(data_feed_desc.batch_size()); + size_t all_slot_num = multi_slot_desc.slots_size(); + + all_slots_.resize(all_slot_num); + all_slots_info_.resize(all_slot_num); + used_slots_info_.resize(all_slot_num); + use_slot_size_ = 0; + use_slots_.clear(); + + float_total_dims_size_ = 0; + float_total_dims_without_inductives_.clear(); + for (size_t i = 0; i < all_slot_num; ++i) { + const auto& slot = multi_slot_desc.slots(i); + all_slots_[i] = slot.name(); + + AllSlotInfo& all_slot = all_slots_info_[i]; + all_slot.slot = slot.name(); + all_slot.type = slot.type(); + all_slot.used_idx = slot.is_used() ? 
use_slot_size_ : -1; + all_slot.slot_value_idx = -1; + + if (slot.is_used()) { + UsedSlotInfo& info = used_slots_info_[use_slot_size_]; + info.idx = i; + info.slot = slot.name(); + info.type = slot.type(); + info.dense = slot.is_dense(); + info.total_dims_without_inductive = 1; + info.inductive_shape_index = -1; + + // record float value and uint64_t value pos + if (info.type[0] == 'u') { + info.slot_value_idx = uint64_use_slot_size_; + all_slot.slot_value_idx = uint64_use_slot_size_; + ++uint64_use_slot_size_; + } else if (info.type[0] == 'f') { + info.slot_value_idx = float_use_slot_size_; + all_slot.slot_value_idx = float_use_slot_size_; + ++float_use_slot_size_; + } + + use_slots_.push_back(slot.name()); + + if (slot.is_dense()) { + for (int j = 0; j < slot.shape_size(); ++j) { + if (slot.shape(j) > 0) { + info.total_dims_without_inductive *= slot.shape(j); + } + if (slot.shape(j) == -1) { + info.inductive_shape_index = j; + } + } + } + if (info.type[0] == 'f') { + float_total_dims_without_inductives_.push_back( + info.total_dims_without_inductive); + float_total_dims_size_ += info.total_dims_without_inductive; + } + info.local_shape.clear(); + for (int j = 0; j < slot.shape_size(); ++j) { + info.local_shape.push_back(slot.shape(j)); + } + ++use_slot_size_; + } + } + used_slots_info_.resize(use_slot_size_); + + feed_vec_.resize(used_slots_info_.size()); + const int kEstimatedFeasignNumPerSlot = 5; // Magic Number + for (size_t i = 0; i < all_slot_num; i++) { + batch_float_feasigns_.push_back(std::vector()); + batch_uint64_feasigns_.push_back(std::vector()); + batch_float_feasigns_[i].reserve(default_batch_size_ * + kEstimatedFeasignNumPerSlot); + batch_uint64_feasigns_[i].reserve(default_batch_size_ * + kEstimatedFeasignNumPerSlot); + offset_.push_back(std::vector()); + offset_[i].reserve(default_batch_size_ + + 1); // Each lod info will prepend a zero + } + visit_.resize(all_slot_num, false); + pipe_command_ = data_feed_desc.pipe_command(); + finish_init_ = true; + input_type_ = data_feed_desc.input_type(); + size_t pos = pipe_command_.find(".so"); + if (pos != std::string::npos) { + pos = pipe_command_.rfind('|'); + if (pos == std::string::npos) { + so_parser_name_ = pipe_command_; + pipe_command_.clear(); + } else { + so_parser_name_ = pipe_command_.substr(pos + 1); + pipe_command_ = pipe_command_.substr(0, pos); + } + so_parser_name_ = paddle::string::erase_spaces(so_parser_name_); + } else { + so_parser_name_.clear(); + } +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemory() { + VLOG(3) << "SlotRecord LoadIntoMemory() begin, thread_id=" << thread_id_; + if (!so_parser_name_.empty()) { + LoadIntoMemoryByLib(); + } else { + LoadIntoMemoryByCommand(); + } +} +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLib(void) { + if (true) { + // user defined file format analysis + LoadIntoMemoryByFile(); + } else { + LoadIntoMemoryByLine(); + } +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByFile(void) { +#ifdef _LINUX + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, all_slots_info_); + CHECK(parser != nullptr); + // get slotrecord object + auto pull_record_func = [this](std::vector& record_vec, + int max_fetch_num, int offset) { + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (max_fetch_num > 0) { + SlotRecordPool().get(&record_vec[0], offset); + } else { // free all + max_fetch_num = static_cast(record_vec.size()); + if (max_fetch_num > offset) { + SlotRecordPool().put(&record_vec[offset], (max_fetch_num - 
offset)); + } + } + } else if (max_fetch_num > 0) { + SlotRecordPool().get(&record_vec, max_fetch_num); + } else { + SlotRecordPool().put(&record_vec); + } + }; + + std::string filename; + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + platform::Timer timeline; + timeline.Start(); + + int lines = 0; + bool is_ok = true; + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + is_ok = parser->ParseFileInstance( + [this](char* buf, int len) { + return fread(buf, sizeof(char), len, this->fp_.get()); + }, + pull_record_func, lines); + + if (!is_ok) { + LOG(WARNING) << "parser error, filename=" << filename + << ", lines=" << lines; + } + } while (!is_ok); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryByLib() read all file, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_ << ", lines=" << lines; + } +#endif +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByLine(void) { +#ifdef _LINUX + paddle::framework::CustomParser* parser = + global_dlmanager_pool().Load(so_parser_name_, all_slots_info_); + std::string filename; + BufferedLineFileReader line_reader; + line_reader.set_sample_rate(sample_rate_); + BufferedLineFileReader::LineFunc line_func = nullptr; + + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + std::vector record_vec; + platform::Timer timeline; + timeline.Start(); + int offset = 0; + int old_offset = 0; + + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + // get slotrecord object function + auto record_func = [this, &offset, &record_vec, &old_offset]( + std::vector& vec, int num) { + vec.resize(num); + if (offset + num > OBJPOOL_BLOCK_SIZE) { + input_channel_->WriteMove(offset, &record_vec[0]); + SlotRecordPool().get(&record_vec[0], offset); + record_vec.resize(OBJPOOL_BLOCK_SIZE); + offset = 0; + old_offset = 0; + } + for (int i = 0; i < num; ++i) { + auto& ins = record_vec[offset + i]; + ins->reset(); + vec[i] = ins; + } + offset = offset + num; + }; + + line_func = [this, &parser, &record_vec, &offset, &filename, &record_func, + &old_offset](const std::string& line) { + old_offset = offset; + if (!parser->ParseOneInstance(line, record_func)) { + offset = old_offset; + LOG(WARNING) << "read file:[" << filename << "] item error, line:[" + << line << "]"; + return false; + } + if (offset >= OBJPOOL_BLOCK_SIZE) { + input_channel_->Write(std::move(record_vec)); + record_vec.clear(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + offset = 0; + } + return true; + }; + + int lines = 0; + + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + lines = line_reader.read_file(this->fp_.get(), line_func, lines); + } while (line_reader.is_error()); + + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (offset < OBJPOOL_BLOCK_SIZE) { + SlotRecordPool().put(&record_vec[offset], + (OBJPOOL_BLOCK_SIZE - offset)); + } + } else { + SlotRecordPool().put(&record_vec); + } + record_vec.clear(); + record_vec.shrink_to_fit(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemoryByLib() read all lines, file=" << filename + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_ 
<< ", lines=" << lines + << ", sample lines=" << line_reader.get_sample_line() + << ", filesize=" << line_reader.file_size() / 1024.0 / 1024.0 + << "MB"; + } + + VLOG(3) << "LoadIntoMemoryByLib() end, thread_id=" << thread_id_ + << ", total size: " << line_reader.file_size(); +#endif +} + +void SlotRecordInMemoryDataFeed::LoadIntoMemoryByCommand(void) { +#ifdef _LINUX + std::string filename; + BufferedLineFileReader line_reader; + line_reader.set_sample_rate(sample_rate_); + + while (this->PickOneFile(&filename)) { + VLOG(3) << "PickOneFile, filename=" << filename + << ", thread_id=" << thread_id_; + int lines = 0; + std::vector record_vec; + platform::Timer timeline; + timeline.Start(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + int offset = 0; + + do { + int err_no = 0; + this->fp_ = fs_open_read(filename, &err_no, this->pipe_command_); + CHECK(this->fp_ != nullptr); + __fsetlocking(&*(this->fp_), FSETLOCKING_BYCALLER); + + lines = line_reader.read_file( + this->fp_.get(), + [this, &record_vec, &offset, &filename](const std::string& line) { + if (ParseOneInstance(line, &record_vec[offset])) { + ++offset; + } else { + LOG(WARNING) << "read file:[" << filename + << "] item error, line:[" << line << "]"; + return false; + } + if (offset >= OBJPOOL_BLOCK_SIZE) { + input_channel_->Write(std::move(record_vec)); + record_vec.clear(); + SlotRecordPool().get(&record_vec, OBJPOOL_BLOCK_SIZE); + offset = 0; + } + return true; + }, + lines); + } while (line_reader.is_error()); + if (offset > 0) { + input_channel_->WriteMove(offset, &record_vec[0]); + if (offset < OBJPOOL_BLOCK_SIZE) { + SlotRecordPool().put(&record_vec[offset], + (OBJPOOL_BLOCK_SIZE - offset)); + } + } else { + SlotRecordPool().put(&record_vec); + } + record_vec.clear(); + record_vec.shrink_to_fit(); + timeline.Pause(); + VLOG(3) << "LoadIntoMemory() read all lines, file=" << filename + << ", lines=" << lines + << ", sample lines=" << line_reader.get_sample_line() + << ", cost time=" << timeline.ElapsedSec() + << " seconds, thread_id=" << thread_id_; + } + VLOG(3) << "LoadIntoMemory() end, thread_id=" << thread_id_ + << ", total size: " << line_reader.file_size(); +#endif +} + +static void parser_log_key(const std::string& log_key, uint64_t* search_id, + uint32_t* cmatch, uint32_t* rank) { + std::string searchid_str = log_key.substr(16, 16); + *search_id = static_cast(strtoull(searchid_str.c_str(), NULL, 16)); + std::string cmatch_str = log_key.substr(11, 3); + *cmatch = static_cast(strtoul(cmatch_str.c_str(), NULL, 16)); + std::string rank_str = log_key.substr(14, 2); + *rank = static_cast(strtoul(rank_str.c_str(), NULL, 16)); +} + +bool SlotRecordInMemoryDataFeed::ParseOneInstance(const std::string& line, + SlotRecord* ins) { + SlotRecord& rec = (*ins); + // parse line + const char* str = line.c_str(); + char* endptr = const_cast(str); + int pos = 0; + + thread_local std::vector> slot_float_feasigns; + thread_local std::vector> slot_uint64_feasigns; + slot_float_feasigns.resize(float_use_slot_size_); + slot_uint64_feasigns.resize(uint64_use_slot_size_); + + if (parse_ins_id_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + rec->ins_id_ = std::string(str + pos, len); + pos += len + 1; + } + if (parse_logkey_) { + int num = strtol(&str[pos], &endptr, 10); + CHECK(num == 1); // NOLINT + pos = endptr - str + 1; + size_t len = 0; + while (str[pos + len] != ' ') { + ++len; + } + // parse_logkey + 
std::string log_key = std::string(str + pos, len); + uint64_t search_id; + uint32_t cmatch; + uint32_t rank; + parser_log_key(log_key, &search_id, &cmatch, &rank); + + rec->ins_id_ = log_key; + rec->search_id = search_id; + rec->cmatch = cmatch; + rec->rank = rank; + pos += len + 1; + } + + int float_total_slot_num = 0; + int uint64_total_slot_num = 0; + + for (size_t i = 0; i < all_slots_info_.size(); ++i) { + auto& info = all_slots_info_[i]; + int num = strtol(&str[pos], &endptr, 10); + PADDLE_ENFORCE(num, + "The number of ids can not be zero, you need padding " + "it in data generator; or if there is something wrong with " + "the data, please check if the data contains unresolvable " + "characters.\nplease check this error line: %s", + str); + if (info.used_idx != -1) { + if (info.type[0] == 'f') { // float + auto& slot_fea = slot_float_feasigns[info.slot_value_idx]; + slot_fea.clear(); + for (int j = 0; j < num; ++j) { + float feasign = strtof(endptr, &endptr); + if (fabs(feasign) < 1e-6 && !used_slots_info_[info.used_idx].dense) { + continue; + } + slot_fea.push_back(feasign); + ++float_total_slot_num; + } + } else if (info.type[0] == 'u') { // uint64 + auto& slot_fea = slot_uint64_feasigns[info.slot_value_idx]; + slot_fea.clear(); + for (int j = 0; j < num; ++j) { + uint64_t feasign = + static_cast(strtoull(endptr, &endptr, 10)); + if (feasign == 0 && !used_slots_info_[info.used_idx].dense) { + continue; + } + slot_fea.push_back(feasign); + ++uint64_total_slot_num; + } + } + pos = endptr - str; + } else { + for (int j = 0; j <= num; ++j) { + // pos = line.find_first_of(' ', pos + 1); + while (line[pos + 1] != ' ') { + pos++; + } + } + } + } + rec->slot_float_feasigns_.add_slot_feasigns(slot_float_feasigns, + float_total_slot_num); + rec->slot_uint64_feasigns_.add_slot_feasigns(slot_uint64_feasigns, + uint64_total_slot_num); + + return (uint64_total_slot_num > 0); +} + +void SlotRecordInMemoryDataFeed::PutToFeedVec(const SlotRecord* ins_vec, + int num) { + for (int j = 0; j < use_slot_size_; ++j) { + auto& feed = feed_vec_[j]; + if (feed == nullptr) { + continue; + } + + auto& slot_offset = offset_[j]; + slot_offset.clear(); + slot_offset.reserve(num + 1); + slot_offset.push_back(0); + + int total_instance = 0; + auto& info = used_slots_info_[j]; + // fill slot value with default value 0 + if (info.type[0] == 'f') { // float + auto& batch_fea = batch_float_feasigns_[j]; + batch_fea.clear(); + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + size_t fea_num = 0; + float* slot_values = + r->slot_float_feasigns_.get_values(info.slot_value_idx, &fea_num); + batch_fea.resize(total_instance + fea_num); + memcpy(&batch_fea[total_instance], slot_values, + sizeof(float) * fea_num); + total_instance += fea_num; + slot_offset.push_back(total_instance); + } + + float* feasign = batch_fea.data(); + float* tensor_ptr = + feed->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(float)); + + } else if (info.type[0] == 'u') { // uint64 + auto& batch_fea = batch_uint64_feasigns_[j]; + batch_fea.clear(); + + for (int i = 0; i < num; ++i) { + auto r = ins_vec[i]; + size_t fea_num = 0; + uint64_t* slot_values = + r->slot_uint64_feasigns_.get_values(info.slot_value_idx, &fea_num); + if (fea_num > 0) { + batch_fea.resize(total_instance + fea_num); + memcpy(&batch_fea[total_instance], slot_values, + sizeof(uint64_t) * fea_num); + total_instance += fea_num; + } + if (fea_num == 0) { + batch_fea.resize(total_instance + fea_num); + 
batch_fea[total_instance] = 0; + total_instance += 1; + } + slot_offset.push_back(total_instance); + } + + // no uint64_t type in paddlepaddle + uint64_t* feasign = batch_fea.data(); + int64_t* tensor_ptr = + feed->mutable_data({total_instance, 1}, this->place_); + CopyToFeedTensor(tensor_ptr, feasign, total_instance * sizeof(int64_t)); + } + + if (info.dense) { + if (info.inductive_shape_index != -1) { + info.local_shape[info.inductive_shape_index] = + total_instance / info.total_dims_without_inductive; + } + feed->Resize(framework::make_ddim(info.local_shape)); + } else { + LoD data_lod{slot_offset}; + feed_vec_[j]->set_lod(data_lod); + } + } +} + +void SlotRecordInMemoryDataFeed::ExpandSlotRecord(SlotRecord* rec) { + SlotRecord& ins = (*rec); + if (ins->slot_float_feasigns_.slot_offsets.empty()) { + return; + } + size_t total_value_size = ins->slot_float_feasigns_.slot_values.size(); + if (float_total_dims_size_ == total_value_size) { + return; + } + int float_slot_num = + static_cast(float_total_dims_without_inductives_.size()); + CHECK(float_slot_num == float_use_slot_size_); + std::vector old_values; + std::vector old_offsets; + old_values.swap(ins->slot_float_feasigns_.slot_values); + old_offsets.swap(ins->slot_float_feasigns_.slot_offsets); + + ins->slot_float_feasigns_.slot_values.resize(float_total_dims_size_); + ins->slot_float_feasigns_.slot_offsets.assign(float_slot_num + 1, 0); + + auto& slot_offsets = ins->slot_float_feasigns_.slot_offsets; + auto& slot_values = ins->slot_float_feasigns_.slot_values; + + uint32_t offset = 0; + int num = 0; + uint32_t old_off = 0; + int dim = 0; + + for (int i = 0; i < float_slot_num; ++i) { + dim = float_total_dims_without_inductives_[i]; + old_off = old_offsets[i]; + num = static_cast(old_offsets[i + 1] - old_off); + if (num == 0) { + // fill slot value with default value 0 + for (int k = 0; k < dim; ++k) { + slot_values[k + offset] = 0.0; + } + } else { + if (num == dim) { + memcpy(&slot_values[offset], &old_values[old_off], dim * sizeof(float)); + } else { + // position fea + // record position index need fix values + int pos_idx = static_cast(old_values[old_off]); + for (int k = 0; k < dim; ++k) { + if (k == pos_idx) { + slot_values[k + offset] = 1.0; + } else { + slot_values[k + offset] = 0.0; + } + } + } + } + slot_offsets[i] = offset; + offset += dim; + } + slot_offsets[float_slot_num] = offset; + CHECK(float_total_dims_size_ == static_cast(offset)); +} + +bool SlotRecordInMemoryDataFeed::Start() { +#ifdef _LINUX + this->CheckSetFileList(); + if (input_channel_->Size() != 0) { + std::vector data; + input_channel_->Read(data); + } +#endif + if (batch_offsets_.size() > 0) { + VLOG(3) << "batch_size offsets: " << batch_offsets_.size(); + enable_heterps_ = true; + this->offset_index_ = 0; + } + this->finish_start_ = true; + return true; +} + +int SlotRecordInMemoryDataFeed::Next() { +#ifdef _LINUX + this->CheckStart(); + + VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + if (offset_index_ >= batch_offsets_.size()) { + VLOG(3) << "offset_index: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size(); + return 0; + } + auto& batch = batch_offsets_[offset_index_++]; + this->batch_size_ = batch.second; + VLOG(3) << "batch_size_=" << this->batch_size_ + << ", thread_id=" << thread_id_; + if (this->batch_size_ != 0) { + PutToFeedVec(&records_[batch.first], this->batch_size_); + } else { + VLOG(3) << "finish reading for heterps, batch size zero, thread_id=" + << thread_id_; + } + 
VLOG(3) << "enable heter next: " << offset_index_ + << " batch_offsets: " << batch_offsets_.size() + << " baych_size: " << this->batch_size_; + + return this->batch_size_; +#else + return 0; +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 198bc51463af35..a4100e66e72850 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -39,8 +39,14 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/timer.h" #include "paddle/fluid/string/string_helper.h" +DECLARE_int32(record_pool_max_size); +DECLARE_int32(slotpool_thread_num); +DECLARE_bool(enable_slotpool_wait_release); +DECLARE_bool(enable_slotrecord_reset_shrink); + namespace paddle { namespace framework { class DataFeedDesc; @@ -69,6 +75,50 @@ namespace framework { // while (reader->Next()) { // // trainer do something // } + +template +struct SlotValues { + std::vector slot_values; + std::vector slot_offsets; + + void add_values(const T* values, uint32_t num) { + if (slot_offsets.empty()) { + slot_offsets.push_back(0); + } + if (num > 0) { + slot_values.insert(slot_values.end(), values, values + num); + } + slot_offsets.push_back(static_cast(slot_values.size())); + } + T* get_values(int idx, size_t* size) { + uint32_t& offset = slot_offsets[idx]; + (*size) = slot_offsets[idx + 1] - offset; + return &slot_values[offset]; + } + void add_slot_feasigns(const std::vector>& slot_feasigns, + uint32_t fea_num) { + slot_values.reserve(fea_num); + int slot_num = static_cast(slot_feasigns.size()); + slot_offsets.resize(slot_num + 1); + for (int i = 0; i < slot_num; ++i) { + auto& slot_val = slot_feasigns[i]; + slot_offsets[i] = static_cast(slot_values.size()); + uint32_t num = static_cast(slot_val.size()); + if (num > 0) { + slot_values.insert(slot_values.end(), slot_val.begin(), slot_val.end()); + } + } + slot_offsets[slot_num] = slot_values.size(); + } + void clear(bool shrink) { + slot_offsets.clear(); + slot_values.clear(); + if (shrink) { + slot_values.shrink_to_fit(); + slot_offsets.shrink_to_fit(); + } + } +}; union FeatureFeasign { uint64_t uint64_feasign_; float float_feasign_; @@ -97,6 +147,38 @@ struct FeatureItem { uint16_t slot_; }; +struct AllSlotInfo { + std::string slot; + std::string type; + int used_idx; + int slot_value_idx; +}; +struct UsedSlotInfo { + int idx; + int slot_value_idx; + std::string slot; + std::string type; + bool dense; + std::vector local_shape; + int total_dims_without_inductive; + int inductive_shape_index; +}; +struct SlotRecordObject { + uint64_t search_id; + uint32_t rank; + uint32_t cmatch; + std::string ins_id_; + SlotValues slot_uint64_feasigns_; + SlotValues slot_float_feasigns_; + + ~SlotRecordObject() { clear(true); } + void reset(void) { clear(FLAGS_enable_slotrecord_reset_shrink); } + void clear(bool shrink) { + slot_uint64_feasigns_.clear(shrink); + slot_float_feasigns_.clear(shrink); + } +}; +using SlotRecord = SlotRecordObject*; // sizeof Record is much less than std::vector struct Record { std::vector uint64_feasigns_; @@ -108,6 +190,179 @@ struct Record { uint32_t cmatch; }; +inline SlotRecord make_slotrecord() { + static const size_t slot_record_byte_size = sizeof(SlotRecordObject); + void* p = malloc(slot_record_byte_size); + new (p) SlotRecordObject; + return reinterpret_cast(p); +} + +inline void 
free_slotrecord(SlotRecordObject* p) { + p->~SlotRecordObject(); + free(p); +} + +template +class SlotObjAllocator { + public: + explicit SlotObjAllocator(std::function deleter) + : free_nodes_(NULL), capacity_(0), deleter_(deleter) {} + ~SlotObjAllocator() { clear(); } + + void clear() { + T* tmp = NULL; + while (free_nodes_ != NULL) { + tmp = reinterpret_cast(reinterpret_cast(free_nodes_)); + free_nodes_ = free_nodes_->next; + deleter_(tmp); + --capacity_; + } + CHECK_EQ(capacity_, static_cast(0)); + } + T* acquire(void) { + T* x = NULL; + x = reinterpret_cast(reinterpret_cast(free_nodes_)); + free_nodes_ = free_nodes_->next; + --capacity_; + return x; + } + void release(T* x) { + Node* node = reinterpret_cast(reinterpret_cast(x)); + node->next = free_nodes_; + free_nodes_ = node; + ++capacity_; + } + size_t capacity(void) { return capacity_; } + + private: + struct alignas(T) Node { + union { + Node* next; + char data[sizeof(T)]; + }; + }; + Node* free_nodes_; // a list + size_t capacity_; + std::function deleter_ = nullptr; +}; +static const int OBJPOOL_BLOCK_SIZE = 10000; +class SlotObjPool { + public: + SlotObjPool() + : max_capacity_(FLAGS_record_pool_max_size), alloc_(free_slotrecord) { + ins_chan_ = MakeChannel(); + ins_chan_->SetBlockSize(OBJPOOL_BLOCK_SIZE); + for (int i = 0; i < FLAGS_slotpool_thread_num; ++i) { + threads_.push_back(std::thread([this]() { run(); })); + } + disable_pool_ = false; + count_ = 0; + } + ~SlotObjPool() { + ins_chan_->Close(); + for (auto& t : threads_) { + t.join(); + } + } + void disable_pool(bool disable) { disable_pool_ = disable; } + void set_max_capacity(size_t max_capacity) { max_capacity_ = max_capacity; } + void get(std::vector* output, int n) { + output->resize(n); + return get(&(*output)[0], n); + } + void get(SlotRecord* output, int n) { + int size = 0; + mutex_.lock(); + int left = static_cast(alloc_.capacity()); + if (left > 0) { + size = (left >= n) ? 
n : left; + for (int i = 0; i < size; ++i) { + output[i] = alloc_.acquire(); + } + } + mutex_.unlock(); + count_ += n; + if (size == n) { + return; + } + for (int i = size; i < n; ++i) { + output[i] = make_slotrecord(); + } + } + void put(std::vector* input) { + size_t size = input->size(); + if (size == 0) { + return; + } + put(&(*input)[0], size); + input->clear(); + } + void put(SlotRecord* input, size_t size) { + CHECK(ins_chan_->WriteMove(size, input) == size); + } + void run(void) { + std::vector input; + while (ins_chan_->ReadOnce(input, OBJPOOL_BLOCK_SIZE)) { + if (input.empty()) { + continue; + } + // over max capacity + size_t n = input.size(); + count_ -= n; + if (disable_pool_ || n + capacity() > max_capacity_) { + for (auto& t : input) { + free_slotrecord(t); + } + } else { + for (auto& t : input) { + t->reset(); + } + mutex_.lock(); + for (auto& t : input) { + alloc_.release(t); + } + mutex_.unlock(); + } + input.clear(); + } + } + void clear(void) { + platform::Timer timeline; + timeline.Start(); + mutex_.lock(); + alloc_.clear(); + mutex_.unlock(); + // wait release channel data + if (FLAGS_enable_slotpool_wait_release) { + while (!ins_chan_->Empty()) { + sleep(1); + } + } + timeline.Pause(); + VLOG(3) << "clear slot pool data size=" << count_.load() + << ", span=" << timeline.ElapsedSec(); + } + size_t capacity(void) { + mutex_.lock(); + size_t total = alloc_.capacity(); + mutex_.unlock(); + return total; + } + + private: + size_t max_capacity_; + Channel ins_chan_; + std::vector threads_; + std::mutex mutex_; + SlotObjAllocator alloc_; + bool disable_pool_; + std::atomic count_; // NOLINT +}; + +inline SlotObjPool& SlotRecordPool() { + static SlotObjPool pool; + return pool; +} struct PvInstanceObject { std::vector ads; void merge_instance(Record* ins) { ads.push_back(ins); } @@ -129,7 +384,21 @@ class CustomParser { CustomParser() {} virtual ~CustomParser() {} virtual void Init(const std::vector& slots) = 0; + virtual bool Init(const std::vector& slots); virtual void ParseOneInstance(const char* str, Record* instance) = 0; + virtual bool ParseOneInstance( + const std::string& line, + std::function&, int)> + GetInsFunc) { // NOLINT + return true; + } + virtual bool ParseFileInstance( + std::function ReadBuffFunc, + std::function&, int, int)> + PullRecordsFunc, // NOLINT + int& lines) { // NOLINT + return false; + } }; typedef paddle::framework::CustomParser* (*CreateParserObjectFunc)(); @@ -194,6 +463,34 @@ class DLManager { return nullptr; } + paddle::framework::CustomParser* Load(const std::string& name, + const std::vector& conf) { +#ifdef _LINUX + std::lock_guard lock(mutex_); + DLHandle handle; + std::map::iterator it = handle_map_.find(name); + if (it != handle_map_.end()) { + return it->second.parser; + } + handle.module = dlopen(name.c_str(), RTLD_NOW); + if (handle.module == nullptr) { + VLOG(0) << "Create so of " << name << " fail"; + exit(-1); + return nullptr; + } + + CreateParserObjectFunc create_parser_func = + (CreateParserObjectFunc)dlsym(handle.module, "CreateParserObject"); + handle.parser = create_parser_func(); + handle.parser->Init(conf); + handle_map_.insert({name, handle}); + + return handle.parser; +#endif + VLOG(0) << "Not implement in windows"; + return nullptr; + } + paddle::framework::CustomParser* ReLoad(const std::string& name, const std::vector& conf) { Close(name); @@ -415,6 +712,11 @@ class InMemoryDataFeed : public DataFeed { virtual void SetCurrentPhase(int current_phase); virtual void LoadIntoMemory(); virtual void 
LoadIntoMemoryFromSo(); + virtual void SetRecord(T* records) { records_ = records; } + int GetDefaultBatchSize() { return default_batch_size_; } + void AddBatchOffset(const std::pair& offset) { + batch_offsets_.push_back(offset); + } protected: virtual bool ParseOneInstance(T* instance) = 0; @@ -424,6 +726,11 @@ class InMemoryDataFeed : public DataFeed { virtual void PutToFeedVec(const std::vector& ins_vec) = 0; virtual void PutToFeedVec(const T* ins_vec, int num) = 0; + std::vector> batch_float_feasigns_; + std::vector> batch_uint64_feasigns_; + std::vector> offset_; + std::vector visit_; + int thread_id_; int thread_num_; bool parse_ins_id_; @@ -783,11 +1090,7 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { MultiSlotInMemoryDataFeed() {} virtual ~MultiSlotInMemoryDataFeed() {} virtual void Init(const DataFeedDesc& data_feed_desc); - void SetRecord(Record* records) { records_ = records; } - int GetDefaultBatchSize() { return default_batch_size_; } - void AddBatchOffset(const std::pair& offset) { - batch_offsets_.push_back(offset); - } + // void SetRecord(Record* records) { records_ = records; } protected: virtual bool ParseOneInstance(Record* instance); @@ -798,10 +1101,42 @@ class MultiSlotInMemoryDataFeed : public InMemoryDataFeed { virtual void GetMsgFromLogKey(const std::string& log_key, uint64_t* search_id, uint32_t* cmatch, uint32_t* rank); virtual void PutToFeedVec(const Record* ins_vec, int num); - std::vector> batch_float_feasigns_; - std::vector> batch_uint64_feasigns_; - std::vector> offset_; - std::vector visit_; +}; + +class SlotRecordInMemoryDataFeed : public InMemoryDataFeed { + public: + SlotRecordInMemoryDataFeed() {} + virtual ~SlotRecordInMemoryDataFeed() {} + virtual void Init(const DataFeedDesc& data_feed_desc); + virtual void LoadIntoMemory(); + void ExpandSlotRecord(SlotRecord* ins); + + protected: + virtual bool Start(); + virtual int Next(); + virtual bool ParseOneInstance(SlotRecord* instance) { return false; } + virtual bool ParseOneInstanceFromPipe(SlotRecord* instance) { return false; } + // virtual void ParseOneInstanceFromSo(const char* str, T* instance, + // CustomParser* parser) {} + virtual void PutToFeedVec(const std::vector& ins_vec) {} + + virtual void LoadIntoMemoryByCommand(void); + virtual void LoadIntoMemoryByLib(void); + virtual void LoadIntoMemoryByLine(void); + virtual void LoadIntoMemoryByFile(void); + virtual void SetInputChannel(void* channel) { + input_channel_ = static_cast*>(channel); + } + bool ParseOneInstance(const std::string& line, SlotRecord* rec); + virtual void PutToFeedVec(const SlotRecord* ins_vec, int num); + float sample_rate_ = 1.0f; + int use_slot_size_ = 0; + int float_use_slot_size_ = 0; + int uint64_use_slot_size_ = 0; + std::vector all_slots_info_; + std::vector used_slots_info_; + size_t float_total_dims_size_ = 0; + std::vector float_total_dims_without_inductives_; }; class PaddleBoxDataFeed : public MultiSlotInMemoryDataFeed { diff --git a/paddle/fluid/framework/data_feed_factory.cc b/paddle/fluid/framework/data_feed_factory.cc index ec1b8ec773fa64..e46e4aeb0124c2 100644 --- a/paddle/fluid/framework/data_feed_factory.cc +++ b/paddle/fluid/framework/data_feed_factory.cc @@ -58,8 +58,8 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( std::string data_feed_class) { if (g_data_feed_map.count(data_feed_class) < 1) { LOG(WARNING) << "Your DataFeed " << data_feed_class - << "is not supported currently"; - LOG(WARNING) << "Supported DataFeed: " << DataFeedTypeList(); + << " is not supported currently"; + 
LOG(WARNING) << " Supported DataFeed: " << DataFeedTypeList(); exit(-1); } return g_data_feed_map[data_feed_class](); @@ -68,6 +68,7 @@ std::shared_ptr DataFeedFactory::CreateDataFeed( REGISTER_DATAFEED_CLASS(MultiSlotDataFeed); REGISTER_DATAFEED_CLASS(MultiSlotInMemoryDataFeed); REGISTER_DATAFEED_CLASS(PaddleBoxDataFeed); +REGISTER_DATAFEED_CLASS(SlotRecordInMemoryDataFeed); #if (defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)) && !defined(_WIN32) REGISTER_DATAFEED_CLASS(MultiSlotFileInstantDataFeed); #endif diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc index 08c42a93d1fcbf..2a071665b263c6 100644 --- a/paddle/fluid/framework/data_set.cc +++ b/paddle/fluid/framework/data_set.cc @@ -351,10 +351,8 @@ static int compute_thread_batch_nccl( return thread_avg_batch_num; } -template -void DatasetImpl::SetHeterPs(bool enable_heterps) { +void MultiSlotDataset::PrepareTrain() { #ifdef PADDLE_WITH_GLOO - enable_heterps_ = enable_heterps; if (enable_heterps_) { if (input_records_.size() == 0 && input_channel_ != nullptr && input_channel_->Size() != 0) { @@ -541,22 +539,21 @@ void DatasetImpl::LocalShuffle() { << timeline.ElapsedSec() << " seconds"; } -template -void DatasetImpl::GlobalShuffle(int thread_num) { +void MultiSlotDataset::GlobalShuffle(int thread_num) { #ifdef PADDLE_WITH_PSLIB - VLOG(3) << "DatasetImpl::GlobalShuffle() begin"; + VLOG(3) << "MultiSlotDataset::GlobalShuffle() begin"; platform::Timer timeline; timeline.Start(); auto fleet_ptr = FleetWrapper::GetInstance(); if (!input_channel_ || input_channel_->Size() == 0) { - VLOG(3) << "DatasetImpl::GlobalShuffle() end, no data to shuffle"; + VLOG(3) << "MultiSlotDataset::GlobalShuffle() end, no data to shuffle"; return; } // local shuffle input_channel_->Close(); - std::vector data; + std::vector data; input_channel_->ReadAll(data); std::shuffle(data.begin(), data.end(), fleet_ptr->LocalRandomEngine()); input_channel_->Open(); @@ -566,10 +563,10 @@ void DatasetImpl::GlobalShuffle(int thread_num) { input_channel_->Close(); input_channel_->SetBlockSize(fleet_send_batch_size_); - VLOG(3) << "DatasetImpl::GlobalShuffle() input_channel_ size " + VLOG(3) << "MultiSlotDataset::GlobalShuffle() input_channel_ size " << input_channel_->Size(); - auto get_client_id = [this, fleet_ptr](const T& data) -> size_t { + auto get_client_id = [this, fleet_ptr](const Record& data) -> size_t { if (!this->merge_by_insid_) { return fleet_ptr->LocalRandomEngine()() % this->trainer_num_; } else { @@ -580,7 +577,7 @@ void DatasetImpl::GlobalShuffle(int thread_num) { auto global_shuffle_func = [this, get_client_id]() { auto fleet_ptr = FleetWrapper::GetInstance(); - std::vector data; + std::vector data; while (this->input_channel_->Read(data)) { std::vector ars(this->trainer_num_); for (auto& t : data) { @@ -835,9 +832,6 @@ void DatasetImpl::CreateReaders() { channel_idx = 0; } } - if (enable_heterps_) { - SetHeterPs(true); - } VLOG(3) << "readers size: " << readers_.size(); } @@ -923,9 +917,8 @@ int64_t DatasetImpl::GetShuffleDataSize() { return sum; } -template -int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, - const std::string& msg) { +int MultiSlotDataset::ReceiveFromClient(int msg_type, int client_id, + const std::string& msg) { #ifdef _LINUX VLOG(3) << "ReceiveFromClient msg_type=" << msg_type << ", client_id=" << client_id << ", msg length=" << msg.length(); @@ -937,9 +930,9 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, if (ar.Cursor() == ar.Finish()) { return 0; } - 
std::vector data; + std::vector data; while (ar.Cursor() < ar.Finish()) { - data.push_back(ar.Get()); + data.push_back(ar.Get()); } CHECK(ar.Cursor() == ar.Finish()); @@ -966,6 +959,20 @@ int DatasetImpl::ReceiveFromClient(int msg_type, int client_id, // explicit instantiation template class DatasetImpl; +void MultiSlotDataset::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; + PrepareTrain(); +} + void MultiSlotDataset::PostprocessInstance() { // divide pv instance, and merge to input_channel_ if (enable_pv_merge_) { @@ -1503,5 +1510,154 @@ void MultiSlotDataset::SlotsShuffle( << ", cost time=" << timeline.ElapsedSec() << " seconds"; } +template class DatasetImpl; +void SlotRecordDataset::CreateChannel() { + if (input_channel_ == nullptr) { + input_channel_ = paddle::framework::MakeChannel(); + } +} +void SlotRecordDataset::CreateReaders() { + VLOG(3) << "Calling CreateReaders()"; + VLOG(3) << "thread num in Dataset: " << thread_num_; + VLOG(3) << "Filelist size in Dataset: " << filelist_.size(); + VLOG(3) << "channel num in Dataset: " << channel_num_; + CHECK(thread_num_ > 0) << "thread num should > 0"; + CHECK(channel_num_ > 0) << "channel num should > 0"; + CHECK(channel_num_ <= thread_num_) << "channel num should <= thread num"; + VLOG(3) << "readers size: " << readers_.size(); + if (readers_.size() != 0) { + VLOG(3) << "readers_.size() = " << readers_.size() + << ", will not create again"; + return; + } + VLOG(3) << "data feed class name: " << data_feed_desc_.name(); + for (int i = 0; i < thread_num_; ++i) { + readers_.push_back(DataFeedFactory::CreateDataFeed(data_feed_desc_.name())); + readers_[i]->Init(data_feed_desc_); + readers_[i]->SetThreadId(i); + readers_[i]->SetThreadNum(thread_num_); + readers_[i]->SetFileListMutex(&mutex_for_pick_file_); + readers_[i]->SetFileListIndex(&file_idx_); + readers_[i]->SetFeaNumMutex(&mutex_for_fea_num_); + readers_[i]->SetFeaNum(&total_fea_num_); + readers_[i]->SetFileList(filelist_); + readers_[i]->SetParseInsId(parse_ins_id_); + readers_[i]->SetParseContent(parse_content_); + readers_[i]->SetParseLogKey(parse_logkey_); + readers_[i]->SetEnablePvMerge(enable_pv_merge_); + readers_[i]->SetCurrentPhase(current_phase_); + if (input_channel_ != nullptr) { + readers_[i]->SetInputChannel(input_channel_.get()); + } + } + VLOG(3) << "readers size: " << readers_.size(); +} + +void SlotRecordDataset::ReleaseMemory() { + VLOG(3) << "SlotRecordDataset::ReleaseMemory() begin"; + platform::Timer timeline; + timeline.Start(); + + if (input_channel_) { + input_channel_->Clear(); + input_channel_ = nullptr; + } + if (enable_heterps_) { + VLOG(3) << "put pool records size: " << input_records_.size(); + SlotRecordPool().put(&input_records_); + input_records_.clear(); + input_records_.shrink_to_fit(); + VLOG(3) << "release heterps input records records size: " + << input_records_.size(); + } + + readers_.clear(); + readers_.shrink_to_fit(); + + std::vector>().swap(readers_); + + VLOG(3) << "SlotRecordDataset::ReleaseMemory() end"; + VLOG(3) << "total_feasign_num_(" << STAT_GET(STAT_total_feasign_num_in_mem) + << ") - current_fea_num_(" << total_fea_num_ << ") = (" + << 
STAT_GET(STAT_total_feasign_num_in_mem) - total_fea_num_ << ")" + << " object pool size=" << SlotRecordPool().capacity(); // For Debug + STAT_SUB(STAT_total_feasign_num_in_mem, total_fea_num_); +} +void SlotRecordDataset::GlobalShuffle(int thread_num) { + // TODO(yaoxuefeng) + return; +} + +void SlotRecordDataset::DynamicAdjustChannelNum(int channel_num, + bool discard_remaining_ins) { + if (channel_num_ == channel_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustChannelNum channel_num_=" + << channel_num_ << ", channel_num_=channel_num, no need to adjust"; + return; + } + VLOG(3) << "adjust channel num from " << channel_num_ << " to " + << channel_num; + channel_num_ = channel_num; + + if (static_cast(input_channel_->Size()) >= channel_num) { + input_channel_->SetBlockSize(input_channel_->Size() / channel_num + + (discard_remaining_ins ? 0 : 1)); + } + + VLOG(3) << "adjust channel num done"; +} + +void SlotRecordDataset::PrepareTrain() { +#ifdef PADDLE_WITH_GLOO + if (enable_heterps_) { + if (input_records_.size() == 0 && input_channel_ != nullptr && + input_channel_->Size() != 0) { + input_channel_->ReadAll(input_records_); + VLOG(3) << "read from channel to records with records size: " + << input_records_.size(); + } + VLOG(3) << "input records size: " << input_records_.size(); + int64_t total_ins_num = input_records_.size(); + std::vector> offset; + int default_batch_size = + reinterpret_cast(readers_[0].get()) + ->GetDefaultBatchSize(); + VLOG(3) << "thread_num: " << thread_num_ + << " memory size: " << total_ins_num + << " default batch_size: " << default_batch_size; + compute_thread_batch_nccl(thread_num_, total_ins_num, default_batch_size, + &offset); + VLOG(3) << "offset size: " << offset.size(); + for (int i = 0; i < thread_num_; i++) { + reinterpret_cast(readers_[i].get()) + ->SetRecord(&input_records_[0]); + } + for (size_t i = 0; i < offset.size(); i++) { + reinterpret_cast( + readers_[i % thread_num_].get()) + ->AddBatchOffset(offset[i]); + } + } +#else + PADDLE_THROW(platform::errors::Unavailable( + "dataset set heterps need compile with GLOO")); +#endif + return; +} + +void SlotRecordDataset::DynamicAdjustReadersNum(int thread_num) { + if (thread_num_ == thread_num) { + VLOG(3) << "DatasetImpl::DynamicAdjustReadersNum thread_num_=" + << thread_num_ << ", thread_num_=thread_num, no need to adjust"; + return; + } + VLOG(3) << "adjust readers num from " << thread_num_ << " to " << thread_num; + thread_num_ = thread_num; + std::vector>().swap(readers_); + CreateReaders(); + VLOG(3) << "adjust readers num done"; + PrepareTrain(); +} + } // end namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h index f3ee96fab8297f..981fb694e0fec9 100644 --- a/paddle/fluid/framework/data_set.h +++ b/paddle/fluid/framework/data_set.h @@ -149,7 +149,6 @@ class Dataset { virtual void DynamicAdjustReadersNum(int thread_num) = 0; // set fleet send sleep seconds virtual void SetFleetSendSleepSeconds(int seconds) = 0; - virtual void SetHeterPs(bool enable_heterps) = 0; protected: virtual int ReceiveFromClient(int msg_type, int client_id, @@ -207,7 +206,7 @@ class DatasetImpl : public Dataset { virtual void WaitPreLoadDone(); virtual void ReleaseMemory(); virtual void LocalShuffle(); - virtual void GlobalShuffle(int thread_num = -1); + virtual void GlobalShuffle(int thread_num = -1) {} virtual void SlotsShuffle(const std::set& slots_to_replace) {} virtual const std::vector& GetSlotsOriginalData() { return 
slots_shuffle_original_data_; @@ -233,7 +232,11 @@ class DatasetImpl : public Dataset { bool discard_remaining_ins = false); virtual void DynamicAdjustReadersNum(int thread_num); virtual void SetFleetSendSleepSeconds(int seconds); - virtual void SetHeterPs(bool enable_heterps); + /* for enable_heterps_ + virtual void EnableHeterps(bool enable_heterps) { + enable_heterps_ = enable_heterps; + } + */ std::vector>& GetMultiOutputChannel() { return multi_output_channel_; @@ -251,7 +254,10 @@ class DatasetImpl : public Dataset { protected: virtual int ReceiveFromClient(int msg_type, int client_id, - const std::string& msg); + const std::string& msg) { + // TODO(yaoxuefeng) for SlotRecordDataset + return -1; + } std::vector> readers_; std::vector> preload_readers_; paddle::framework::Channel input_channel_; @@ -327,6 +333,32 @@ class MultiSlotDataset : public DatasetImpl { const std::unordered_set& slots_to_replace, std::vector* result); virtual ~MultiSlotDataset() {} + virtual void GlobalShuffle(int thread_num = -1); + virtual void DynamicAdjustReadersNum(int thread_num); + virtual void PrepareTrain(); + + protected: + virtual int ReceiveFromClient(int msg_type, int client_id, + const std::string& msg); +}; +class SlotRecordDataset : public DatasetImpl { + public: + SlotRecordDataset() { SlotRecordPool(); } + virtual ~SlotRecordDataset() {} + // create input channel + virtual void CreateChannel(); + // create readers + virtual void CreateReaders(); + // release memory + virtual void ReleaseMemory(); + virtual void GlobalShuffle(int thread_num = -1); + virtual void DynamicAdjustChannelNum(int channel_num, + bool discard_remaining_ins); + virtual void PrepareTrain(); + virtual void DynamicAdjustReadersNum(int thread_num); + + protected: + bool enable_heterps_ = true; }; } // end namespace framework diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 888687c06ce907..faff846cf2a609 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -65,11 +65,24 @@ struct CastDataType { void TransDataType(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, const Tensor& in, Tensor* out) { + PADDLE_ENFORCE_EQ(in.type(), kernel_type_for_var.data_type_, + platform::errors::InvalidArgument( + "The src dtype(%s) of input tensor and kernel_type(%s) " + "are not conststent.", + DataTypeToString(in.type()), + DataTypeToString(kernel_type_for_var.data_type_))); + auto dst_type = expected_kernel_type.data_type_; + TransDataType(in, dst_type, out); +} + +void TransDataType(const Tensor& in, + const paddle::framework::proto::VarType::Type& type, + Tensor* out) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); out->Resize(in.dims()); - auto src_type = kernel_type_for_var.data_type_; - auto dst_type = expected_kernel_type.data_type_; + auto src_type = in.type(); + auto dst_type = type; auto ctx = pool.Get(in.place()); switch (src_type) { diff --git a/paddle/fluid/framework/data_type_transform.h b/paddle/fluid/framework/data_type_transform.h index 499b133dadb17d..678764430f0ffa 100644 --- a/paddle/fluid/framework/data_type_transform.h +++ b/paddle/fluid/framework/data_type_transform.h @@ -32,6 +32,9 @@ using KernelTypePair = std::pair; void TransDataType(const OpKernelType& kernel_type_for_var, const OpKernelType& expected_kernel_type, const Tensor& in, Tensor* out); +void TransDataType(const Tensor& in, + const 
paddle::framework::proto::VarType::Type& type, + Tensor* out); /** * Transform complex gradient to real data type. diff --git a/paddle/fluid/framework/dataset_factory.cc b/paddle/fluid/framework/dataset_factory.cc index aeaf9611853238..38200927c5586f 100644 --- a/paddle/fluid/framework/dataset_factory.cc +++ b/paddle/fluid/framework/dataset_factory.cc @@ -53,7 +53,7 @@ std::unique_ptr DatasetFactory::CreateDataset( std::string dataset_class) { if (g_dataset_map.count(dataset_class) < 1) { LOG(WARNING) << "Your Dataset " << dataset_class - << "is not supported currently"; + << " is not supported currently"; LOG(WARNING) << "Supported Dataset: " << DatasetTypeList(); exit(-1); } @@ -61,5 +61,6 @@ std::unique_ptr DatasetFactory::CreateDataset( } REGISTER_DATASET_CLASS(MultiSlotDataset); +REGISTER_DATASET_CLASS(SlotRecordDataset); } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 72f7f0e6011c1b..87f77ec2fff3a6 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -140,6 +140,11 @@ set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass fuse_adam_op_pass fuse_sgd_op_pass fuse_momentum_op_pass sync_batch_norm_pass runtime_context_cache_pass graph_to_program_pass fix_op_run_order_pass) + +if (WITH_CINN) + set(IR_PASS_DEPS ${IR_PASS_DEPS} build_cinn_pass) +endif() + if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) set(IR_PASS_DEPS ${IR_PASS_DEPS} fusion_group_pass) endif() diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 0d55882953db35..1bb1ae0ea67558 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -19,8 +19,11 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/graph_printer.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h" -DECLARE_bool(use_mkldnn); DECLARE_bool(convert_all_blocks); +DECLARE_bool(use_mkldnn); +#ifdef PADDLE_WITH_CINN +DECLARE_bool(use_cinn); +#endif namespace paddle { namespace framework { @@ -71,6 +74,13 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Note: This pass is used to check whether the multi_device_graph is right. AppendPass("multi_devices_check_pass"); +#ifdef PADDLE_WITH_CINN + if (FLAGS_use_cinn) { + // Note: This pass is used to enable cinn. + AppendPass("build_cinn_pass"); + } +#endif + SetCollectiveContext(); } @@ -481,6 +491,9 @@ USE_PASS(fuse_momentum_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); USE_PASS(add_reader_dependency_pass); +#ifdef PADDLE_WITH_CINN +USE_PASS(build_cinn_pass); +#endif #ifdef PADDLE_WITH_MKLDNN USE_PASS(mkldnn_placement_pass); #endif diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 0629f1b91504a2..25110fe24f5871 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -143,6 +143,8 @@ struct BuildStrategy { // Turn off inplace addto by default. bool enable_addto_{false}; + bool allow_cuda_graph_capture_{false}; + // FIXME(zcd): is_distribution_ is a temporary field, because in pserver mode, // num_trainers is 1, so the current fields of build_strategy doesn't tell if // it's distributed model. 
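Note on the CINN wiring in the build_strategy.cc hunk above: that file only DECLAREs use_cinn and appends build_cinn_pass at runtime when the flag is set, and both the guard and the USE_PASS(build_cinn_pass) registration are compiled only under PADDLE_WITH_CINN; the matching DEFINE_bool for use_cinn is not part of the hunks shown here and presumably lives elsewhere in the tree. As a hedged illustration of this DEFINE/DECLARE-plus-guarded-AppendPass pattern, the following standalone sketch uses hypothetical names (enable_demo_pass, demo_pass, PassBuilderSketch as a stand-in for the ir::PassBuilder used above) and is not part of the patch:

// Standalone sketch, not part of this patch: a gflags boolean gating an
// optional pass, mirroring how FLAGS_use_cinn gates build_cinn_pass above.
#include <gflags/gflags.h>

#include <iostream>
#include <string>
#include <vector>

// Hypothetical flag; in the patch the real flag is use_cinn and its
// DEFINE_bool lives in another translation unit that DECLAREs it here.
DEFINE_bool(enable_demo_pass, false, "Append the optional demo pass.");

class PassBuilderSketch {  // toy stand-in for ir::PassBuilder
 public:
  void AppendPass(const std::string& name) { passes_.push_back(name); }
  const std::vector<std::string>& passes() const { return passes_; }

 private:
  std::vector<std::string> passes_;
};

int main(int argc, char** argv) {
  gflags::ParseCommandLineFlags(&argc, &argv, true);
  PassBuilderSketch builder;
  builder.AppendPass("multi_devices_check_pass");
  if (FLAGS_enable_demo_pass) {  // runtime switch, analogous to FLAGS_use_cinn
    builder.AppendPass("demo_pass");
  }
  for (const auto& name : builder.passes()) {
    std::cout << name << std::endl;
  }
  return 0;
}

Run with --enable_demo_pass=true and both pass names are printed; without the flag only multi_devices_check_pass is appended, which is the same on/off behavior the FLAGS_use_cinn guard gives build_cinn_pass.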
diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 2256b826ed501f..60b8461668f6fa 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -16,6 +16,8 @@ #include +DECLARE_bool(allreduce_record_one_event); + namespace paddle { namespace framework { namespace details { @@ -31,11 +33,13 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { - WaitInputVarGenerated(place_); + if (!FLAGS_allreduce_record_one_event) { + WaitInputVarGenerated(place_); + } auto run_func = [this]() { op_->Run(*local_exec_scopes_[0], place_); }; - if (is_lock_and_record_event_free_) { + if (is_lock_and_record_event_free_ || FLAGS_allreduce_record_one_event) { run_func(); } else { this->RunAndRecordEvent(run_func); diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 120bdd2bc9f563..75998e4582e2bc 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -130,10 +130,12 @@ FetchResultType FastThreadedSSAGraphExecutor::Run( } } // Wait FetchOps. - ClearFetchOp(graph_, &fetch_ops); + if (!fetch_ops.empty()) { + ClearFetchOp(graph_, &fetch_ops); - for (auto &place : places_) { - fetch_ctxs_.Get(place)->Wait(); + for (auto &place : places_) { + fetch_ctxs_.Get(place)->Wait(); + } } return fetches; diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index 8f45c364476a75..94507140a81d61 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -19,6 +19,8 @@ #include "paddle/fluid/platform/profiler.h" DEFINE_bool(skip_fused_all_reduce_check, false, ""); +DECLARE_bool(allreduce_record_one_event); + namespace paddle { namespace framework { namespace details { @@ -48,11 +50,80 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( num_of_all_reduce_(num_of_all_reduce) {} #endif +FusedAllReduceOpHandle::~FusedAllReduceOpHandle() { +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto destroy_event = [](gpuEvent_t event) { + if (event == nullptr) return; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventDestroy(event)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventDestroy(event)); +#endif + }; + destroy_event(start_event_); + destroy_event(end_event_); +#endif +} + void FusedAllReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); VLOG(4) << this->DebugString(); +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (FLAGS_allreduce_record_one_event && start_event_ == nullptr) { + VLOG(10) << "FLAGS_allreduce_record_one_event=true"; + PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false, + platform::errors::Unimplemented( + "The hierarchical allreduce does not support " + "FLAGS_allreduce_record_one_event=true")); + PADDLE_ENFORCE_EQ(places_.size(), 1, + platform::errors::Unimplemented( + "FLAGS_allreduce_record_one_event=true is only valid " + "when using one GPU device per process.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(places_[0]), true, + platform::errors::Unimplemented( + "FLAGS_allreduce_record_one_event=true is only valid " + "when using GPU device.")); + auto 
create_event = [](gpuEvent_t *event) { + if (*event) return; +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS( + hipEventCreateWithFlags(event, hipEventDisableTiming)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaEventCreateWithFlags(event, cudaEventDisableTiming)); +#endif + }; + create_event(&start_event_); + create_event(&end_event_); + } + + gpuStream_t nccl_stream{nullptr}; + gpuStream_t compute_stream{nullptr}; + + if (FLAGS_allreduce_record_one_event) { + auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, places_[0]); + compute_stream = + platform::DeviceContextPool::Instance().GetByPlace(gpu_place)->stream(); + auto flat_nccl_ctxs = nccl_ctxs_->GetFlatCtx(run_order_); + auto &nccl_ctx = flat_nccl_ctxs->at(gpu_place.device); + nccl_stream = nccl_ctx.stream(); +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + hipStreamWaitEvent(nccl_stream, start_event_, 0)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(start_event_, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(nccl_stream, start_event_, 0)); +#endif + } else { + WaitInputVarGenerated(); + } +#else WaitInputVarGenerated(); +#endif + // The input: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... // The output: grad0(dev0), grad0(dev1), grad1(dev0), grad1(dev1)... auto in_var_handles = DynamicCast(this->Inputs()); @@ -94,6 +165,20 @@ void FusedAllReduceOpHandle::RunImpl() { } else { FusedAllReduceFunc(in_var_handles, out_var_handles); } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (FLAGS_allreduce_record_one_event) { +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + hipStreamWaitEvent(compute_stream, end_event_, 0)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(end_event_, nccl_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaStreamWaitEvent(compute_stream, end_event_, 0)); +#endif + } +#endif } void FusedAllReduceOpHandle::FusedAllReduceFunc( diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index d22dc0a421ac0e..8473700867ce32 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -67,12 +67,19 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle { #endif std::string Name() const override; + ~FusedAllReduceOpHandle(); + protected: void RunImpl() override; private: size_t num_of_all_reduce_; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + gpuEvent_t start_event_{nullptr}; + gpuEvent_t end_event_{nullptr}; +#endif + // Check the dtype of the input void GetDTypeAndNumel( const std::vector> &g_tensor, diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index fcfbfd0557e256..1e3cd4f0aa77c9 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -86,26 +86,35 @@ struct ScaleLossGradFunctor { } }; +std::string ScaleLossGradOpHandle::LossGradName() const { + return static_cast(this->outputs_[0])->name(); +} + void ScaleLossGradOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); - // Doesn't wait any event - std::string var_name = static_cast(this->outputs_[0])->name(); + RunOnVar(local_exec_scopes_[0]->FindVar(LossGradName()), true); +} - 
auto *tensor = - local_exec_scopes_[0]->FindVar(var_name)->GetMutable(); +void ScaleLossGradOpHandle::RunOnVar(Variable *var, bool record_event) { + auto *tensor = var->GetMutable(); tensor->Resize(make_ddim({1})); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, this->dev_ctxes_.at(place_)); - this->RunAndRecordEvent([&] { framework::VisitDataType(out_dtype_, func); }); + if (record_event) { + this->RunAndRecordEvent( + [&] { framework::VisitDataType(out_dtype_, func); }); + } else { + framework::VisitDataType(out_dtype_, func); + } #else ScaleLossGradFunctor func(coeff_, tensor, place_, out_dtype_, nullptr); framework::VisitDataType(out_dtype_, func); #endif } -std::string ScaleLossGradOpHandle::Name() const { return "Scale LossGrad"; } +std::string ScaleLossGradOpHandle::Name() const { return "ScaleLossGrad"; } } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h index 02e5aa88443df1..88fe02a749fe4b 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.h +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.h @@ -46,6 +46,12 @@ struct ScaleLossGradOpHandle : public OpHandleBase { std::string Name() const override; + platform::Place GetPlace() const { return place_; } + + void RunOnVar(Variable *var, bool record_event = false); + + std::string LossGradName() const; + protected: void RunImpl() override; diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index ad47846c59a05b..5d271d06b6922f 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -22,7 +22,9 @@ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/profiler.h" + namespace paddle { namespace framework { namespace details { @@ -49,8 +51,29 @@ ScopeBufferedSSAGraphExecutor::ScopeBufferedSSAGraphExecutor( PrepareLocalExeScopes(); } +static void RunProgramDescs(const ProgramDescs &programs, + const std::vector &local_exec_scopes, + const std::vector &places) { + for (auto &program : programs) { + for (auto &op_desc : program.Block(0).AllOps()) { + for (size_t i = 0; i < local_exec_scopes.size(); ++i) { + auto op = OpRegistry::CreateOp(*op_desc); + op->Run(*local_exec_scopes[i], places[i]); + } + } + } +} + FetchResultType ScopeBufferedSSAGraphExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + strategy_.num_iteration_per_drop_scope_ = + std::numeric_limits::max(); + DropLocalExeScopes(/*need_wait=*/false); + } +#endif + if (drop_scope_counter_ == 0) { platform::RecordEvent e("InitLocalVars"); InitVariables(); @@ -84,7 +107,7 @@ FetchResultType ScopeBufferedSSAGraphExecutor::Run( ++drop_scope_counter_; if (drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_ || DropScopeOrNot()) { - DropLocalExeScopes(); + DropLocalExeScopes(!platform::IsCUDAGraphCapturing()); } if (VLOG_IS_ON(5)) { @@ -128,15 +151,7 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { if 
(graph.Has(details::kStartupProgramDescs)) { auto &program_descs = graph.Get(details::kStartupProgramDescs); - - for (auto &program_desc : program_descs) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes_[i], places_[i]); - } - } - } + RunProgramDescs(program_descs, local_exec_scopes_, places_); } is_initialized_ = true; } @@ -144,23 +159,17 @@ void ScopeBufferedSSAGraphExecutor::InitVariables() { if (graph.Has(details::kProgramDescs)) { auto &program_descs = graph.Get(details::kProgramDescs); - - for (auto &program_desc : program_descs) { - for (auto &op_desc : program_desc.Block(0).AllOps()) { - for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { - auto op = OpRegistry::CreateOp(*op_desc); - op->Run(*local_exec_scopes_[i], places_[i]); - } - } - } + RunProgramDescs(program_descs, local_exec_scopes_, places_); } } -void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes() { +void ScopeBufferedSSAGraphExecutor::DropLocalExeScopes(bool need_wait) { platform::RecordEvent drop_scope_event("DropLocalExeScopes"); drop_scope_counter_ = 0; - for (auto &p : places_) { - platform::DeviceContextPool::Instance().Get(p)->Wait(); + if (need_wait) { + for (auto &p : places_) { + platform::DeviceContextPool::Instance().Get(p)->Wait(); + } } scope_monitor_.ClearHistoryLocalExecScopes(); for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h index aa2b113c960a38..ea5a3c07957bfd 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h @@ -53,7 +53,7 @@ class ScopeBufferedSSAGraphExecutor : public SSAGraphExecutor { FetchResultType Run(const std::vector& fetch_tensors, bool return_merged) override; - void DropLocalExeScopes(); + void DropLocalExeScopes(bool need_wait = true); bool NeedCreateLocalExeScope(); diff --git a/paddle/fluid/framework/device_worker.h b/paddle/fluid/framework/device_worker.h index 810e9a087d1220..11beb84d74914a 100644 --- a/paddle/fluid/framework/device_worker.h +++ b/paddle/fluid/framework/device_worker.h @@ -454,7 +454,6 @@ class PSGPUWorker : public HogwildWorker { virtual void Initialize(const TrainerDesc& desc); virtual void TrainFiles(); virtual void TrainFilesWithProfiler(); - virtual void SetNeedDump(bool need_dump_field); virtual void SetChannelWriter(ChannelObject* queue); virtual void SetWorkerNum(int num) { worker_num_ = num; } virtual void CacheProgram(const ProgramDesc& main_program) { @@ -467,7 +466,6 @@ class PSGPUWorker : public HogwildWorker { protected: void PushGradients(); - void DumpParam(); void CopySparseTable(); void CopyDenseTable(); void CopyDenseVars(); @@ -475,18 +473,12 @@ class PSGPUWorker : public HogwildWorker { private: int mpi_rank_; std::mutex mutex_; - std::vector send_var_list_; int worker_num_; ProgramDesc program_; HeterObjectPool object_pool_; - bool need_dump_param_; - std::vector dump_param_; bool need_to_push_dense_; - bool need_dump_field_; bool dump_slot_; bool need_to_push_sparse_; - std::vector dump_fields_; - ChannelWriter writer_; DownpourWorkerParameter param_; float scale_datanorm_; // just save the value in param_ for easy access diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 
17d15a94c7287b..28eebeb4d9bdc2 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -115,6 +115,7 @@ message BuildStrategy { optional bool enable_auto_fusion = 11 [ default = false ]; optional bool enable_addto = 12 [ default = false ]; optional bool fix_op_run_order = 13 [ default = false ]; + optional bool allow_cuda_graph_capture = 14 [ default = false ]; } message ExecutionStrategy { @@ -132,6 +133,10 @@ message GradientScaleConfig { // Else if sum, the gradient will accumulated among multiple // devices. optional string scale_strategy = 1 [ default = 'avg' ]; + // The avg_loss flag is used to determine the position of average + // If scale_gradient is False, it will avg the loss@Grad before grad merge. + // Otherwise, it will do grad merge firstly, then avg the grad after merging. + optional bool scale_gradient = 2 [ default = false ]; } message AsyncConfig { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index f1f5ba7789ea61..71b53b8a51882f 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -30,14 +30,10 @@ static ::DLDataType GetDLDataTypeCode() { ::DLDataType dtype; if (std::is_same>::value || std::is_same>::value) { - // The current dlpack library version is v0.2, and does not define - // kDLComplex value. But kDLComplex is defined by 5U in v0.4, so we set - // dtype.code to 5U directly here. After the dlpack library version being - // upgraded to v0.4, it should be written as follow. - // dtype.code = kDLComplex; - dtype.code = 5U; + dtype.code = kDLComplex; + } else if (std::is_same::value) { + dtype.code = kDLBfloat; } else if (std::is_same::value || - std::is_same::value || std::is_floating_point::value) { dtype.code = kDLFloat; } else if (std::is_unsigned::value) { @@ -77,47 +73,47 @@ static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { #undef REG_DL_DATA_TYPE } -struct DLContextVisitor : public boost::static_visitor<::DLContext> { - inline ::DLContext operator()(const platform::CPUPlace &place) const { - ::DLContext ctx; - ctx.device_type = kDLCPU; - ctx.device_id = 0; - return ctx; +struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> { + inline ::DLDevice operator()(const platform::CPUPlace &place) const { + ::DLDevice device; + device.device_type = kDLCPU; + device.device_id = 0; + return device; } - inline ::DLContext operator()(const platform::XPUPlace &place) const { + inline ::DLDevice operator()(const platform::XPUPlace &place) const { PADDLE_THROW( platform::errors::Unimplemented("platform::XPUPlace is not supported")); } - inline ::DLContext operator()(const platform::NPUPlace &place) const { + inline ::DLDevice operator()(const platform::NPUPlace &place) const { PADDLE_THROW( platform::errors::Unimplemented("platform::NPUPlace is not supported")); } - inline ::DLContext operator()(const platform::NPUPinnedPlace &place) const { + inline ::DLDevice operator()(const platform::NPUPinnedPlace &place) const { PADDLE_THROW(platform::errors::Unimplemented( "platform::NPUPinnedPlace is not supported")); } - inline ::DLContext operator()(const platform::CUDAPlace &place) const { + inline ::DLDevice operator()(const platform::CUDAPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ::DLContext ctx; - ctx.device_type = kDLGPU; - ctx.device_id = place.device; - return ctx; + ::DLDevice device; + device.device_type = kDLGPU; + device.device_id = 
place.device; + return device; #else PADDLE_THROW(platform::errors::Unavailable( "platform::CUDAPlace is not supported in CPU only version.")); #endif } - inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { + inline ::DLDevice operator()(const platform::CUDAPinnedPlace &place) const { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - ::DLContext ctx; - ctx.device_type = kDLCPUPinned; - ctx.device_id = 0; - return ctx; + ::DLDevice device; + device.device_type = kDLCPUPinned; + device.device_id = 0; + return device; #else PADDLE_THROW(platform::errors::Unavailable( "platform::CUDAPinnedPlace is not supported in CPU only version.")); @@ -130,9 +126,9 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { // init data, data buffer t_.data = const_cast(tensor.data()); - // init ctx, DLContext type with device_type and device_id + // init device, DLDevice type with device_type and device_id auto place = tensor.place(); - t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + t_.device = boost::apply_visitor(internal::DLDeviceVisitor(), place); // init dtype t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); @@ -156,10 +152,8 @@ DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { t_.byte_offset = 0; } -::DLManagedTensor *DLPackTensor::ToCudfCompatibleDLManagedTensor() { - // init shape, tensor dims - // for DLManagedTensor shape need to be compatible with ndim - // refer to cupy and cudf, we new int64[ndim] +::DLManagedTensor *DLPackTensor::ToDLManagedTensor() { + // init shape auto shape = new int64_t[t_.ndim]; using DimType = decltype(t_.ndim); // int for (DimType i = 0; i < t_.ndim; ++i) { @@ -167,19 +161,15 @@ ::DLManagedTensor *DLPackTensor::ToCudfCompatibleDLManagedTensor() { } t_.shape = shape; - // init strides, nullptr means the tensor is compact - // refer to cupy and cudf, the compact tensor first dim's strides need to be 1 - // and second dim's strides need to be length of rows of cudf - // cudf now only support dim=2 - PADDLE_ENFORCE_LE(t_.ndim, 2, platform::errors::InvalidArgument( - "cudf now only supports dimension is 2, " - "but received dimension is %d.", - t_.ndim)); - - if (t_.ndim > 1) - t_.strides = new int64_t[2]{1, t_.shape[1]}; - else - t_.strides = new int64_t[1]{1}; + // init strides + auto strides = new int64_t[t_.ndim]; + for (DimType i = 0; i < t_.ndim; ++i) { + strides[i] = 1; + } + for (DimType i = t_.ndim - 2; i >= 0; --i) { + strides[i] = t_.shape[i + 1] * strides[i + 1]; + } + t_.strides = strides; auto tensor = new DLManagedTensor; tensor->dl_tensor = t_; diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h index e342523718b34b..03ed8884925ce4 100644 --- a/paddle/fluid/framework/dlpack_tensor.h +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -36,7 +36,7 @@ class DLPackTensor { inline operator ::DLTensor&() { return t_; } - ::DLManagedTensor* ToCudfCompatibleDLManagedTensor(); + ::DLManagedTensor* ToDLManagedTensor(); private: ::DLTensor t_; diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 8265d105accae0..4e2d7bb979b617 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -30,7 +30,11 @@ template constexpr uint8_t GetDLDataTypeCode() { if (std::is_same>::value || std::is_same>::value) { - return static_cast(5); + return static_cast(kDLComplex); + } + + if (std::is_same::value) { + return static_cast(kDLBfloat); 
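The new ToDLManagedTensor above drops the cuDF-specific stride handling and instead fills in plain contiguous row-major strides derived from the shape. A minimal standalone sketch of that computation (the helper name RowMajorStrides is illustrative, not part of the patch); for a shape of {6, 7} it yields {7, 1}, which is exactly what the updated test below asserts.

#include <cstdint>
#include <vector>

// Contiguous row-major strides derived from a shape, mirroring the loop added
// in DLPackTensor::ToDLManagedTensor: the innermost stride is 1 and each outer
// stride is the product of all inner extents.
std::vector<int64_t> RowMajorStrides(const std::vector<int64_t>& shape) {
  std::vector<int64_t> strides(shape.size(), 1);
  for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
    strides[i] = shape[i + 1] * strides[i + 1];
  }
  return strides;
}
// RowMajorStrides({6, 7}) == {7, 1}; RowMajorStrides({2, 3, 4}) == {12, 4, 1}.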
} return std::is_same::value || @@ -55,15 +59,15 @@ void TestMain(const platform::Place &place, uint16_t lanes) { CHECK_EQ(p, dl_tensor.data); if (platform::is_cpu_place(place)) { - CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); - CHECK_EQ(0, dl_tensor.ctx.device_id); + CHECK_EQ(kDLCPU, dl_tensor.device.device_type); + CHECK_EQ(0, dl_tensor.device.device_id); } else if (platform::is_gpu_place(place)) { - CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(kDLGPU, dl_tensor.device.device_type); CHECK_EQ(BOOST_GET_CONST(platform::CUDAPlace, place).device, - dl_tensor.ctx.device_id); + dl_tensor.device.device_id); } else if (platform::is_cuda_pinned_place(place)) { - CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); - CHECK_EQ(0, dl_tensor.ctx.device_id); + CHECK_EQ(kDLCPUPinned, dl_tensor.device.device_type); + CHECK_EQ(0, dl_tensor.device.device_id); } else { CHECK_EQ(false, true); } @@ -83,8 +87,7 @@ void TestMain(const platform::Place &place, uint16_t lanes) { } template -void TestToCudfCompatibleDLManagedTensor(const platform::Place &place, - uint16_t lanes) { +void TestToDLManagedTensor(const platform::Place &place, uint16_t lanes) { DDim dims{6, 7}; Tensor tensor; tensor.Resize(dims); @@ -92,8 +95,7 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place, DLPackTensor dlpack_tensor(tensor, lanes); - ::DLManagedTensor *dl_managed_tensor = - dlpack_tensor.ToCudfCompatibleDLManagedTensor(); + ::DLManagedTensor *dl_managed_tensor = dlpack_tensor.ToDLManagedTensor(); CHECK_EQ(dl_managed_tensor->manager_ctx == nullptr, true); @@ -101,7 +103,8 @@ void TestToCudfCompatibleDLManagedTensor(const platform::Place &place, CHECK_EQ(dims[i], dl_managed_tensor->dl_tensor.shape[i]); } - CHECK_EQ(dl_managed_tensor->dl_tensor.strides[0] == 1, true); + CHECK_EQ(dl_managed_tensor->dl_tensor.strides[0] == 7, true); + CHECK_EQ(dl_managed_tensor->dl_tensor.strides[1] == 1, true); dl_managed_tensor->deleter(dl_managed_tensor); } @@ -122,7 +125,7 @@ void TestMainLoop() { for (auto &p : places) { for (auto &l : lanes) { TestMain(p, l); - TestToCudfCompatibleDLManagedTensor(p, l); + TestToDLManagedTensor(p, l); } } } diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index de007c128d7543..5f681ec7ea241f 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -102,14 +102,18 @@ void Executor::CreateVariables(const ProgramDesc& pdesc, Scope* scope, if (var->Persistable()) { auto* ptr = const_cast(ancestor_scope)->Var(var->Name()); + + VLOG(3) << "Initialize Variable " << var->Name(); InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() - << " global, which pointer is " << ptr; + << " global, which pointer is " << ptr << " type is " + << static_cast(var->GetType()); } else { auto* ptr = scope->Var(var->Name()); InitializeVariable(ptr, var->GetType()); VLOG(3) << "Create Variable " << var->Name() - << " locally, which pointer is " << ptr; + << " locally, which pointer is " << ptr << "Variable Type " + << static_cast(var->GetType()); } } } else { diff --git a/paddle/fluid/framework/executor_gc_helper.cc b/paddle/fluid/framework/executor_gc_helper.cc index 43eb1ce8c77f89..8c64d65ff4be66 100644 --- a/paddle/fluid/framework/executor_gc_helper.cc +++ b/paddle/fluid/framework/executor_gc_helper.cc @@ -125,6 +125,7 @@ void DeleteUnusedTensors(const Scope &scope, for (auto &t : *lod_tensor_arr) { garbages.emplace_back(t.MoveMemoryHolder()); } + } else if (var->IsType()) { } else { 
PADDLE_THROW(platform::errors::Unimplemented( "Type %s of variable %s is not supported eager deletion.", diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3bd85b2b24b97b..2eac65c90c02fa 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include +#include #include "glog/logging.h" namespace paddle { @@ -35,9 +36,24 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, feed_inputs.resize(index + 1); } // shared data with input tensor - feed_inputs[index].ShareDataWith(input); + auto& val = BOOST_GET(LoDTensor, feed_inputs[index]); + val.ShareDataWith(input); // set lod - feed_inputs[index].set_lod(input.lod()); + val.set_lod(input.lod()); +} + +void SetFeedVariable(Scope* scope, const Strings& input, + const std::string& var_name, size_t index) { + // If var_name Variable is not found in GlobalScope, a new variable will + // be created. + VLOG(3) << "SetFeedStringVariable name=" << var_name << " index=" << index; + Variable* g_feed_value = scope->Var(var_name); + auto& feed_inputs = *(g_feed_value->GetMutable()); + if (index >= feed_inputs.size()) { + feed_inputs.resize(index + 1); + } + // shared data with input tensor + feed_inputs[index] = input; } FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name, diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index a52ef517c8b734..4c2f5b9796a223 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/string_array.h" namespace paddle { namespace framework { @@ -28,6 +29,9 @@ class Scope; void SetFeedVariable(Scope* scope, const LoDTensor& input, const std::string& var_name, size_t index); +void SetFeedVariable(Scope* scope, const Strings& input, + const std::string& var_name, size_t index); + FetchType& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 1996327fe82bc0..12c111e58f58a0 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -13,14 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
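With the change above, SetFeedVariable now has two overloads: the existing LoDTensor one and a new one for raw Strings. A minimal usage sketch under stated assumptions: the caller already owns a Scope, the slot name "feed" is chosen only for illustration, and Strings is taken to be the std::vector<std::string>-style alias declared in string_array.h.

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/feed_fetch_method.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/place.h"

void FeedBothKinds(paddle::framework::Scope* scope) {
  // Slot 0: a dense tensor, via the existing overload.
  paddle::framework::LoDTensor tensor;
  paddle::framework::DDim dims{2, 3};
  tensor.Resize(dims);
  tensor.mutable_data<float>(paddle::platform::CPUPlace());
  paddle::framework::SetFeedVariable(scope, tensor, "feed", 0);

  // Slot 1: raw strings, via the overload added in this patch.
  paddle::framework::Strings texts = {"hello", "world"};
  paddle::framework::SetFeedVariable(scope, texts, "feed", 1);
}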
*/ #pragma once + #include + #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/variant.h" namespace paddle { namespace framework { -using FeedType = LoDTensor; +using FeedType = boost::variant; using FeedList = std::vector; using FetchType = boost::variant; @@ -43,6 +46,13 @@ inline bool data_is_lod_tensor_array(const FetchType &data) { return false; } +inline bool data_is_string_tensor(const FeedType &data) { + if (data.type() == typeid(Strings)) { + return true; + } + return false; +} + static const char kFeedOpType[] = "feed"; static const char kFetchOpType[] = "fetch"; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index dc5e24ef5de42f..7aeb9eaf3f1958 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -1334,6 +1334,29 @@ void FleetWrapper::SaveModelOneTablePrefix(const uint64_t table_id, #endif } +void FleetWrapper::SetDate(const uint64_t table_id, const std::string& date) { +#ifdef PADDLE_WITH_PSLIB + assert(date.size() == 8); + int year = std::stoi(date.substr(0, 4)); + int month = std::stoi(date.substr(4, 2)); + int day = std::stoi(date.substr(6, 2)); + struct std::tm b; + b.tm_year = year - 1900; + b.tm_mon = month - 1; + b.tm_mday = day; + b.tm_hour = b.tm_min = b.tm_sec = 0; + std::time_t seconds_from_1970 = std::mktime(&b); + int day_id = seconds_from_1970 / 86400; + auto ret = pslib_ptr_->_worker_ptr->set_day_id(table_id, day_id); + ret.wait(); + if (ret.get() != 0) { + LOG(ERROR) << "setdate : " << date << " failed"; + } +#else + VLOG(0) << "FleetWrapper::SetDate does nothing when no pslib"; +#endif +} + void FleetWrapper::PrintTableStat(const uint64_t table_id) { #ifdef PADDLE_WITH_PSLIB auto ret = pslib_ptr_->_worker_ptr->print_table_stat(table_id); @@ -1347,6 +1370,20 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) { #endif } +void FleetWrapper::SetFileNumOneShard(const uint64_t table_id, int file_num) { +#ifdef PADDLE_WITH_PSLIB + auto ret = + pslib_ptr_->_worker_ptr->set_file_num_one_shard(table_id, file_num); + ret.wait(); + int32_t err_code = ret.get(); + if (err_code == -1) { + LOG(ERROR) << "set_file_num_one_shard failed"; + } +#else + VLOG(0) << "FleetWrapper::SetFileNumOneShard does nothing when no pslib"; +#endif +} + double FleetWrapper::GetCacheThreshold(int table_id) { #ifdef PADDLE_WITH_PSLIB double cache_threshold = 0.0; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.h b/paddle/fluid/framework/fleet/fleet_wrapper.h index c1db06a298c861..6fddedccf02585 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.h +++ b/paddle/fluid/framework/fleet/fleet_wrapper.h @@ -266,6 +266,7 @@ class FleetWrapper { bool load_combine); void PrintTableStat(const uint64_t table_id); + void SetFileNumOneShard(const uint64_t table_id, int file_num); // mode = 0, load all feature // mode = 1, load delta feature, which means load diff void LoadModel(const std::string& path, const int mode); @@ -335,6 +336,8 @@ class FleetWrapper { // this performs better than rand_r, especially large data std::default_random_engine& LocalRandomEngine(); + void SetDate(const uint64_t table_id, const std::string& date); + #ifdef PADDLE_WITH_PSLIB static std::shared_ptr pslib_ptr_; #endif diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.cc b/paddle/fluid/framework/fleet/gloo_wrapper.cc index 489cef9f04654a..14e5f2f51924ba 
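FleetWrapper::SetDate above folds a YYYYMMDD string into a day index (days since 1970) before handing it to PSLib. A standalone sketch of just that arithmetic; the helper name DateStringToDayId is hypothetical, and like the original it relies on std::mktime (local time) followed by integer division by 86400 seconds.

#include <ctime>
#include <string>

int DateStringToDayId(const std::string& date) {  // e.g. "20210923"
  struct std::tm t = {};
  t.tm_year = std::stoi(date.substr(0, 4)) - 1900;
  t.tm_mon = std::stoi(date.substr(4, 2)) - 1;
  t.tm_mday = std::stoi(date.substr(6, 2));
  t.tm_hour = t.tm_min = t.tm_sec = 0;
  std::time_t seconds_from_1970 = std::mktime(&t);
  return static_cast<int>(seconds_from_1970 / 86400);  // 86400 seconds per day
}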
100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.cc +++ b/paddle/fluid/framework/fleet/gloo_wrapper.cc @@ -71,6 +71,18 @@ void HdfsStore::set(const std::string& key, const std::vector& data) { } } paddle::framework::fs_mv(tmp, path); + auto start = std::chrono::steady_clock::now(); + while (paddle::framework::fs_exists(path) == false) { + VLOG(0) << "HdfsStore::set fs_mv retrying..."; + paddle::framework::fs_mv(tmp, path); + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + if (wait_timeout_ != gloo::kNoTimeout && elapsed > wait_timeout_) { + PADDLE_THROW(paddle::platform::errors::ExecutionTimeout( + "fs_mv failed, tmp: %s, path: %s", tmp, path)); + } + std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_)); + } #endif } @@ -140,6 +152,7 @@ void HdfsStore::wait(const std::vector& keys, auto start = std::chrono::steady_clock::now(); std::vector check_key_status(keys.size(), false); while (!Check(keys, &check_key_status)) { + VLOG(0) << "HdfsStore::wait checking repeatedly..."; auto elapsed = std::chrono::duration_cast( std::chrono::steady_clock::now() - start); if (wait_timeout_ != gloo::kNoTimeout && elapsed > wait_timeout_) { @@ -209,6 +222,8 @@ void ParallelConnectContext::connectFullMesh( // Create pairs auto transportContext = dev->createContext(rank, size); transportContext->setTimeout(getTimeout()); + VLOG(0) << "transportContext timeout: " << getTimeout().count() + << ", curr rank: " << rank; for (int i = 0; i < size; i++) { if (i == rank) { continue; @@ -225,6 +240,7 @@ void ParallelConnectContext::connectFullMesh( std::vector> connect_threads(thread_num_); // Connect every pair + VLOG(0) << "connect_thread_num: " << thread_num_ << ", size: " << size; for (uint32_t i = 0; i < connect_threads.size(); ++i) { connect_threads[i].reset(new std::thread( [&store, &transportContext, total_add_size, this]( @@ -252,10 +268,36 @@ void ParallelConnectContext::connectFullMesh( sleep(5); --max_retry_times; } - auto addr = extractAddress(allAddrs, i); + if (addr.empty()) { + VLOG(0) << "peer address is null"; + } + Impl impl_; + memcpy(&impl_, addr.data(), sizeof(impl_)); + struct sockaddr_in* sa = (struct sockaddr_in*)&(impl_.ss); + std::string ip = getCharIpAddr(sa->sin_addr.s_addr); + VLOG(0) << "peer " << i << " ip addr: " << ip + << ", port: " << sa->sin_port; + + auto start = std::chrono::steady_clock::now(); + std::chrono::seconds connect_wait_timeout_ = + std::chrono::seconds(600); + while (true) { + auto elapsed = std::chrono::duration_cast( + std::chrono::steady_clock::now() - start); + if (elapsed > connect_wait_timeout_) { + break; + } + try { + transportContext->getPair(i)->connect(addr); + break; + } catch (...) { + VLOG(0) << "gloo connect failed, retrying..."; + } + } transportContext->getPair(i)->connect(addr); } + VLOG(0) << "peer connected success"; }, i, connect_threads.size())); } @@ -264,6 +306,7 @@ void ParallelConnectContext::connectFullMesh( } device_ = dev; transportContext_ = std::move(transportContext); + VLOG(0) << "ParallelConnectContext::connectFullMesh() is over"; } #endif } // namespace rendezvous diff --git a/paddle/fluid/framework/fleet/gloo_wrapper.h b/paddle/fluid/framework/fleet/gloo_wrapper.h index 4eb40da1bfd39b..42ae73f9b13f1e 100644 --- a/paddle/fluid/framework/fleet/gloo_wrapper.h +++ b/paddle/fluid/framework/fleet/gloo_wrapper.h @@ -27,6 +27,7 @@ limitations under the License. 
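Both additions above (the fs_mv retry in HdfsStore::set and the pair-connect retry in connectFullMesh) follow the same retry-until-timeout shape. A generic sketch of that pattern with illustrative names only, in case it helps when reading the two call sites; the caller decides what to do on failure, as the patch does by throwing ExecutionTimeout or falling through to one final connect.

#include <chrono>
#include <functional>
#include <thread>

// Keep retrying `attempt` until it succeeds or `timeout` elapses.
bool RetryUntilTimeout(const std::function<bool()>& attempt,
                       std::chrono::seconds timeout,
                       std::chrono::milliseconds sleep_between) {
  auto start = std::chrono::steady_clock::now();
  while (!attempt()) {
    auto elapsed = std::chrono::duration_cast<std::chrono::seconds>(
        std::chrono::steady_clock::now() - start);
    if (elapsed > timeout) return false;
    std::this_thread::sleep_for(sleep_between);
  }
  return true;
}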
*/ #include #ifdef PADDLE_WITH_GLOO #include +#include #include #include #include @@ -97,6 +98,26 @@ class ParallelConnectContext : public gloo::rendezvous::Context { // slowly in case big size, especialy in HdfsStore void connectFullMesh(Store& store, // NOLINT std::shared_ptr& dev); // NOLINT + struct Impl { + // IP address of the listening socket. + struct sockaddr_storage ss; + // Sequence number of this address. + // If this is equal to -1, the address is assumed to + // represent the listening socket of a device. The sequence number + // must be set before it can be used by a pair. + ssize_t seq{-1}; + }; + std::string getCharIpAddr(uint32_t ipAddress) { + const int NBYTES = 4; + uint8_t octet[NBYTES]; + char ipAddressFinal[16]; + for (int i = 0; i < NBYTES; i++) { + octet[i] = ipAddress >> (i * 8); + } + snprintf(ipAddressFinal, sizeof(ipAddressFinal), "%d.%d.%d.%d", octet[0], + octet[1], octet[2], octet[3]); + return std::string(ipAddressFinal); + } protected: int thread_num_ = 6; @@ -218,6 +239,39 @@ class GlooWrapper { return ret; } + // NOTE(@xiongkun03): support all gather array of + // numbers with different length + // if the third argument is int, use allgather, + // if it is vector, use AllgathervOptions, + // which works in different length occasion. + template + void AllGatherVector(T* input_ptr, T* output_ptr, + std::vector& element_nums) { // NOLINT + CHECK_EQ(is_initialized_, true); +#ifdef PADDLE_WITH_GLOO + gloo::AllgathervOptions opts(context_); + opts.setInput(input_ptr, element_nums[rank_]); + opts.setOutput(output_ptr, element_nums); + gloo::allgatherv(opts); +#else + LOG(WARNING) << "AllGather does nothing when WITH_GLOO=OFF"; +#endif + } + + template + void AllGatherVector(T* input_ptr, T* output_ptr, + size_t element_num) { // NOLINT + CHECK_EQ(is_initialized_, true); +#ifdef PADDLE_WITH_GLOO + gloo::AllgatherOptions opts(context_); + opts.setInput(input_ptr, element_num); + opts.setOutput(output_ptr, element_num * size_); + gloo::allgather(opts); +#else + LOG(WARNING) << "AllGather does nothing when WITH_GLOO=OFF"; +#endif + } + protected: bool is_initialized_ = false; #ifdef PADDLE_WITH_GLOO diff --git a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h index 9facbff1f25269..9f3d1a7adcafcc 100644 --- a/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h +++ b/paddle/fluid/framework/fleet/heter_ps/hashtable_inl.h @@ -128,7 +128,7 @@ void HashTable::dump_to_cpu(int devid, cudaStream_t stream) { downpour_value->resize(gpu_val.mf_size + downpour_value_size); } float* cpu_val = downpour_value->data(); - cpu_val[0] = 0; + // cpu_val[0] = 0; cpu_val[1] = gpu_val.delta_score; cpu_val[2] = gpu_val.show; cpu_val[3] = gpu_val.clk; diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc index 784cbc3d90b865..4fb98e526d5fc4 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.cc @@ -40,63 +40,99 @@ namespace framework { std::shared_ptr PSGPUWrapper::s_instance_ = NULL; bool PSGPUWrapper::is_initialized_ = false; -void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { +void PSGPUWrapper::PreBuildTask(std::shared_ptr gpu_task) { VLOG(3) << "PSGPUWrapper::BuildGPUPSTask begin"; platform::Timer timeline; timeline.Start(); int device_num = heter_devices_.size(); - MultiSlotDataset* dataset = dynamic_cast(dataset_); gpu_task->init(thread_keys_shard_num_, device_num); - auto input_channel = 
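The new GlooWrapper::AllGatherVector overloads above cover both the equal-length case (gloo::allgather) and the per-rank variable-length case (gloo::allgatherv). A usage sketch for the variable-length form, assuming it takes the per-rank element counts as a std::vector<size_t>; the template arguments are not visible in this rendering of the patch, so that type is an assumption, and the function and variable names here are illustrative.

#include <numeric>
#include <vector>
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"

void GatherRanksData(paddle::framework::GlooWrapper* gloo,
                     std::vector<int64_t>* my_chunk,       // this rank's data
                     std::vector<size_t>* element_nums) {  // lengths per rank
  size_t total = std::accumulate(element_nums->begin(), element_nums->end(),
                                 static_cast<size_t>(0));
  std::vector<int64_t> all_data(total);
  // Variable-length gather: each rank contributes (*element_nums)[rank] values.
  gloo->AllGatherVector(my_chunk->data(), all_data.data(), *element_nums);
}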
dataset->GetInputChannel(); auto& local_keys = gpu_task->feature_keys_; auto& local_ptr = gpu_task->value_ptr_; - auto& device_keys = gpu_task->device_keys_; - auto& device_vals = gpu_task->device_values_; - auto& device_mutex = gpu_task->mutex_; - std::vector threads; -#ifdef PADDLE_WITH_PSLIB - auto fleet_ptr = FleetWrapper::GetInstance(); -#endif -#ifdef PADDLE_WITH_PSCORE - auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); -#endif // data should be in input channel thread_keys_.resize(thread_keys_thread_num_); for (int i = 0; i < thread_keys_thread_num_; i++) { thread_keys_[i].resize(thread_keys_shard_num_); } - const std::deque& vec_data = input_channel->GetData(); - size_t total_len = vec_data.size(); - size_t len_per_thread = total_len / thread_keys_thread_num_; - int remain = total_len % thread_keys_thread_num_; + + size_t total_len = 0; + size_t len_per_thread = 0; + int remain = 0; size_t begin = 0; - auto gen_func = [this](const std::deque& total_data, int begin_index, - int end_index, int i) { - for (auto iter = total_data.begin() + begin_index; - iter != total_data.begin() + end_index; iter++) { - const auto& ins = *iter; - const auto& feasign_v = ins.uint64_feasigns_; - for (const auto feasign : feasign_v) { - uint64_t cur_key = feasign.sign().uint64_feasign_; - int shard_id = cur_key % thread_keys_shard_num_; - this->thread_keys_[i][shard_id].insert(cur_key); + + std::string data_set_name = std::string(typeid(*dataset_).name()); + + if (data_set_name.find("SlotRecordDataset") != std::string::npos) { + VLOG(0) << "ps_gpu_wrapper use SlotRecordDataset"; + SlotRecordDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + VLOG(0) << "yxf::buildtask::inputslotchannle size: " + << input_channel->Size(); + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + VLOG(0) << "total len: " << total_len; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins->slot_uint64_feasigns_.slot_values; + for (const auto feasign : feasign_v) { + int shard_id = feasign % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(feasign); + } } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); } - }; - for (int i = 0; i < thread_keys_thread_num_; i++) { - threads.push_back(std::thread(gen_func, std::ref(vec_data), begin, - begin + len_per_thread + (i < remain ? 1 : 0), - i)); - begin += len_per_thread + (i < remain ? 
1 : 0); - } - for (std::thread& t : threads) { - t.join(); + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; + } else { + CHECK(data_set_name.find("MultiSlotDataset") != std::string::npos); + VLOG(0) << "ps_gpu_wrapper use MultiSlotDataset"; + MultiSlotDataset* dataset = dynamic_cast(dataset_); + auto input_channel = dataset->GetInputChannel(); + + const std::deque& vec_data = input_channel->GetData(); + total_len = vec_data.size(); + len_per_thread = total_len / thread_keys_thread_num_; + remain = total_len % thread_keys_thread_num_; + auto gen_func = [this](const std::deque& total_data, + int begin_index, int end_index, int i) { + for (auto iter = total_data.begin() + begin_index; + iter != total_data.begin() + end_index; iter++) { + const auto& ins = *iter; + const auto& feasign_v = ins.uint64_feasigns_; + for (const auto feasign : feasign_v) { + uint64_t cur_key = feasign.sign().uint64_feasign_; + int shard_id = cur_key % thread_keys_shard_num_; + this->thread_keys_[i][shard_id].insert(cur_key); + } + } + }; + for (int i = 0; i < thread_keys_thread_num_; i++) { + threads.push_back( + std::thread(gen_func, std::ref(vec_data), begin, + begin + len_per_thread + (i < remain ? 1 : 0), i)); + begin += len_per_thread + (i < remain ? 1 : 0); + } + for (std::thread& t : threads) { + t.join(); + } + timeline.Pause(); + VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; } - timeline.Pause(); - VLOG(1) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds."; timeline.Start(); @@ -135,6 +171,38 @@ void PSGPUWrapper::BuildTask(std::shared_ptr gpu_task) { VLOG(3) << "GpuPs shard: " << i << " key len: " << local_keys[i].size(); local_ptr[i].resize(local_keys[i].size()); } +} + +void PSGPUWrapper::BuildPull(std::shared_ptr gpu_task) { + platform::Timer timeline; + int device_num = heter_devices_.size(); + auto& local_keys = gpu_task->feature_keys_; + auto& local_ptr = gpu_task->value_ptr_; + + auto& device_keys = gpu_task->device_keys_; + auto& device_vals = gpu_task->device_values_; + auto& device_mutex = gpu_task->mutex_; + + std::vector threads(thread_keys_shard_num_); +#ifdef PADDLE_WITH_PSLIB + auto fleet_ptr = FleetWrapper::GetInstance(); +#endif +#ifdef PADDLE_WITH_PSCORE + auto fleet_ptr = paddle::distributed::Communicator::GetInstance(); +#endif + +#ifdef PADDLE_WITH_PSLIB + // get day_id: day nums from 1970 + struct std::tm b; + b.tm_year = year_ - 1900; + b.tm_mon = month_ - 1; + b.tm_mday = day_; + b.tm_min = b.tm_hour = b.tm_sec = 0; + std::time_t seconds_from_1970 = std::mktime(&b); + int day_id = seconds_from_1970 / 86400; + fleet_ptr->pslib_ptr_->_worker_ptr->set_day_id(table_id_, day_id); +#endif + timeline.Start(); auto ptl_func = [this, &local_keys, &local_ptr, &fleet_ptr](int i) { size_t key_size = local_keys[i].size(); @@ -423,29 +491,32 @@ void PSGPUWrapper::LoadIntoMemory(bool is_shuffle) { void PSGPUWrapper::start_build_thread() { running_ = true; VLOG(3) << "start build CPU&GPU ps thread."; - build_cpu_threads_ = std::thread([this] { build_cpu_thread(); }); - build_gpu_threads_ = std::thread([this] { build_gpu_thread(); }); + pre_build_threads_ = std::thread([this] { pre_build_thread(); }); + build_threads_ = std::thread([this] { build_thread(); }); } -void PSGPUWrapper::build_cpu_thread() { +void PSGPUWrapper::pre_build_thread() { + // prebuild: process load_data while (running_) { std::shared_ptr gpu_task = nullptr; if 
(!data_ready_channel_->Get(gpu_task)) { continue; } - VLOG(3) << "thread BuildTask start."; + VLOG(3) << "thread PreBuildTask start."; platform::Timer timer; timer.Start(); // build cpu ps data process - BuildTask(gpu_task); + PreBuildTask(gpu_task); timer.Pause(); - VLOG(1) << "thread BuildTask end, cost time: " << timer.ElapsedSec() << "s"; + VLOG(1) << "thread PreBuildTask end, cost time: " << timer.ElapsedSec() + << "s"; buildcpu_ready_channel_->Put(gpu_task); } VLOG(3) << "build cpu thread end"; } -void PSGPUWrapper::build_gpu_thread() { +void PSGPUWrapper::build_thread() { + // build: build_pull + build_gputask while (running_) { std::shared_ptr gpu_task = nullptr; if (!gpu_free_channel_->Get(gpu_task)) { @@ -457,12 +528,14 @@ void PSGPUWrapper::build_gpu_thread() { VLOG(3) << "thread BuildGPUTask start."; platform::Timer timer; timer.Start(); + BuildPull(gpu_task); + timer.Pause(); + timer.Start(); BuildGPUTask(gpu_task); timer.Pause(); VLOG(1) << "thread BuildGPUTask end, cost time: " << timer.ElapsedSec() << "s"; - gpu_task_pool_.Push(gpu_task); train_ready_channel_->Put(gpu_task); } VLOG(3) << "build gpu thread end"; @@ -498,6 +571,8 @@ void PSGPUWrapper::EndPass() { if (keysize_max != 0) { HeterPs_->end_pass(); } + + gpu_task_pool_.Push(current_task_); current_task_ = nullptr; gpu_free_channel_->Put(current_task_); timer.Pause(); diff --git a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h index b7e8bbb3694922..c1f83d2fe9274d 100644 --- a/paddle/fluid/framework/fleet/ps_gpu_wrapper.h +++ b/paddle/fluid/framework/fleet/ps_gpu_wrapper.h @@ -84,13 +84,14 @@ class PSGPUWrapper { const int batch_size); void BuildGPUTask(std::shared_ptr gpu_task); - void BuildTask(std::shared_ptr gpu_task); + void PreBuildTask(std::shared_ptr gpu_task); + void BuildPull(std::shared_ptr gpu_task); void LoadIntoMemory(bool is_shuffle); void BeginPass(); void EndPass(); void start_build_thread(); - void build_cpu_thread(); - void build_gpu_thread(); + void pre_build_thread(); + void build_thread(); void Finalize() { VLOG(3) << "PSGPUWrapper Begin Finalize."; @@ -102,10 +103,10 @@ class PSGPUWrapper { gpu_free_channel_->Close(); train_ready_channel_->Close(); running_ = false; - VLOG(3) << "begin stop build_cpu_threads_"; - build_cpu_threads_.join(); - VLOG(3) << "begin stop build_gpu_threads_"; - build_gpu_threads_.join(); + VLOG(3) << "begin stop pre_build_threads_"; + pre_build_threads_.join(); + VLOG(3) << "begin stop build_threads_"; + build_threads_.join(); s_instance_ = nullptr; VLOG(3) << "PSGPUWrapper Finalize Finished."; } @@ -117,6 +118,15 @@ class PSGPUWrapper { resource_ = std::make_shared(dev_ids); resource_->enable_p2p(); keys_tensor.resize(resource_->total_gpu()); +#ifdef PADDLE_WITH_GLOO + auto gloo = paddle::framework::GlooWrapper::GetInstance(); + if (gloo->Size() > 1) { + multi_node_ = 1; + } +#else + PADDLE_THROW( + platform::errors::Unavailable("heter ps need compile with GLOO")); +#endif if (multi_node_) { int dev_size = dev_ids.size(); // init inner comm @@ -127,7 +137,6 @@ class PSGPUWrapper { // init inter comm #ifdef PADDLE_WITH_GLOO inter_comms_.resize(dev_size); - auto gloo = paddle::framework::GlooWrapper::GetInstance(); if (gloo->Rank() == 0) { for (int i = 0; i < dev_size; ++i) { platform::dynload::ncclGetUniqueId(&inter_ncclids_[i]); @@ -232,6 +241,12 @@ class PSGPUWrapper { mf_max_bound); } } + void SetDate(int year, int month, int day) { + year_ = year; + month_ = month; + day_ = day; + } + void SetDataset(Dataset* dataset) { 
dataset_ = dataset; } // PSGPUWrapper singleton @@ -275,6 +290,9 @@ class PSGPUWrapper { int thread_keys_thread_num_ = 37; int thread_keys_shard_num_ = 37; uint64_t max_fea_num_per_pass_ = 5000000000; + int year_; + int month_; + int day_; std::shared_ptr< paddle::framework::ChannelObject>> @@ -293,8 +311,8 @@ class PSGPUWrapper { train_ready_channel_ = paddle::framework::MakeChannel>(); std::shared_ptr current_task_ = nullptr; - std::thread build_cpu_threads_; - std::thread build_gpu_threads_; + std::thread pre_build_threads_; + std::thread build_threads_; bool running_ = false; protected: diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index eb72d9e1420dce..300d5f6e8fad10 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -147,6 +147,11 @@ message VarType { // in operators like nccl_op RAW = 17; TUPLE = 18; + + STRING = 25; + STRINGS = 26; + VOCAB = 27; + FEED_LIST = 28; } required Type type = 1; @@ -175,6 +180,10 @@ message VarType { message Tuple { repeated Type element_type = 1; } optional Tuple tuple = 7; + + optional TensorDesc string = 8; + optional TensorDesc strings = 9; + optional TensorDesc vocab = 10; } message VarDesc { diff --git a/paddle/fluid/framework/generator.cc b/paddle/fluid/framework/generator.cc index 4b64722a7abf5a..154154fc795179 100644 --- a/paddle/fluid/framework/generator.cc +++ b/paddle/fluid/framework/generator.cc @@ -63,6 +63,43 @@ const std::shared_ptr& DefaultCPUGenerator() { return default_cpu_generator; } +using RNGMap = std::unordered_map>; + +static RNGMap& GetRandomSeedGeneratorMap() { + static auto random_seed_generator_map = RNGMap(); + return random_seed_generator_map; +} + +const std::shared_ptr& SetRandomSeedGenerator( + const std::string& name, uint64_t seed) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter == rng_map.end(), true, + platform::errors::AlreadyExists( + "%s RandomSeedGenerator is already exist", name)); + + auto generator = std::make_shared(seed); + bool emplace_success = rng_map.emplace(name, generator).second; + PADDLE_ENFORCE_EQ( + emplace_success, true, + platform::errors::PermissionDenied( + "SetRandomSeedGenerator cannot emplace %s RandomSeedGenerator", + name)); + return rng_map[name]; +} + +const std::shared_ptr& GetRandomSeedGenerator( + const std::string& name) { + auto& rng_map = GetRandomSeedGeneratorMap(); + auto iter = rng_map.find(name); + PADDLE_ENFORCE_EQ(iter != rng_map.end(), true, + platform::errors::NotFound( + "%s RandomSeedGenerator is not found, please " + "use `set_random_seed_generator` to set rng first", + name)); + return iter->second; +} + std::shared_ptr OpDefaultCPUEngine() { static auto op_default_cpu_engine = std::make_shared(); return op_default_cpu_engine; diff --git a/paddle/fluid/framework/generator.h b/paddle/fluid/framework/generator.h index 862e63c4c6af5a..d0a5b4443e3f49 100644 --- a/paddle/fluid/framework/generator.h +++ b/paddle/fluid/framework/generator.h @@ -126,5 +126,11 @@ std::shared_ptr GetCPURandomEngine(uint64_t); const std::shared_ptr& GetDefaultCUDAGenerator( int64_t device_id = -1); +const std::shared_ptr& SetRandomSeedGenerator( + const std::string& name, uint64_t seed); + +const std::shared_ptr& GetRandomSeedGenerator( + const std::string& name); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 99c691e6cf6f7a..80ae0f04daa4a0 
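The generator changes above introduce a small registry of named seed generators. A minimal usage sketch; the generator name "dropout_seed" and the seed value are illustrative only.

#include "paddle/fluid/framework/generator.h"

void NamedSeedExample() {
  // Register once; per the error message above, registering the same name
  // twice raises AlreadyExists.
  paddle::framework::SetRandomSeedGenerator("dropout_seed", /*seed=*/2021);

  // Later lookups return the same shared Generator instance.
  auto& gen = paddle::framework::GetRandomSeedGenerator("dropout_seed");
  (void)gen;
}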
100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -96,6 +96,7 @@ pass_library(multihead_matmul_fuse_pass inference) pass_library(adaptive_pool2d_convert_global_pass inference) pass_library(unsqueeze2_eltwise_fuse_pass inference) pass_library(layer_norm_fuse_pass inference) +pass_library(add_support_int8_pass inference) pass_library(generate_pass DEPS pass_desc_proto) target_link_libraries(generate_pass pass_desc_proto) if(WITH_GPU OR WITH_ROCM) @@ -122,6 +123,7 @@ if(WITH_MKLDNN) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) pass_library(reshape_transpose_matmul_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_transpose_reshape_fuse_pass inference DIR mkldnn) + pass_library(matmul_v2_transpose_reshape_fuse_pass inference DIR mkldnn) pass_library(batch_norm_act_fuse_pass inference DIR mkldnn) pass_library(multi_gru_fuse_pass inference DIR mkldnn) pass_library(multi_gru_seq_fuse_pass inference DIR mkldnn) @@ -142,6 +144,9 @@ cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper) cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry) cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry) cc_test(graph_to_program_pass_test SRCS graph_to_program_pass_test.cc DEPS graph_to_program_pass) +if (WITH_CINN) + cc_test(cinn_lib_test SRCS cinn_lib_test.cc DEPS cinn) +endif() cc_test(cost_model_test SRCS cost_model_test.cc DEPS cost_model op_registry) cc_test(test_graph_pattern_detector SRCS graph_pattern_detector_tester.cc DEPS graph_pattern_detector) cc_test(test_op_compat_sensible_pass SRCS op_compat_sensible_pass_tester.cc DEPS op_compat_sensible_pass) @@ -188,7 +193,7 @@ endif() cc_test(test_cpu_quantize_pass SRCS mkldnn/cpu_quantize_pass_tester.cc DEPS cpu_quantize_pass naive_executor) cc_test(test_cpu_quantize_squash_pass SRCS mkldnn/cpu_quantize_squash_pass_tester.cc DEPS cpu_quantize_squash_pass naive_executor) cc_test(test_reshape_transpose_matmul_mkldnn_fuse_pass SRCS mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass_tester.cc DEPS reshape_transpose_matmul_mkldnn_fuse_pass) - cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass) + cc_test(test_matmul_transpose_reshape_fuse_pass SRCS mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc DEPS matmul_transpose_reshape_fuse_pass matmul_v2_transpose_reshape_fuse_pass) cc_test(test_cpu_bfloat16_placement_pass SRCS mkldnn/cpu_bfloat16_placement_pass_tester.cc DEPS cpu_bfloat16_placement_pass) cc_test(test_cpu_bfloat16_pass SRCS mkldnn/cpu_bfloat16_pass_tester.cc DEPS cpu_bfloat16_pass) cc_test(test_multi_gru_fuse_pass SRCS mkldnn/multi_gru_fuse_pass_tester.cc DEPS multi_gru_fuse_pass) diff --git a/paddle/fluid/framework/ir/add_support_int8_pass.cc b/paddle/fluid/framework/ir/add_support_int8_pass.cc new file mode 100644 index 00000000000000..d157d2e934acea --- /dev/null +++ b/paddle/fluid/framework/ir/add_support_int8_pass.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/add_support_int8_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(prev_op); \ + GET_IR_NODE(prev_out); \ + GET_IR_NODE(quant_op); \ + GET_IR_NODE(quant_out); + +void AddSupportInt8Pass::ApplyImpl(ir::Graph* graph) const { + const std::string pattern_name = "add_support_int8"; + FusePassBase::Init(pattern_name, graph); + + GraphPatternDetector gpd; + + patterns::AddSupportInt8 pattern(gpd.mutable_pattern(), pattern_name); + pattern(); + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + if (prev_op->Op()->HasAttr("out_threshold") && + quant_op->Op()->HasAttr("out_threshold")) { + quant_op->Op()->SetAttr("support_int8", true); + } + found_count++; + }; + gpd(graph, handler); + AddStatis(found_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(add_support_int8_pass, paddle::framework::ir::AddSupportInt8Pass); diff --git a/paddle/fluid/framework/ir/add_support_int8_pass.h b/paddle/fluid/framework/ir/add_support_int8_pass.h new file mode 100644 index 00000000000000..372250d60169d3 --- /dev/null +++ b/paddle/fluid/framework/ir/add_support_int8_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class Graph; + +class AddSupportInt8Pass : public FusePassBase { + public: + AddSupportInt8Pass() {} + virtual ~AddSupportInt8Pass() {} + + protected: + void ApplyImpl(ir::Graph* graph) const override; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/cinn_lib_test.cc b/paddle/fluid/framework/ir/cinn_lib_test.cc new file mode 100644 index 00000000000000..23cb653fef22ac --- /dev/null +++ b/paddle/fluid/framework/ir/cinn_lib_test.cc @@ -0,0 +1,152 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include +#include +#include +#include + +#ifdef PADDLE_WITH_CUDA +#include +#endif + +#include "cinn/cinn.h" +#include "cinn/common/target.h" +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/hlir/framework/tensor.h" +#include "cinn/hlir/op/use_ops.h" +#include "cinn/hlir/pass/use_pass.h" + +namespace cinn { +namespace frontend { + +Program CreateAddProgram() { + constexpr int M = 32; + constexpr int N = 24; + + NetBuilder builder("net_builder"); + auto a = builder.CreateInput(Float(32), {M, N}); + auto b = builder.CreateInput(Float(32), {M, N}); + auto c = builder.add(a, b); + auto d = builder.add(a, c); + auto program = builder.Build(); + + return program; +} + +void SetRandData(hlir::framework::Tensor tensor, Target target) { + auto* data = tensor->mutable_data(target); + std::random_device seed; + std::default_random_engine engine(seed()); + std::uniform_real_distribution dist(0.f, 1.f); + size_t num_ele = tensor->shape().numel(); + std::vector random_data(num_ele); + for (size_t i = 0; i < num_ele; i++) { + random_data[i] = dist(engine); // All random data + } + +#ifdef PADDLE_WITH_CUDA + cudaMemcpy(data, random_data.data(), num_ele * sizeof(float), + cudaMemcpyHostToDevice); +#else + std::copy(random_data.begin(), random_data.end(), data); +#endif +} + +TEST(net_build, basic) { + auto program = CreateAddProgram(); + // output program + for (size_t i = 0; i < program.size(); i++) { + LOG(INFO) << "instruction: " << program[i]; + } +} + +TEST(net_build, program_execute_multi_elementwise_add) { + auto program = CreateAddProgram(); +#ifdef PADDLE_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + auto graph = std::make_shared(program, target); + std::cout << "graph:\n" << graph->Visualize() << std::endl; + + auto scope = BuildScope(target, graph); + hlir::framework::GraphCompiler gc(target, scope, graph); + auto runtime_program = gc.Build(); + + scope->Var("A"); + scope->Var("B"); + + auto A = scope->GetTensor("A"); + auto B = scope->GetTensor("B"); + SetRandData(A, target); + SetRandData(B, target); + + runtime_program->Execute(); +} + +TEST(net_build, program_execute_fc) { + constexpr int B = 10; // batch size + constexpr int M = 32; + constexpr int K = 18; + constexpr int N = 24; + + NetBuilder builder("net_builder"); + auto a = builder.CreateInput(Float(32), {B, M, K}, "A"); + auto w = builder.CreateInput(Float(32), {N, K}, "W"); // weight + auto b = builder.CreateInput(Float(32), {N}, "B"); // bias + + auto mul_out = builder.mul(a, w, 2, 1); + auto add_out = builder.add(mul_out, b); + auto program = builder.Build(); + +#ifdef PADDLE_WITH_CUDA + Target target = common::DefaultNVGPUTarget(); +#else + Target target = common::DefaultHostTarget(); +#endif + + auto graph = std::make_shared(program, target); + auto scope = BuildScope(target, graph); + hlir::framework::GraphCompiler gc(target, scope, graph); + auto runtime_program 
= gc.Build(); + + scope->Var(std::string(a.id())); + scope->Var(std::string(w.id())); + scope->Var(std::string(b.id())); + scope->Var(std::string(mul_out->id)); + + auto a_ten = scope->GetTensor(std::string(a.id())); + auto w_ten = scope->GetTensor(std::string(w.id())); + auto b_ten = scope->GetTensor(std::string(b.id())); + auto fake_out_ten = scope->GetTensor(std::string(mul_out->id)); + auto add_out_ten = scope->GetTensor(std::string(add_out->id)); + SetRandData(a_ten, target); + SetRandData(w_ten, target); + SetRandData(b_ten, target); + + runtime_program->Execute(); +} + +} // namespace frontend +} // namespace cinn diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc index b9cc337df87929..2fc133edb7a960 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_filter_op_pass.cc @@ -181,7 +181,7 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { "Weight scale should be nonzero, but get zero.")); weight_scale[i] = weight_scale[i] / range; } - } else { + } else if (dequant_type == "fake_quantize_dequantize_abs_max") { // Implement quantize_dequantize_abs_max quantization algorithm float abs_max_weight = 0.; for (int j = 0; j < weight_tensor->numel(); j++) { @@ -192,6 +192,9 @@ void DeleteQuantDequantFilterOpPass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument( "Weight scale should be nonzero, but get zero")); weight_scale.push_back(abs_max_weight / range); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported quantize_dequantize op type: %s", dequant_type)); } nodes2rm.insert(quant_dequant_op_outscale); diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc index 4510aea925e788..bb78cdab677526 100644 --- a/paddle/fluid/framework/ir/fc_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc @@ -51,7 +51,12 @@ FCFusePass::FCFusePass() { .IsTensor() .End() .AddAttr("axis") - .IsNumGE(1) + .IsNumMatch([](int axis) -> bool { + if (axis == -1 || axis >= 1) { + return true; + } + return false; + }) .End(); AddOpCompat(OpCompat("relu")) diff --git a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc index 9a43edf40ef443..52e88c6408b0e8 100644 --- a/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_gru_fuse_pass.cc @@ -335,9 +335,9 @@ void FCGRUFusePass::ApplyImpl(ir::Graph* graph) const { graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - - string::PrettyLogDetail("--- fused %d pairs of fc gru patterns", - fusion_count); + if (!Has("disable_logs") || !Get("disable_logs")) + string::PrettyLogDetail("--- fused %d pairs of fc gru patterns", + fusion_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc index 2e6ce1a0f73818..d72b626fc1ebcf 100644 --- a/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/fc_lstm_fuse_pass.cc @@ -349,9 +349,9 @@ void FCLstmFusePass::ApplyImpl(ir::Graph* graph) const { BuildFusion(graph, name_scope_, param_scope(), true /*with_fc_bias*/); AddStatis(fusion_count); - - string::PrettyLogDetail("--- fused %d pairs of fc lstm patterns", - fusion_count); + if (!Has("disable_logs") || !Get("disable_logs")) + string::PrettyLogDetail("--- fused %d pairs of fc lstm patterns", + 
fusion_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/generate_pass.cc b/paddle/fluid/framework/ir/generate_pass.cc index 9eba6fc89a2e96..b261cbeb08e3bf 100644 --- a/paddle/fluid/framework/ir/generate_pass.cc +++ b/paddle/fluid/framework/ir/generate_pass.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/ir/generate_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" namespace paddle { namespace framework { @@ -20,6 +21,16 @@ namespace ir { void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) { const proto::BlockDesc& block = pass_desc.pattern().blocks(0); + for (const proto::VarDesc& var : block.vars()) { + PDNode* var_pdnode = pattern->NewNode(var.name())->AsInput(); + var_pdnode->assert_is_var(); + var_pdnode->assert_more([&](Node* x) { + if (VarDesc(var).GetShape() == x->Var()->GetShape()) { + return true; + } + return false; + }); + } // Traverse all operators to create subgraph. for (int index = 0; index < block.ops_size(); ++index) { const proto::OpDesc& op = block.ops(index); @@ -30,15 +41,32 @@ void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) { pattern->NewNode(std::to_string(index))->assert_is_op(op.type()); // Create PDNodes for inputs of current operator. for (const proto::OpDesc::Var& var : op.inputs()) { - for (const std::string& argument : var.arguments()) { + for (int n = 0; n < var.arguments_size(); ++n) { + const std::string& argument = var.arguments(n); // The input may be the output of other operator. PDNode* var_pdnode = pattern->RetrieveNode(argument); if (nullptr == var_pdnode) { var_pdnode = pattern->NewNode(argument)->AsInput(); + var_pdnode->assert_is_var(); } else if (var_pdnode->IsOutput()) { var_pdnode->AsIntermediate(); } - var_pdnode->assert_is_op_input(op.type()); + var_pdnode->assert_more([&](Node* x) { + for (auto* out : x->outputs) { + if (out->IsOp() && out->Op()->Type() == op.type()) { + const auto& inputs = out->Op()->Inputs(); + const auto& iter = inputs.find(var.parameter()); + if (inputs.end() != iter) { + if (iter->second.end() != std::find(iter->second.begin(), + iter->second.end(), + x->Name())) { + return true; + } + } + } + } + return false; + }); pattern->AddEdge(var_pdnode, op_pdnode); } } @@ -49,6 +77,24 @@ void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) { PDNode* var_pdnode = pattern->RetrieveNode(argument); if (nullptr == var_pdnode) { var_pdnode = pattern->NewNode(argument)->AsOutput(); + var_pdnode->assert_is_var(); + var_pdnode->assert_more([&](Node* x) { + for (Node* input : x->inputs) { + if (input && input->IsOp() && input->Op() && + input->Op()->Type() == op.type()) { + const auto& outputs = input->Op()->Outputs(); + const auto& iter = outputs.find(var.parameter()); + if (outputs.end() != iter) { + if (iter->second.end() != std::find(iter->second.begin(), + iter->second.end(), + x->Name())) { + return true; + } + } + } + } + return false; + }); } else if (var_pdnode->IsInput()) { var_pdnode->AsIntermediate(); } @@ -72,18 +118,64 @@ void InitGeneratePattern(const proto::PassDesc& pass_desc, PDPattern* pattern) { } } -GraphPatternDetector::handle_t GetGenerateRewrite( +// There are some duplicate patterns. +bool IsDuplicatePattern(const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + for (auto iter : subgraph) { + if (nullptr == graph->RetrieveNode(iter.second->id())) { + VLOG(3) << "Node [" << iter.second->Name() + << "] of subgraph has been removed. 
So skip this optimize."; + return true; + } + } + return false; +} + +GraphPatternDetector::handle_t GetGenerateDelete( const PDPattern& pattern, const proto::PassDesc& pass_desc) { GraphPatternDetector::handle_t handler = [&]( - const GraphPatternDetector::subgraph_t subgraph, Graph* graph) { - // There are some duplicate patterns. - for (auto iter : subgraph) { - if (nullptr == graph->RetrieveNode(iter.second->id())) { - VLOG(3) << "Node [" << iter.second->Name() - << "] of subgraph has been removed. So skip this optimize."; - return; + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (IsDuplicatePattern(subgraph, graph)) { + return; + } + // `var_node_maps` record the mapping of variable to the pattern subgraph. + std::map var_node_maps; + for (const proto::PassDesc::VarMap& var_map : pass_desc.var_maps()) { + Node* node = subgraph.at(pattern.RetrieveNode(var_map.pattern_var())); + const auto& iter = var_node_maps.find(var_map.replace_var()); + if (var_node_maps.end() == iter) { + // first node is input + var_node_maps.insert({var_map.replace_var(), node}); + } else { + // output node + for (Node* s_node : node->outputs) { + iter->second->outputs.push_back(s_node); + std::replace(s_node->inputs.begin(), s_node->inputs.end(), node, + iter->second); + s_node->Op()->RenameInput(node->Name(), iter->second->Name()); + } } } + // Remove nodes that are intermediate. + std::unordered_set remove_nodes; + for (const std::unique_ptr& pdnode : pattern.nodes()) { + remove_nodes.emplace(subgraph.at(pdnode.get())); + } + for (auto iter : var_node_maps) { + remove_nodes.erase(iter.second); + } + GraphSafeRemoveNodes(graph, remove_nodes); + }; + return handler; +} + +GraphPatternDetector::handle_t GetGenerateRewrite( + const PDPattern& pattern, const proto::PassDesc& pass_desc) { + GraphPatternDetector::handle_t handler = [&]( + const GraphPatternDetector::subgraph_t& subgraph, Graph* graph) { + if (IsDuplicatePattern(subgraph, graph)) { + return; + } const proto::BlockDesc& block = pass_desc.replace().blocks(0); // `var_node_maps` record the mapping of variable to the pattern subgraph. std::map var_node_maps; @@ -174,7 +266,11 @@ void GeneratePass::ApplyImpl(Graph* graph) const { for (const proto::PassDesc& pass_desc : multi_pass_desc_.pass_descs()) { GraphPatternDetector detector; InitGeneratePattern(pass_desc, detector.mutable_pattern()); - detector(graph, GetGenerateRewrite(detector.pattern(), pass_desc)); + if (pass_desc.replace().blocks(0).ops_size() == 0) { + detector(graph, GetGenerateDelete(detector.pattern(), pass_desc)); + } else { + detector(graph, GetGenerateRewrite(detector.pattern(), pass_desc)); + } // The rewrited graph needs to be verified. Current Pass should be skipped // if validation failed. Rewrite based on the original graph cannot // implement rollback operation. 
@@ -224,6 +320,115 @@ bool GeneratePass::VerifyGraph(const Graph& graph) { return true; } +namespace generate_pass { + +VarHelper::VarHelper(const char* name) : name_(name), type_(Type::kInput) {} +VarHelper::VarHelper(const std::string& name, Type type) + : name_(name), type_(type) {} + +OpHelper::OpHelper(const char* type, SubgraphHelper* subgraph_helper) + : type_(type), subgraph_helper_(subgraph_helper) { + op_desc_ = subgraph_helper_->ProgramDesc()->mutable_blocks(0)->add_ops(); + op_desc_->set_type(type_); +} + +OpHelper::Arguments::Arguments(const char* parameter, + const VarHelper& var_helper) + : parameter_(parameter) { + var_helpers_.push_back(var_helper); +} + +OpHelper::Arguments::Arguments(const char* parameter, + std::initializer_list var_helpers) + : parameter_(parameter), var_helpers_(var_helpers) {} + +OpHelper& OpHelper::operator()(const Arguments& input) { + proto::OpDesc::Var* var = op_desc_->add_inputs(); + var->set_parameter(input.parameter_); + for (const VarHelper& var_helper : input.var_helpers_) { + var->add_arguments()->assign(var_helper.name_); + if (VarHelper::Type::kInput == var_helper.type_) { + subgraph_helper_->AddInputVar(var_helper.name_); + } + } + return *this; +} + +OpHelper& OpHelper::operator()(std::initializer_list inputs) { + for (const auto& input : inputs) { + operator()(input); + } + return *this; +} + +VarHelper OpHelper::Out(const char* name) { + std::string argument = patterns::UniqueKey(type_); + proto::OpDesc::Var* var = op_desc_->add_outputs(); + var->set_parameter(name); + var->add_arguments()->assign(argument); + return VarHelper(argument, VarHelper::Type::kOutput); +} + +proto::ProgramDesc* SubgraphHelper::ProgramDesc() { return &program_desc_; } + +const proto::ProgramDesc& SubgraphHelper::ProgramDesc() const { + return program_desc_; +} + +const std::vector& SubgraphHelper::InputVars() const { + return input_vars_; +} + +const std::vector& SubgraphHelper::OutputVars() const { + return output_vars_; +} + +void SubgraphHelper::AddInputVar(const std::string& name) { + auto iter = std::find(input_vars_.begin(), input_vars_.end(), name); + if (input_vars_.end() == iter) { + input_vars_.push_back(name); + } +} + +void SubgraphHelper::AddOutputVars(const VarHelper& var_helper) { + output_vars_.push_back(var_helper.name_); +} + +} // namespace generate_pass + +PassPairs::PassPairs(const SubgraphType& pattern, const SubgraphType& replace) { + AddPassDesc(pattern, replace); +} + +void PassPairs::AddPassDesc(const SubgraphType& pattern, + const SubgraphType& replace) { + proto::PassDesc* pass_desc = multi_pass_desc_.add_pass_descs(); + pass_desc->mutable_pattern()->CopyFrom(pattern.ProgramDesc()); + pass_desc->mutable_replace()->CopyFrom(replace.ProgramDesc()); + PADDLE_ENFORCE_EQ(pattern.InputVars().size(), replace.InputVars().size(), + platform::errors::InvalidArgument( + "Size of lambda expression arguments is not equal " + "between pattern/replace subgraph.")); + for (size_t i = 0; i < pattern.InputVars().size(); i++) { + proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); + var_map->set_pattern_var(pattern.InputVars()[i]); + var_map->set_replace_var(replace.InputVars()[i]); + } + PADDLE_ENFORCE_EQ(pattern.OutputVars().size(), replace.OutputVars().size(), + platform::errors::InvalidArgument( + "Size of lambda expression returns is not equal " + "between pattern/replace subgraph.")); + for (size_t i = 0; i < pattern.OutputVars().size(); i++) { + proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); + 
var_map->set_pattern_var(pattern.OutputVars()[i]); + var_map->set_replace_var(replace.OutputVars()[i]); + } +} + +const proto::MultiPassDesc& PassPairs::MultiPassDesc() const { + return multi_pass_desc_; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/generate_pass.h b/paddle/fluid/framework/ir/generate_pass.h index f73173233aed32..26e5231fbc16e7 100644 --- a/paddle/fluid/framework/ir/generate_pass.h +++ b/paddle/fluid/framework/ir/generate_pass.h @@ -13,7 +13,6 @@ // limitations under the License. #pragma once -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/pass_desc.pb.h" @@ -43,6 +42,158 @@ class GeneratePass : public Pass { proto::MultiPassDesc multi_pass_desc_; }; +namespace generate_pass { + +class VarHelper; +class OpHelper; +class SubgraphHelper; + +// VarHelper is used to represent a variable node. +struct VarHelper { + enum class Type { kInput, kOutput }; + + explicit VarHelper(const char* name); + VarHelper(const std::string& name, Type type); + + std::string name_; + Type type_; +}; + +// OpHelper is used to represent a operator node. +class OpHelper { + public: + // Convert multiple inputs. + struct Arguments { + Arguments(const char* parameter, const VarHelper& var_helper); + Arguments(const char* parameter, + std::initializer_list var_helpers); + + std::string parameter_; + std::vector var_helpers_; + }; + + OpHelper(const char* type, SubgraphHelper* subgraph_helper); + + OpHelper& operator()(const Arguments& input); + OpHelper& operator()(std::initializer_list inputs); + + VarHelper Out(const char* name); + + private: + OpHelper() = delete; + DISABLE_COPY_AND_ASSIGN(OpHelper); + + const char* type_; + proto::OpDesc* op_desc_; + SubgraphHelper* subgraph_helper_; +}; + +/* + * SubgraphHelper is used to define pattern/replace subgraphs. + * + * Use lambda expression to define subgraph like Python. SubgraphHelper + * converts lambda expression to ProgramDesc. + * + * In order to define a subgraph, user need to use VarHelper and OpHelper. + * Use the macros instead of class names, so user can develop better and + * don't need to know too much about underlying implementation. + * + * An example of defining a subgraph as follows: + * + * SUBGRAPH_(subgraph)([subgraph=&subgraph](VAR_(x), VAR_(y), VAR_(z)) { + * auto ewadd1 = OP_(elementwise_add)({{"X", x}, {"Y", y}}).Out("Out"); + * auto ewadd2 = OP_(elementwise_add)({{"X", ewadd1}, {"Y", z}}).Out("Out"); + * return ewadd2; + * }); + * + */ +class SubgraphHelper { + public: + SubgraphHelper() = default; + // The lambda expression is a prvalue expression. 
+ template + SubgraphHelper& operator=(const T&& f) { + proto::BlockDesc* block = program_desc_.add_blocks(); + block->set_idx(0); + block->set_parent_idx(0); + AddOutputVars(f()); + return *this; + } + + proto::ProgramDesc* ProgramDesc(); + const proto::ProgramDesc& ProgramDesc() const; + const std::vector& InputVars() const; + const std::vector& OutputVars() const; + + void AddInputVar(const std::string& name); + + void AddOutputVars(const VarHelper& var_helper); + + template * = nullptr> + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars(std::get(outputs)); + AddOutputVars(outputs); + } + + template * = nullptr> + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars(std::get(outputs)); + } + + template + void AddOutputVars(const std::tuple& outputs) { + AddOutputVars<0>(outputs); + } + + private: + DISABLE_COPY_AND_ASSIGN(SubgraphHelper); + std::vector input_vars_; + std::vector output_vars_; + proto::ProgramDesc program_desc_; +}; + +} // namespace generate_pass + +class PassPairs { + public: + using SubgraphType = generate_pass::SubgraphHelper; + + PassPairs() = default; + PassPairs(const SubgraphType& pattern, const SubgraphType& replace); + + void AddPassDesc(const SubgraphType& pattern, const SubgraphType& replace); + + const proto::MultiPassDesc& MultiPassDesc() const; + + private: + proto::MultiPassDesc multi_pass_desc_; +}; + +// Use function to register in CC. +template +class MacroPassHelper : public GeneratePass { + public: + MacroPassHelper() : GeneratePass(Functor().MultiPassDesc()) {} +}; + +#define VAR_(name) \ + ::paddle::framework::ir::generate_pass::VarHelper name = \ + ::paddle::framework::ir::generate_pass::VarHelper(#name) +#define OP_(type) \ + ::paddle::framework::ir::generate_pass::OpHelper(#type, subgraph) +#define SUBGRAPH_(name) \ + ::paddle::framework::ir::generate_pass::SubgraphHelper name; \ + name + +#define REGISTER_GENERATE_PASS(pass_type) \ + paddle::framework::ir::PassPairs register_##pass_type(); \ + REGISTER_PASS( \ + pass_type, \ + ::paddle::framework::ir::MacroPassHelper<®ister_##pass_type>); \ + paddle::framework::ir::PassPairs register_##pass_type() + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/generate_pass_tester.cc b/paddle/fluid/framework/ir/generate_pass_tester.cc index c3852d29c308ff..6876dde50c157c 100644 --- a/paddle/fluid/framework/ir/generate_pass_tester.cc +++ b/paddle/fluid/framework/ir/generate_pass_tester.cc @@ -16,234 +16,71 @@ #include "gtest/gtest.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" -namespace paddle { -namespace framework { -namespace ir { - -template -class CXXGeneratePass : public GeneratePass { - public: - CXXGeneratePass() : GeneratePass(Functor()) {} -}; - -#define REGISTER_GENERATE_PASS(pass_type, function) \ - REGISTER_PASS(pass_type, ::paddle::framework::ir::CXXGeneratePass<&function>) - -proto::MultiPassDesc generate_fc_fuse() { - proto::MultiPassDesc multi_pass_desc; +REGISTER_GENERATE_PASS(generate_fc_fuse) { + paddle::framework::ir::PassPairs pass_pairs; for (bool with_relu : {true, false}) { - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - pattern->set_idx(0); - pattern->set_parent_idx(0); - proto::OpDesc* mul = pattern->add_ops(); - mul->set_type("mul"); - proto::OpDesc::Var* mul_x = mul->add_inputs(); - mul_x->set_parameter("X"); - mul_x->add_arguments()->assign("x"); - proto::OpDesc::Var* mul_y = 
mul->add_inputs(); - mul_y->set_parameter("Y"); - mul_y->add_arguments()->assign("w"); - proto::OpDesc::Var* mul_out = mul->add_outputs(); - mul_out->set_parameter("Out"); - mul_out->add_arguments()->assign("mul_out"); - proto::OpDesc* ewadd = pattern->add_ops(); - ewadd->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_x = ewadd->add_inputs(); - ewadd_x->set_parameter("X"); - ewadd_x->add_arguments()->assign("mul_out"); - proto::OpDesc::Var* ewadd_y = ewadd->add_inputs(); - ewadd_y->set_parameter("Y"); - ewadd_y->add_arguments()->assign("b"); - proto::OpDesc::Var* ewadd_out = ewadd->add_outputs(); - ewadd_out->set_parameter("Out"); - ewadd_out->add_arguments()->assign("ewadd_out"); - proto::OpDesc* relu = nullptr; - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - replace->set_idx(0); - replace->set_parent_idx(0); - proto::OpDesc* fc = replace->add_ops(); - fc->set_type("fc"); - proto::OpDesc::Var* fc_x = fc->add_inputs(); - fc_x->set_parameter("Input"); - fc_x->add_arguments()->assign("x"); - proto::OpDesc::Var* fc_w = fc->add_inputs(); - fc_w->set_parameter("W"); - fc_w->add_arguments()->assign("w"); - proto::OpDesc::Var* fc_b = fc->add_inputs(); - fc_b->set_parameter("Bias"); - fc_b->add_arguments()->assign("b"); - proto::OpDesc::Var* fc_out = fc->add_outputs(); - fc_out->set_parameter("Out"); - fc_out->add_arguments()->assign("fc_out"); - for (const char* var : {"x", "w", "b", "fc_out"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - proto::PassDesc::AttrMap* attr_map = pass_desc->add_attr_maps(); - attr_map->set_pattern_op_idx(0); - attr_map->set_pattern_name("x_num_col_dims"); - attr_map->set_replace_op_idx(0); - attr_map->set_replace_name("in_num_col_dims"); - if (with_relu) { - relu = pattern->add_ops(); - relu->set_type("relu"); - proto::OpDesc::Var* relu_x = relu->add_inputs(); - relu_x->set_parameter("X"); - relu_x->add_arguments()->assign("ewadd_out"); - proto::OpDesc::Var* relu_out = relu->add_outputs(); - relu_out->set_parameter("Out"); - relu_out->add_arguments()->assign("relu_out"); - pass_desc->mutable_var_maps(3)->set_pattern_var("relu_out"); - proto::OpDesc::Attr* attr = fc->add_attrs(); - attr->set_name("activation_type"); - attr->set_type(proto::AttrType::STRING); - attr->set_s("relu"); - } else { - pass_desc->mutable_var_maps(3)->set_pattern_var("ewadd_out"); - } + // pattern + SUBGRAPH_(pattern) = + [ subgraph = &pattern, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + VLOG(3) << "exec lambda func."; + auto mul = OP_(mul)({{"X", x}, {"Y", y}}).Out("Out"); + auto ewadd = OP_(elementwise_add)({{"X", mul}, {"Y", z}}).Out("Out"); + if (with_relu) { + return OP_(relu)({"X", ewadd}).Out("Out"); + } else { + return ewadd; + } + }; + // replace + SUBGRAPH_(replace) = + [ subgraph = &replace, with_relu ](VAR_(x), VAR_(y), VAR_(z)) { + auto& fc = OP_(fc)({{"Input", x}, {"W", y}, {"Bias", z}}); + return fc.Out("Out"); + }; + pass_pairs.AddPassDesc(pattern, replace); } - return multi_pass_desc; + return pass_pairs; } -proto::MultiPassDesc generate_multi_add_to_addn() { - proto::MultiPassDesc multi_pass_desc; - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - proto::OpDesc* ewadd_0 = pattern->add_ops(); - ewadd_0->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_0_x = ewadd_0->add_inputs(); - ewadd_0_x->set_parameter("X"); - ewadd_0_x->add_arguments()->assign("a"); 
- proto::OpDesc::Var* ewadd_0_y = ewadd_0->add_inputs(); - ewadd_0_y->set_parameter("Y"); - ewadd_0_y->add_arguments()->assign("b"); - proto::OpDesc::Var* ewadd_0_out = ewadd_0->add_outputs(); - ewadd_0_out->set_parameter("Out"); - ewadd_0_out->add_arguments()->assign("ewadd_out_0"); - proto::OpDesc* ewadd_1 = pattern->add_ops(); - ewadd_1->set_type("elementwise_add"); - proto::OpDesc::Var* ewadd_1_x = ewadd_1->add_inputs(); - ewadd_1_x->set_parameter("X"); - ewadd_1_x->add_arguments()->assign("ewadd_out_0"); - proto::OpDesc::Var* ewadd_1_y = ewadd_1->add_inputs(); - ewadd_1_y->set_parameter("Y"); - ewadd_1_y->add_arguments()->assign("c"); - proto::OpDesc::Var* ewadd_1_out = ewadd_1->add_outputs(); - ewadd_1_out->set_parameter("Out"); - ewadd_1_out->add_arguments()->assign("ewadd_out_1"); - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - proto::OpDesc* addn = replace->add_ops(); - addn->set_type("add_n"); - proto::OpDesc::Var* addn_x = addn->add_inputs(); - addn_x->set_parameter("X"); - addn_x->add_arguments()->assign("a"); - addn_x->add_arguments()->assign("b"); - addn_x->add_arguments()->assign("c"); - proto::OpDesc::Var* addn_out = addn->add_outputs(); - addn_out->set_parameter("Out"); - addn_out->add_arguments()->assign("addn_out"); - for (const char* var : {"a", "b", "c", "ewadd_out_1"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - pass_desc->mutable_var_maps(3)->set_replace_var("addn_out"); - return multi_pass_desc; +REGISTER_GENERATE_PASS(generate_multi_add_to_addn) { + // pattern + SUBGRAPH_(pattern) = [subgraph = &pattern](VAR_(x), VAR_(y), VAR_(z)) { + auto ewadd1 = OP_(elementwise_add)({{"X", x}, {"Y", y}}).Out("Out"); + auto ewadd2 = OP_(elementwise_add)({{"X", ewadd1}, {"Y", z}}).Out("Out"); + return ewadd2; + }; + // replace + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { + return OP_(sum)({"X", {x, y, z}}).Out("Out"); + }; + return {pattern, replace}; } -proto::MultiPassDesc generate_combine_matmul() { - proto::MultiPassDesc multi_pass_desc; - proto::PassDesc* pass_desc = multi_pass_desc.add_pass_descs(); - proto::BlockDesc* pattern = pass_desc->mutable_pattern()->add_blocks(); - proto::OpDesc* matmul_0 = pattern->add_ops(); - matmul_0->set_type("matmul"); - proto::OpDesc::Var* matmul_0_x = matmul_0->add_inputs(); - matmul_0_x->set_parameter("X"); - matmul_0_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_0_y = matmul_0->add_inputs(); - matmul_0_y->set_parameter("Y"); - matmul_0_y->add_arguments()->assign("b"); - proto::OpDesc::Var* matmul_0_out = matmul_0->add_outputs(); - matmul_0_out->set_parameter("Out"); - matmul_0_out->add_arguments()->assign("matmul_out_0"); - proto::OpDesc* matmul_1 = pattern->add_ops(); - matmul_1->set_type("matmul"); - proto::OpDesc::Var* matmul_1_x = matmul_1->add_inputs(); - matmul_1_x->set_parameter("X"); - matmul_1_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_1_y = matmul_1->add_inputs(); - matmul_1_y->set_parameter("Y"); - matmul_1_y->add_arguments()->assign("c"); - proto::OpDesc::Var* matmul_1_out = matmul_1->add_outputs(); - matmul_1_out->set_parameter("Out"); - matmul_1_out->add_arguments()->assign("matmul_out_1"); - proto::BlockDesc* replace = pass_desc->mutable_replace()->add_blocks(); - proto::OpDesc* concat = replace->add_ops(); - concat->set_type("concat"); - proto::OpDesc::Var* concat_x = concat->add_inputs(); - concat_x->set_parameter("X"); - 
concat_x->add_arguments()->assign("b"); - concat_x->add_arguments()->assign("c"); - proto::OpDesc::Var* concat_out = concat->add_outputs(); - concat_out->set_parameter("Out"); - concat_out->add_arguments()->assign("concat_out"); - proto::OpDesc* matmul = replace->add_ops(); - matmul->set_type("matmul"); - proto::OpDesc::Var* matmul_x = matmul->add_inputs(); - matmul_x->set_parameter("X"); - matmul_x->add_arguments()->assign("a"); - proto::OpDesc::Var* matmul_y = matmul->add_inputs(); - matmul_y->set_parameter("Y"); - matmul_y->add_arguments()->assign("concat_out"); - proto::OpDesc::Var* matmul_out = matmul->add_outputs(); - matmul_out->set_parameter("Out"); - matmul_out->add_arguments()->assign("matmul_out"); - proto::OpDesc* slice_0 = replace->add_ops(); - slice_0->set_type("slice"); - proto::OpDesc::Var* slice_0_x = slice_0->add_inputs(); - slice_0_x->set_parameter("X"); - slice_0_x->add_arguments()->assign("matmul_out"); - proto::OpDesc::Var* slice_0_out = slice_0->add_outputs(); - slice_0_out->set_parameter("Out"); - slice_0_out->add_arguments()->assign("slice_out_0"); - proto::OpDesc* slice_1 = replace->add_ops(); - slice_1->set_type("slice"); - proto::OpDesc::Var* slice_1_x = slice_1->add_inputs(); - slice_1_x->set_parameter("X"); - slice_1_x->add_arguments()->assign("matmul_out"); - proto::OpDesc::Var* slice_1_out = slice_1->add_outputs(); - slice_1_out->set_parameter("Out"); - slice_1_out->add_arguments()->assign("slice_out_1"); - for (const char* var : {"a", "b", "c", "matmul_out_0", "matmul_out_1"}) { - proto::PassDesc::VarMap* var_map = pass_desc->add_var_maps(); - var_map->set_pattern_var(var); - var_map->set_replace_var(var); - } - pass_desc->mutable_var_maps(3)->set_replace_var("slice_out_0"); - pass_desc->mutable_var_maps(4)->set_replace_var("slice_out_1"); - return multi_pass_desc; +REGISTER_GENERATE_PASS(generate_combine_matmul) { + // pattern + SUBGRAPH_(pattern) = [subgraph = &pattern](VAR_(x), VAR_(y), VAR_(z)) { + auto matmul1 = OP_(matmul)({{"X", x}, {"Y", y}}).Out("Out"); + auto matmul2 = OP_(matmul)({{"X", x}, {"Y", z}}).Out("Out"); + return std::make_tuple(matmul1, matmul2); + }; + // replace + SUBGRAPH_(replace) = [subgraph = &replace](VAR_(x), VAR_(y), VAR_(z)) { + auto concat = OP_(concat)({"X", {y, z}}).Out("Out"); + auto matmul = OP_(matmul)({{"X", x}, {"Y", concat}}).Out("Out"); + auto slice1 = OP_(slice)({"X", matmul}).Out("Out"); + auto slice2 = OP_(slice)({"X", matmul}).Out("Out"); + return std::make_tuple(slice1, slice2); + }; + return {pattern, replace}; } -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_GENERATE_PASS(generate_fc_fuse, - paddle::framework::ir::generate_fc_fuse); -REGISTER_GENERATE_PASS(generate_multi_add_to_addn, - paddle::framework::ir::generate_multi_add_to_addn); -REGISTER_GENERATE_PASS(generate_combine_matmul, - paddle::framework::ir::generate_combine_matmul); - namespace paddle { namespace framework { namespace ir { TEST(GeneratePass, construct_with_string) { std::string binary_str; - generate_fc_fuse().SerializeToString(&binary_str); + register_generate_fc_fuse().MultiPassDesc().SerializeToString(&binary_str); GeneratePass generate_pass(binary_str); } @@ -318,7 +155,7 @@ TEST(GeneratePass, generate_multi_add_to_addn) { graph.reset(pass->Apply(graph.release())); int num_nodes_after = graph->Nodes().size(); - int num_addn_nodes_after = GetNumOpNodes(graph, "add_n"); + int num_addn_nodes_after = GetNumOpNodes(graph, "sum"); VLOG(3) << DebugString(graph); PADDLE_ENFORCE_EQ(num_nodes_before, 
num_nodes_after + 2, diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 4150d0ca555c9d..6830a1f85e02a9 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1606,6 +1606,7 @@ PDNode *patterns::Matmul::operator()() { ->assert_is_op_input("matmul", "X"); auto matmul_in_y = pattern->NewNode(matmul_in_y_repr()) ->AsInput() + ->assert_is_persistable_var() ->assert_is_op_input("matmul", "Y"); auto matmul_out = pattern->NewNode(matmul_out_repr()) ->AsOutput() @@ -1615,6 +1616,47 @@ PDNode *patterns::Matmul::operator()() { return matmul_out; } +// MatmulV2: tensor * weight +PDNode *patterns::MatmulV2Weight::operator()() { + auto matmul_v2_op = + pattern->NewNode(matmul_v2_op_repr())->assert_is_op("matmul_v2"); + + auto matmul_v2_in_x = pattern->NewNode(matmul_v2_in_x_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + auto matmul_v2_in_y = pattern->NewNode(matmul_v2_in_y_repr()) + ->AsInput() + ->assert_is_persistable_var() // Y is weight + ->assert_is_op_input("matmul_v2", "Y"); + auto matmul_v2_out = pattern->NewNode(matmul_v2_out_repr()) + ->AsOutput() + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_v2_op->LinksFrom({matmul_v2_in_x, matmul_v2_in_y}) + .LinksTo({matmul_v2_out}); + return matmul_v2_out; +} + +// MatmulV2: tensor * tensor or tensor * weight +PDNode *patterns::MatmulV2::operator()() { + auto matmul_v2_op = + pattern->NewNode(matmul_v2_op_repr())->assert_is_op("matmul_v2"); + + auto matmul_v2_in_x = pattern->NewNode(matmul_v2_in_x_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "X"); + auto matmul_v2_in_y = pattern->NewNode(matmul_v2_in_y_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto matmul_v2_out = pattern->NewNode(matmul_v2_out_repr()) + ->AsOutput() + ->assert_is_op_output("matmul_v2", "Out"); + + matmul_v2_op->LinksFrom({matmul_v2_in_x, matmul_v2_in_y}) + .LinksTo({matmul_v2_out}); + return matmul_v2_out; +} + PDNode *patterns::Squeeze2Matmul::operator()() { auto squeeze2_in_x = pattern->NewNode(squeeze2_in_x_repr()) ->assert_is_op_input("squeeze2", "X") @@ -2263,15 +2305,34 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set( - {"concat", "conv2d", "conv2d_transpose", - "elementwise_add", "elementwise_mul", "fc", - "fusion_gru", "fusion_lstm", "gelu", - "layer_norm", "matmul", "matmul_v2", - "pool2d", "prelu", "relu", - "reshape2", "softmax", "split", - "squeeze", "squeeze2", "sum", - "transpose2"}); + std::unordered_set({"cast", + "clip", + "concat", + "conv2d", + "conv2d_transpose", + "elementwise_add", + "elementwise_mul", + "expand_v2", + "fc", + "fusion_gru", + "fusion_lstm", + "gelu", + "layer_norm", + "matmul", + "matmul_v2", + "pool2d", + "prelu", + "relu", + "reshape2", + "scale", + "sigmoid", + "slice", + "softmax", + "split", + "squeeze", + "squeeze2", + "sum", + "transpose2"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } @@ -2659,16 +2720,18 @@ PDNode *patterns::ReshapeTransposeMatmulPattern::operator()( return matmul_out; } -PDNode *patterns::MatmulTransposeReshapePattern::operator()() { +// shared function for matmul and matmul_v2 +PDNode *patterns::MatmulTransposeReshapePattern::operator()( + const std::string &op_name) { auto reshape_op = 
pattern->NewNode(reshape_op_repr())->assert_is_op("reshape2"); auto transpose_op = pattern->NewNode(transpose_op_repr())->assert_is_op("transpose2"); - auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op("matmul"); + auto matmul_op = pattern->NewNode(matmul_op_repr())->assert_is_op(op_name); auto matmul_out = pattern->NewNode(matmul_out_repr()) ->AsInput() - ->assert_is_op_output("matmul", "Out") + ->assert_is_op_output(op_name, "Out") ->assert_is_op_input("transpose2", "X"); auto transpose_out = pattern->NewNode(transpose_out_repr()) @@ -2967,6 +3030,29 @@ PDNode *patterns::LayerNorm::operator()() { return shift_out; } +// Add support int8 flag +PDNode *patterns::AddSupportInt8::operator()() { + auto prev_op = + pattern->NewNode(prev_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + return node->Op()->HasAttr("out_threshold") ? true : false; + }); + auto prev_out = pattern->NewNode(prev_out_repr())->assert_is_var(); + auto quant_op = + pattern->NewNode(quant_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + return node->Op()->HasAttr("out_threshold") ? true : false; + }); + auto quant_out = + pattern->NewNode(quant_out_repr())->assert_is_var()->AsOutput(); + prev_op->LinksTo({prev_out}); + prev_out->LinksTo({quant_op}); + quant_op->LinksTo({quant_out}); + return quant_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 40c3e4f59bf262..6657ab5a6a5764 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -976,6 +976,30 @@ struct Matmul : public PatternBase { PATTERN_DECL_NODE(matmul_out); }; +// MatmulV2: tensor * weight +struct MatmulV2Weight : public PatternBase { + MatmulV2Weight(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "matmul_v2_weight") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(matmul_v2_in_x); + PATTERN_DECL_NODE(matmul_v2_in_y); + PATTERN_DECL_NODE(matmul_v2_op); + PATTERN_DECL_NODE(matmul_v2_out); +}; + +// MatmulV2: tensor * tensor or tensor * weight +struct MatmulV2 : public PatternBase { + MatmulV2(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "matmul_v2") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(matmul_v2_in_x); + PATTERN_DECL_NODE(matmul_v2_in_y); + PATTERN_DECL_NODE(matmul_v2_op); + PATTERN_DECL_NODE(matmul_v2_out); +}; + // Squeeze2 + Matmul // Forward pass. struct Squeeze2Matmul : public PatternBase { @@ -1533,7 +1557,7 @@ struct MatmulTransposeReshapePattern : public PatternBase { const std::string& name_scope) : PatternBase(pattern, name_scope, "matmul_transpose_reshape") {} - PDNode* operator()(); + PDNode* operator()(const std::string& op_name); PATTERN_DECL_NODE(matmul_op); PATTERN_DECL_NODE(matmul_out); @@ -1682,6 +1706,18 @@ struct LayerNorm : public PatternBase { PATTERN_DECL_NODE(shift_out); }; +// Add support int8 flag +struct AddSupportInt8 : public PatternBase { + AddSupportInt8(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "Add_support_int8") {} + + PDNode* operator()(); + PATTERN_DECL_NODE(prev_op); + PATTERN_DECL_NODE(prev_out); + PATTERN_DECL_NODE(quant_op); + PATTERN_DECL_NODE(quant_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. 
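A minimal usage sketch for the new MatmulV2Weight pattern, condensed from the way map_matmul_to_mul_pass.cc (further below) drives it; the name_scope string and the handler body are illustrative only, and the fragment assumes the usual includes of a fuse-pass source file:

GraphPatternDetector gpd;
patterns::MatmulV2Weight matmul_v2_weight_pattern(gpd.mutable_pattern(),
                                                  "example_scope");
matmul_v2_weight_pattern();  // builds the matmul_v2_in_x/in_y/op/out PDNodes

auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
  GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_x, matmul_v2_in_x,
                            matmul_v2_weight_pattern);
  GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_y, matmul_v2_in_y,
                            matmul_v2_weight_pattern);
  GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_op, matmul_v2_op,
                            matmul_v2_weight_pattern);
  // matmul_v2_in_y is guaranteed persistable here, i.e. a weight.
  VLOG(3) << "matched matmul_v2 with weight Y: " << matmul_v2_op->Name();
};
gpd(graph, handler);  // `graph` is the ir::Graph* being optimized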
diff --git a/paddle/fluid/framework/ir/graph_viz_pass.cc b/paddle/fluid/framework/ir/graph_viz_pass.cc index f2c711fb6f0047..735b433b6cfe1b 100644 --- a/paddle/fluid/framework/ir/graph_viz_pass.cc +++ b/paddle/fluid/framework/ir/graph_viz_pass.cc @@ -62,10 +62,14 @@ void GraphVizPass::ApplyImpl(ir::Graph* graph) const { } } } + const std::string& optim_cache_dir = Get("optim_cache_dir"); std::string program_bytes = program_desc.Proto()->SerializeAsString(); // rename from "17_ir_fc_fuse_pass.dot" to "fc_fuse_pass.pdmodel" program_path = graph_viz_path.substr(found1 + 4, found2 - found1 - 4) + ".pdmodel"; + if (!optim_cache_dir.empty()) { + program_path = optim_cache_dir + "/" + program_path; + } std::ofstream file(program_path.c_str(), std::ios::binary); file.write(program_bytes.c_str(), program_bytes.size()); file.close(); diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 25bf03f426a1d9..a97873e82f4554 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -35,7 +35,7 @@ void IsTestPass::ApplyImpl(ir::Graph* graph) const { "hard_shrink", "hard_sigmoid", "relu6", "soft_relu", "swish", "thresholded_relu", "log", "square", "softplus", - "softsign", "silu"}; + "softsign", "silu", "mish"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc index 95d55834f823bf..86191587e18495 100644 --- a/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layer_norm_fuse_pass.cc @@ -351,8 +351,9 @@ void LayerNormFusePass::ApplyImpl(Graph* graph) const { gpd(graph, handler); AddStatis(found_layer_norm_count); - PrettyLogDetail("--- Fused %d subgraphs into layer_norm op.", - found_layer_norm_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- Fused %d subgraphs into layer_norm op.", + found_layer_norm_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc index 864055cfa3620d..865b556f301c0d 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.cc @@ -16,6 +16,7 @@ #include #include +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -67,6 +68,81 @@ MapMatmul2MulPass::MapMatmul2MulPass() { .End(); } +MapMatmulV2ToMulPass::MapMatmulV2ToMulPass() { + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsBoolEQ(false) + .End() + .AddAttr("trans_y") + .IsBoolEQ(false) + .End(); + + AddOpCompat(OpCompat("mul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("x_num_col_dims") + .IsNumGE(1) + .End() + .AddAttr("y_num_col_dims") + .IsNumEQ(1) + .End(); +} + +MapMatmulV2ToMatmulPass::MapMatmulV2ToMatmulPass() { + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + 
.AddInput("Y") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumEQ(1.0f) + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("transpose_X") + .IsType() + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); +} + Flatten2MatmulFusePass::Flatten2MatmulFusePass() { AddOpCompat(OpCompat("matmul")) .AddInput("X") @@ -209,15 +285,11 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { std::vector y_shape = matmul_in_y->Var()->GetShape(); size_t x_rank = x_shape.size(); size_t y_rank = y_shape.size(); - flag = flag && (x_rank == 2 || x_rank == 3) && y_rank == 2; - - std::vector& next_ops = matmul_out->outputs; - flag = flag && next_ops.size() == 1 && - next_ops[0]->Name() == "elementwise_add"; + flag = flag && x_rank >= 2 && y_rank == 2; if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "MapMatmul2MulPass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -231,6 +303,8 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(matmul_in_x, mul_node); @@ -250,6 +324,157 @@ void MapMatmul2MulPass::ApplyImpl(ir::Graph* graph) const { AddStatis(found_count); } +void MapMatmulV2ToMulPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + std::string name_scope = "map_matmul_v2_to_mul_pass"; + FusePassBase::Init(name_scope, graph); + + GraphPatternDetector gpd; + patterns::MatmulV2Weight matmul_v2_weight_pattern(gpd.mutable_pattern(), + name_scope); + matmul_v2_weight_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(3) << "map matmul_v2 to mul"; + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_x, matmul_v2_in_x, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_y, matmul_v2_in_y, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_op, matmul_v2_op, + matmul_v2_weight_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_out, matmul_v2_out, + matmul_v2_weight_pattern); + + bool flag = true; + bool trans_x = + BOOST_GET_CONST(bool, matmul_v2_op->Op()->GetAttr("trans_x")); + bool trans_y = + BOOST_GET_CONST(bool, matmul_v2_op->Op()->GetAttr("trans_y")); + flag = flag && !trans_x && !trans_y; + + std::vector x_shape = matmul_v2_in_x->Var()->GetShape(); + std::vector y_shape = matmul_v2_in_y->Var()->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + flag = flag && x_rank >= 2 && y_rank == 2; + + if (flag) { + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "MapMatmulV2ToMulPass in op compat failed."; + return; + } + OpDesc desc(matmul_v2_op->Op()->Block()); + desc.SetType("mul"); + desc.SetInput("X", {matmul_v2_in_x->Name()}); + desc.SetInput("Y", {matmul_v2_in_y->Name()}); + desc.SetOutput("Out", {matmul_v2_out->Name()}); + desc.SetAttr("x_num_col_dims", static_cast(x_rank - 1)); + desc.SetAttr("y_num_col_dims", 1); + if (matmul_v2_op->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_v2_op->Op()->GetAttr("enable_int8")); + desc.SetAttr("X_scale", matmul_v2_op->Op()->GetAttr("X_scale")); + desc.SetAttr("weight_scale", + 
matmul_v2_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_v2_op->Op()->GetAttr("out_threshold")); + } + auto mul_node = g->CreateOpNode(&desc); + IR_NODE_LINK_TO(matmul_v2_in_x, mul_node); + IR_NODE_LINK_TO(matmul_v2_in_y, mul_node); + IR_NODE_LINK_TO(mul_node, matmul_v2_out); + GraphSafeRemoveNodes(graph, {matmul_v2_op}); + ++found_count; + + if (!IsCompat(desc)) { + LOG(WARNING) << "MapMatmulV2ToMulPass in out mul op compat failed."; + return; + } + } + }; + + gpd(graph, handler); + AddStatis(found_count); +} + +void MapMatmulV2ToMatmulPass::ApplyImpl(ir::Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + std::string name_scope = "map_matmul_v2_to_matmul_pass"; + FusePassBase::Init(name_scope, graph); + + GraphPatternDetector gpd; + patterns::MatmulV2 matmul_v2_pattern(gpd.mutable_pattern(), name_scope); + matmul_v2_pattern(); + + int found_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "map matmul_v2 to matmul"; + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_x, matmul_v2_in_x, + matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_in_y, matmul_v2_in_y, + matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_op, matmul_v2_op, matmul_v2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_v2_out, matmul_v2_out, matmul_v2_pattern); + if (!IsCompat(subgraph, g)) { + LOG(WARNING) << "MapMatmulV2ToMatmulPass in op compat failed."; + return; + } + + std::vector x_shape = matmul_v2_in_x->Var()->GetShape(); + std::vector y_shape = matmul_v2_in_y->Var()->GetShape(); + if (x_shape.size() != y_shape.size()) { + LOG(WARNING) + << "matmul op not support broadcast, please check inputs'shape. "; + return; + } + uint64_t dims = 2; + for (size_t i = 0; i < x_shape.size() - dims; ++i) { + if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) { + LOG(WARNING) << "matmul op not support broadcast, please check " + "inputs'shape[i]. 
"; + return; + } + } + + OpDesc desc(matmul_v2_op->Op()->Block()); + desc.SetType("matmul"); + desc.SetInput("X", {matmul_v2_in_x->Name()}); + desc.SetInput("Y", {matmul_v2_in_y->Name()}); + desc.SetOutput("Out", {matmul_v2_out->Name()}); + desc.SetAttr("transpose_X", matmul_v2_op->Op()->GetAttr("trans_x")); + desc.SetAttr("transpose_Y", matmul_v2_op->Op()->GetAttr("trans_y")); + desc.SetAttr("alpha", 1.0f); + if (matmul_v2_op->Op()->HasAttr("use_mkldnn")) { + desc.SetAttr("use_mkldnn", matmul_v2_op->Op()->GetAttr("use_mkldnn")); + } + if (matmul_v2_op->Op()->HasAttr("enable_int8")) { + desc.SetAttr("enable_int8", matmul_v2_op->Op()->GetAttr("enable_int8")); + desc.SetAttr("X_scale", matmul_v2_op->Op()->GetAttr("X_scale")); + desc.SetAttr("weight_scale", matmul_v2_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_v2_op->Op()->GetAttr("out_threshold")); + } + auto matmul_node = g->CreateOpNode(&desc); + IR_NODE_LINK_TO(matmul_v2_in_x, matmul_node); + IR_NODE_LINK_TO(matmul_v2_in_y, matmul_node); + IR_NODE_LINK_TO(matmul_node, matmul_v2_out); + GraphSafeRemoveNodes(graph, {matmul_v2_op}); + ++found_count; + + if (!IsCompat(desc)) { + LOG(WARNING) << "MapMatmulV2ToMatmulPass in out matmul op compat failed."; + return; + } + }; + + gpd(graph, handler); + AddStatis(found_count); +} + void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); @@ -296,7 +521,7 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Squeeze2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -310,6 +535,8 @@ void Squeeze2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(squeeze2_in_x, mul_node); @@ -438,7 +665,7 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (flag) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Reshape2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -452,9 +679,11 @@ void Reshape2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } if (!IsCompat(desc)) { - LOG(WARNING) << "reshape2 matmul pass in out mul op compat failed."; + LOG(WARNING) << "Reshape2MatmulFusePass in out mul op compat failed."; return; } auto mul_node = g->CreateOpNode(&desc); @@ -523,7 +752,7 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { if (pattern_found) { if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "Pass in op compat failed."; + LOG(WARNING) << "Flatten2MatmulFusePass in op compat failed."; return; } OpDesc desc(matmul_op->Op()->Block()); @@ -537,6 +766,8 @@ void Flatten2MatmulFusePass::ApplyImpl(ir::Graph* graph) const { desc.SetAttr("enable_int8", matmul_op->Op()->GetAttr("enable_int8")); 
desc.SetAttr("X_scale", matmul_op->Op()->GetAttr("X_scale")); desc.SetAttr("weight_scale", matmul_op->Op()->GetAttr("weight_scale")); + desc.SetAttr("out_threshold", + matmul_op->Op()->GetAttr("out_threshold")); } auto mul_node = g->CreateOpNode(&desc); IR_NODE_LINK_TO(flatten2_in_x, mul_node); @@ -567,6 +798,22 @@ REGISTER_PASS_CAPABILITY(map_matmul_to_mul_pass) .LE("matmul", 1) .EQ("mul", 0)); +REGISTER_PASS(map_matmul_v2_to_mul_pass, + paddle::framework::ir::MapMatmulV2ToMulPass); +REGISTER_PASS_CAPABILITY(map_matmul_v2_to_mul_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("matmul_v2", 0) + .EQ("mul", 0)); + +REGISTER_PASS(map_matmul_v2_to_matmul_pass, + paddle::framework::ir::MapMatmulV2ToMatmulPass); +REGISTER_PASS_CAPABILITY(map_matmul_v2_to_matmul_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("matmul_v2", 0) + .LE("matmul", 1)); + REGISTER_PASS(squeeze2_matmul_fuse_pass, paddle::framework::ir::Squeeze2MatmulFusePass); REGISTER_PASS_CAPABILITY(squeeze2_matmul_fuse_pass) diff --git a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h index 192dcfc00f9d34..a924cd8ddf92c6 100644 --- a/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h +++ b/paddle/fluid/framework/ir/map_matmul_to_mul_pass.h @@ -46,6 +46,30 @@ class MapMatmul2MulPass : public FusePassBase { void ApplyImpl(Graph* graph) const override; }; +/* + * Map matmul_v2 to mul, the same as MapMatmul2MulPass. + */ +class MapMatmulV2ToMulPass : public FusePassBase { + public: + MapMatmulV2ToMulPass(); + virtual ~MapMatmulV2ToMulPass() {} + + protected: + void ApplyImpl(Graph* graph) const override; +}; + +/* + * Map matmul_v2 to matmul, not supoort broadcast. + */ +class MapMatmulV2ToMatmulPass : public FusePassBase { + public: + MapMatmulV2ToMatmulPass(); + virtual ~MapMatmulV2ToMatmulPass() {} + + protected: + void ApplyImpl(Graph* graph) const override; +}; + /* * Fuse squeeze2+matmul to mul, so the optimization can use fc_fuse_pass. * The squeeze2 op must satisfy the following conditions: diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc index 849d0dabab7796..d09de5be84c358 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/inplace_addto_op_pass.cc @@ -179,7 +179,8 @@ void InplaceAddToOpPass::Run(Graph *graph) const { out_var_ptr->GeneratedOp()); // NOTE(zhiqiu): currently, only conv2d_grad supports addto strategy - if (right_generated_op->Name() != "conv2d_grad") { + if (right_generated_op->Name() != "conv2d_grad" && + right_generated_op->Name() != "resnet_unit_grad") { continue; } @@ -224,11 +225,13 @@ static bool IsValidConv2DGradDataGradNode(const Node &node) { if (node.inputs.empty()) return false; auto *generated_op = node.inputs[0]; auto *op_desc = generated_op->Op(); - if (op_desc == nullptr || op_desc->Type() != "conv2d_grad") { + if (op_desc == nullptr || (op_desc->Type() != "conv2d_grad" && + op_desc->Type() != "resnet_unit_grad")) { return false; } const auto &outputs = op_desc->Outputs(); - auto iter = outputs.find(GradVarName("Input")); + std::string grad_var_name = op_desc->Type() == "conv2d_grad" ? 
"Input" : "X"; + auto iter = outputs.find(GradVarName(grad_var_name)); return iter != outputs.end() && !iter->second.empty() && iter->second[0] == node.Name() && !op_desc->GetAttrIfExists("use_addto"); diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc index 3fdb87f2544036..c5bb4bf0b2fc97 100644 --- a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc @@ -150,8 +150,9 @@ void FuseBatchNormActOneDNNPass::FuseBatchNormAct( gpd(graph, handler); AddStatis(found_bn_act_count); - PrettyLogDetail("--- fused %d batch norm with relu activation", - found_bn_act_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d batch norm with relu activation", + found_bn_act_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index 85d308c7eb30db..093fd5ec538db1 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -68,9 +68,9 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, bool approximate = BOOST_GET_CONST(bool, act_op->GetAttr("approximate")); std::string type = approximate ? "_tanh" : "_erf"; fc_op->SetAttr("activation_type", act_type + type); - } else + } else { fc_op->SetAttr("activation_type", act_type); - + } fc_op->SetAttr("use_mkldnn", true); fc_op->SetOutput("Out", {act_out->Name()}); @@ -82,8 +82,9 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, gpd(graph, handler); AddStatis(found_fc_act_count); - PrettyLogDetail("--- fused %d fc with %s activation", found_fc_act_count, - act_type); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d fc with %s activation", found_fc_act_count, + act_type); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc index e5bdb08fe4ab48..34a35877a7f256 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.cc @@ -23,7 +23,9 @@ namespace framework { namespace ir { MatmulTransposeReshapeMKLDNNPass::MatmulTransposeReshapeMKLDNNPass() { - AddOpCompat(OpCompat("matmul")) + op_name_ = "matmul"; + + AddOpCompat(OpCompat(op_name_)) .AddInput("X") .IsTensor() .End() @@ -89,7 +91,7 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { patterns::MatmulTransposeReshapePattern mtrp(gpd.mutable_pattern(), name_scope_); - mtrp(); + mtrp(op_name_); int found_matmul_transpose_reshape_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, @@ -98,7 +100,7 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { LOG(WARNING) << "Pass in op compat failed."; return; } - VLOG(4) << "handle matmul_transpose_reshape fuse"; + VLOG(4) << "handle " + op_name_ + "_transpose_reshape fuse"; GET_IR_NODE_FROM_SUBGRAPH(matmul_op, matmul_op, mtrp); GET_IR_NODE_FROM_SUBGRAPH(matmul_out, matmul_out, mtrp); GET_IR_NODE_FROM_SUBGRAPH(transpose_op, transpose_op, mtrp); @@ -118,17 +120,17 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { const bool supported_transpose_axis = std::equal( transpose_axis.begin(), transpose_axis.end(), supported_axis.begin()); if 
(transpose_out_size != 4) { - VLOG(3) << "do not perform matmul_transpose_reshape fuse: " + VLOG(3) << "do not perform " + op_name_ + "_transpose_reshape fuse: " << "supported rank is 4, received " << transpose_out_size; return; } if (!supported_transpose_axis) { - VLOG(3) << "do not perform matmul_transpose_reshape fuse: " + VLOG(3) << "do not perform " + op_name_ + "_transpose_reshape fuse: " << "supported transpose axis for the fuse are {0, 2, 1, 3}"; return; } if (reshape_out_size != 3) { - VLOG(3) << "do not perform matmul_transpose_reshape fuse: " + VLOG(3) << "do not perform " + op_name_ + "_transpose_reshape fuse: " << "reshape_out supported rank is 3, received " << reshape_out_size; return; @@ -149,10 +151,12 @@ void MatmulTransposeReshapeMKLDNNPass::ApplyImpl(ir::Graph *graph) const { gpd(graph, handler); AddStatis(found_matmul_transpose_reshape_count); - std::stringstream msg_ss; - msg_ss << "--- Fused " << found_matmul_transpose_reshape_count - << " MatmulTransposeReshape patterns"; - paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_matmul_transpose_reshape_count + << " MatmulTransposeReshape patterns for " + op_name_ + " Op"; + paddle::string::PrettyLogDetail(msg_ss.str().c_str()); + } } } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h index 09cbe9bdf7b2fb..e03746e6e80e85 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h @@ -31,6 +31,7 @@ class MatmulTransposeReshapeMKLDNNPass : public FusePassBase { protected: void ApplyImpl(Graph* graph) const override; const std::string name_scope_{"matmul_transpose_reshape_fuse"}; + std::string op_name_; }; } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc index d98d640e1002b1..ed99989cf382f1 100644 --- a/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass_tester.cc @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h" #include +#include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h" namespace paddle { namespace framework { @@ -42,9 +42,15 @@ void SetOp(ProgramDesc *prog, const std::string &type, op->SetAttr("transpose_X", true); op->SetAttr("transpose_Y", true); } + if (type == "matmul_v2") { + op->SetInput("Y", {inputs[1]}); + op->SetAttr("use_mkldnn", true); + op->SetAttr("trans_x", true); + op->SetAttr("trans_y", true); + } } -ProgramDesc BuildProgramDesc() { +ProgramDesc BuildProgramDesc(const std::string &op_name) { ProgramDesc prog; for (auto &v : std::initializer_list( {"a1", "a2", "b", "c", "cx", "d", "dx", "e"})) { @@ -52,7 +58,7 @@ ProgramDesc BuildProgramDesc() { var->SetType(proto::VarType::SELECTED_ROWS); } - SetOp(&prog, "matmul", {"a1", "a2"}, {"b"}); + SetOp(&prog, op_name, {"a1", "a2"}, {"b"}); SetOp(&prog, "transpose2", {"b"}, {"c", "cx"}); SetOp(&prog, "reshape2", {"c"}, {"d", "dx"}); SetOp(&prog, "fc", {"d"}, {"e"}); @@ -60,13 +66,13 @@ ProgramDesc BuildProgramDesc() { return prog; } -void MainTest(const ProgramDesc &prog) { +void MainTest(const ProgramDesc &prog, const std::string &op_name) { std::unique_ptr graph(new ir::Graph(prog)); int original_nodes_num = graph->Nodes().size(); auto pass = - PassRegistry::Instance().Get("matmul_transpose_reshape_fuse_pass"); + PassRegistry::Instance().Get(op_name + "_transpose_reshape_fuse_pass"); graph.reset(pass->Apply(graph.release())); int current_nodes_num = graph->Nodes().size(); @@ -75,7 +81,7 @@ void MainTest(const ProgramDesc &prog) { for (auto *node : graph->Nodes()) { if (node->IsOp()) { auto *op = node->Op(); - if (op->Type() == "matmul") { + if (op->Type() == op_name) { EXPECT_EQ(op->GetAttrIfExists>("fused_reshape_Out"), std::vector({4, 5, 6})); EXPECT_EQ(op->GetAttrIfExists>("fused_transpose_Out"), @@ -85,12 +91,18 @@ void MainTest(const ProgramDesc &prog) { } } -TEST(MatmulTransposeReshapeFusePass, matmul_inputs) { - auto prog = BuildProgramDesc(); - MainTest(prog); +TEST(MatmulTransposeReshapeFusePass, matmul_fuse_pass) { + auto prog = BuildProgramDesc("matmul"); + MainTest(prog, "matmul"); +} + +TEST(MatmulTransposeReshapeFusePass, matmul_v2_fuse_pass) { + auto prog = BuildProgramDesc("matmul_v2"); + MainTest(prog, "matmul_v2"); } } // namespace ir } // namespace framework } // namespace paddle USE_PASS(matmul_transpose_reshape_fuse_pass); +USE_PASS(matmul_v2_transpose_reshape_fuse_pass); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc new file mode 100644 index 00000000000000..dcf4664d963da7 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h" +#include +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace ir { + +MatmulV2TransposeReshapeMKLDNNPass::MatmulV2TransposeReshapeMKLDNNPass() { + op_name_ = "matmul_v2"; + + AddOpCompat(OpCompat(op_name_)) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("axis") + .IsType>() + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsTensor() + .End() + .AddAttr("shape") + .IsType>() + .End(); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(matmul_v2_transpose_reshape_fuse_pass, + paddle::framework::ir::MatmulV2TransposeReshapeMKLDNNPass); + +REGISTER_PASS_CAPABILITY(matmul_v2_transpose_reshape_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("matmul_v2", 0) + .EQ("transpose2", 0) + .EQ("reshape2", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h new file mode 100644 index 00000000000000..60b7e981456982 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/matmul_v2_transpose_reshape_fuse_pass.h @@ -0,0 +1,35 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
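// Descriptive note on the class declared below: the matmul_v2 variant only defines a
// constructor that sets op_name_ to "matmul_v2" and registers its own op-compat rules for
// matmul_v2/transpose2/reshape2; the pattern matching and fuse logic in ApplyImpl are reused
// unchanged from MatmulTransposeReshapeMKLDNNPass, which now receives the target op type
// through op_name_.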
+ +#pragma once + +#include + +#include "paddle/fluid/framework/ir/mkldnn/matmul_transpose_reshape_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { +class MatmulV2TransposeReshapeMKLDNNPass + : public MatmulTransposeReshapeMKLDNNPass { + public: + MatmulV2TransposeReshapeMKLDNNPass(); + virtual ~MatmulV2TransposeReshapeMKLDNNPass() {} + + protected: + const std::string name_scope_{"matmul_v2_transpose_reshape_fuse"}; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc index 43c9849d5bbe3b..76a0c883c89233 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_fuse_pass.cc @@ -111,9 +111,9 @@ void MultiGRUFusePass::ApplyImpl(ir::Graph* graph) const { }; gpd(graph, handler); AddStatis(fused_count); - - PrettyLogDetail("--- fused %d pairs of concatenated multi_gru ops", - fused_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d pairs of concatenated multi_gru ops", + fused_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc index 17770d26d7de9d..7821501cc4b23c 100644 --- a/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/multi_gru_seq_fuse_pass.cc @@ -126,9 +126,9 @@ void MultiGruSeqFusePass::ApplyImpl(ir::Graph* graph) const { }; gpd(graph, handler); AddStatis(fused_count); - - PrettyLogDetail("--- fused %d sequences of two multi_gru ops", - fused_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d sequences of two multi_gru ops", + fused_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc index 26692849d977b5..e408440f26f1c2 100644 --- a/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/reshape_transpose_matmul_mkldnn_fuse_pass.cc @@ -148,13 +148,14 @@ void ReshapeTransposeMatmulMkldnnFusePass::Fuse( gpd(graph, handler); AddStatis(found_reshape_transpose_matmul_count); - - std::stringstream msg_ss; - msg_ss << "--- Fused " << found_reshape_transpose_matmul_count - << " ReshapeTransposeMatmulMkldnn patterns"; - if (with_reshape_xshape) msg_ss << " with reshape's xshape"; - if (with_transpose_xshape) msg_ss << " with transpose's xshape"; - string::PrettyLogDetail(msg_ss.str().c_str()); + if (!Has("disable_logs") || !Get("disable_logs")) { + std::stringstream msg_ss; + msg_ss << "--- Fused " << found_reshape_transpose_matmul_count + << " ReshapeTransposeMatmulMkldnn patterns"; + if (with_reshape_xshape) msg_ss << " with reshape's xshape"; + if (with_transpose_xshape) msg_ss << " with transpose's xshape"; + string::PrettyLogDetail(msg_ss.str().c_str()); + } } void ReshapeTransposeMatmulMkldnnFusePass::ApplyImpl(ir::Graph *graph) const { diff --git a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc index 13f1fa50d080a3..0fc458723ffe43 100644 --- a/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/scale_matmul_fuse_pass.cc @@ -129,8 +129,9 @@ void ScaleMatmulFusePass::ApplyImpl(ir::Graph* graph) const { 
}; gpd(graph, handler); AddStatis(found_scale_matmul_fuse_count); - PrettyLogDetail("--- fused %d scale with matmul", - found_scale_matmul_fuse_count); + if (!Has("disable_logs") || !Get("disable_logs")) + PrettyLogDetail("--- fused %d scale with matmul", + found_scale_matmul_fuse_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt index 6764799d828661..fea12baf0651fa 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -1,4 +1,4 @@ -cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) +cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle op_graph_view multi_devices_helper) cc_library(multi_devices_graph_print_pass SRCS multi_devices_graph_print_pass.cc DEPS multi_devices_helper) cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc DEPS multi_devices_helper) diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc index 70b95c9154fd30..afd80e45cf65e5 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/modify_op_lock_and_record_event_pass.cc @@ -14,6 +14,7 @@ #include "paddle/fluid/framework/details/computation_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/op_graph_view.h" @@ -21,14 +22,23 @@ namespace paddle { namespace framework { namespace ir { +template +static bool IsMatchedPlaceSingleDeviceOp(details::OpHandleBase *op_base, + const platform::Place &place) { + auto *op = dynamic_cast(op_base); + return op && op->GetPlace() == place; +} + static bool IsLockAndRecordEventFreeComputationOpHandle( details::ComputationOpHandle *op, const OpGraphView &graph_view) { if (!platform::is_gpu_place(op->GetPlace()) && !platform::is_xpu_place(op->GetPlace())) return false; for (auto &pending_op : graph_view.PendingOps(op)) { - auto *tmp = dynamic_cast(pending_op); - if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) { + if (!IsMatchedPlaceSingleDeviceOp( + pending_op, op->GetPlace()) && + !IsMatchedPlaceSingleDeviceOp( + pending_op, op->GetPlace())) { return false; } } diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index c826e1c5a584ac..8bbe6a12d8abc2 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -425,15 +425,15 @@ PDNode* MultiHeadMatmulPattern::operator()() { PDNode* MultiHeadMatmulV3Pattern::operator()() { std::unordered_set matmul_ops{"matmul", "matmul_v2"}; auto* input0 = pattern->NewNode(input0_repr()); - input0->assert_is_op_input("matmul"); + input0->assert_is_ops_input(matmul_ops); // First path with scale - auto* mul0 = pattern->NewNode(mul0_repr())->assert_is_op("matmul"); + auto* mul0 = 
pattern->NewNode(mul0_repr())->assert_is_ops(matmul_ops); auto* mul0_w_var = pattern->NewNode(mul0_w_repr()) ->AsInput() - ->assert_is_op_input("matmul", "Y"); + ->assert_is_ops_input(matmul_ops, "Y"); auto* mul0_out_var = - pattern->NewNode(mul0_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(mul0_out_repr())->assert_is_ops_output(matmul_ops); decltype(mul0) eltadd0; decltype(mul0) eltadd0_b_var; @@ -461,11 +461,12 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_0_out_var->AsIntermediate()->assert_is_op_input("matmul", "X"); + transpose2_0_out_var->AsIntermediate()->assert_is_ops_input(matmul_ops, "X"); - auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk = + pattern->NewNode(matmul_qk_repr())->assert_is_ops(matmul_ops); auto* matmul_qk_out_var = - pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(matmul_qk_out_repr())->assert_is_ops_output(matmul_ops); matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); auto* eltadd_qk = @@ -499,15 +500,15 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); auto* reshape2_qkv_out_var = pattern->NewNode(reshape2_qkv_out_repr()) ->assert_is_op_output("reshape2"); - reshape2_qkv_out_var->assert_is_op_input("matmul"); + reshape2_qkv_out_var->assert_is_ops_input(matmul_ops); // Second path to matmul - auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_op("matmul"); + auto* mul1 = pattern->NewNode(mul1_repr())->assert_is_ops(matmul_ops); auto* mul1_w_var = pattern->NewNode(mul1_w_repr()) ->AsInput() - ->assert_is_op_input("matmul", "Y"); + ->assert_is_ops_input(matmul_ops, "Y"); auto* mul1_out_var = - pattern->NewNode(mul1_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(mul1_out_repr())->assert_is_ops_output(matmul_ops); decltype(mul1) eltadd1; decltype(mul1) eltadd1_b_var; @@ -534,16 +535,16 @@ PDNode* MultiHeadMatmulV3Pattern::operator()() { pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) ->assert_is_op_output("transpose2"); - transpose2_1_out_var->AsIntermediate()->assert_is_op_input( - "matmul", "Y"); // link to matmul qk + transpose2_1_out_var->AsIntermediate()->assert_is_ops_input( + matmul_ops, "Y"); // link to matmul qk // Third path to matmul - auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_op("matmul"); + auto* mul2 = pattern->NewNode(mul2_repr())->assert_is_ops(matmul_ops); auto* mul2_w_var = pattern->NewNode(mul2_w_repr()) ->AsInput() - ->assert_is_op_input("matmul", "Y"); + ->assert_is_ops_input(matmul_ops, "Y"); auto* mul2_out_var = - pattern->NewNode(mul2_out_repr())->assert_is_op_output("matmul"); + pattern->NewNode(mul2_out_repr())->assert_is_ops_output(matmul_ops); decltype(mul2) eltadd2; decltype(mul2) eltadd2_b_var; @@ -1173,6 +1174,23 @@ MultiHeadMatmulV3FusePass::MultiHeadMatmulV3FusePass() { .IsType() .End(); + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("trans_x") + .IsBoolEQ(false) + .End() + .AddAttr("trans_y") // QK(true) QKV(false) + .IsType() + .End(); + AddOpCompat(OpCompat("softmax")) .AddInput("X") .IsTensor() diff 
--git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 5958728946c2ed..22babcc719aeb4 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -437,7 +437,11 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, BOOST_GET_CONST(int, quantized_op_node->Op()->GetAttr("bit_length")); int range = ((1 << (bit_length - 1)) - 1); std::vector weight_scale; - + int quant_axis = 0; + if (dequant_op_node->Op()->HasAttr("quant_axis")) { + quant_axis = + BOOST_GET_CONST(int, dequant_op_node->Op()->GetAttr("quant_axis")); + } // Get weight scale if (dequant_type == "fake_channel_wise_dequantize_max_abs") { Node* dequant_channel_scale_node = @@ -488,6 +492,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, } } if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 1, true, + platform::errors::InvalidArgument( + "'quant_axis' of mul/matmul/fc op weight dequantized by " + "[fake_channel_wise_dequantize_max_abs]should be 1, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[1]), platform::errors::InvalidArgument( @@ -511,6 +525,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, "model, please set the 'weight_quantize_type' params as " "'channel_wise_abs_max' and generate the quantized model again.", dequant_type)); + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 0, true, + platform::errors::InvalidArgument( + "'quant_axis' of conv2d/depthwise_conv2d op weight dequantized " + "by [fake_channel_wise_dequantize_max_abs]should be 0, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[0]), platform::errors::InvalidArgument( @@ -528,6 +552,16 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, "conv2d_transpose must be dequantized by " "[fake_channel_wise_dequantize_max_abs], but got %s", dequant_type)); + if (quant_axis == 0) { + } else { + PADDLE_ENFORCE_EQ( + quant_axis == 1, true, + platform::errors::InvalidArgument( + "'quant_axis' of conv2d_transpose op weight dequantized by " + "[fake_channel_wise_dequantize_max_abs]should be 1, but " + "the received is %d", + quant_axis)); + } PADDLE_ENFORCE_EQ( weight_scale.size(), static_cast(w_dims[1]), platform::errors::InvalidArgument( @@ -548,7 +582,8 @@ void QuantDequantFusePass::FuseDequant(ir::Graph* graph, Scope* scope, std::string new_input = quantized_op_input_node->Name(); std::string new_output = dequant_op_out_node->Name(); - framework::OpDesc new_op_desc(base_op_desc, nullptr); + framework::OpDesc new_op_desc(base_op_desc, + quantized_op_node->Op()->Block()); new_op_desc.SetType(quantized_op_type); new_op_desc.SetAttr("enable_int8", true); if (quantized_op_type == "conv2d" || quantized_op_type == "conv2d_fusion" || diff --git a/paddle/fluid/framework/new_executor/event_count.h b/paddle/fluid/framework/new_executor/event_count.h index 0c6d49042d22db..7f1e3670056fcc 100644 --- a/paddle/fluid/framework/new_executor/event_count.h +++ b/paddle/fluid/framework/new_executor/event_count.h @@ -50,11 +50,13 @@ #include #include #include -#include "paddle/fluid/framework/new_executor/workqueue_utils.h" namespace paddle { namespace framework { +void* AlignedMalloc(size_t size, size_t alignment); 
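The quant/dequant fuse pass above starts reading an optional quant_axis attribute (defaulting to 0 when absent) and rejecting values that do not match the axis expected for the quantized op type. The sketch below mirrors that logic in a self-contained form; the expected-axis table and the ReadQuantAxis/CheckQuantAxis names are stand-ins built from the checks visible in the hunk, not Paddle functions.

#include <map>
#include <stdexcept>
#include <string>

// quant_axis defaults to 0 when the attribute is missing.
int ReadQuantAxis(const std::map<std::string, int>& attrs) {
  auto it = attrs.find("quant_axis");
  return it == attrs.end() ? 0 : it->second;
}

// A non-zero quant_axis must match the axis expected for the op type; the
// table values mirror the enforce calls in the pass (1 for mul/matmul/fc and
// conv2d_transpose weights, 0 for conv2d/depthwise_conv2d weights).
void CheckQuantAxis(const std::string& quantized_op_type, int quant_axis) {
  static const std::map<std::string, int> kExpectedAxis = {
      {"mul", 1},    {"matmul", 1},           {"fc", 1},
      {"conv2d", 0}, {"depthwise_conv2d", 0}, {"conv2d_transpose", 1}};
  if (quant_axis == 0) return;  // 0 is always accepted, as in the pass
  auto it = kExpectedAxis.find(quantized_op_type);
  if (it != kExpectedAxis.end() && quant_axis != it->second) {
    throw std::invalid_argument("unexpected quant_axis " +
                                std::to_string(quant_axis) + " for " +
                                quantized_op_type);
  }
}

int main() {
  CheckQuantAxis("conv2d", ReadQuantAxis({{"quant_axis", 0}}));  // ok
  CheckQuantAxis("mul", ReadQuantAxis({}));                      // ok, default
  // CheckQuantAxis("conv2d", 1);                                // would throw
  return 0;
}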
+void AlignedFree(void* memory_ptr); + class EventCount { public: class Waiter; diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 7d9d3d5fef14a8..d6ea840362e7ef 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -23,6 +23,8 @@ PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, true, "Use inplace in new executor"); +constexpr const char* kExceptionCaught = "ExceptionCaught"; + namespace paddle { namespace framework { // NOTE(Aurelius84): Need a better strategy to determine it. @@ -37,11 +39,14 @@ InterpreterCore::InterpreterCore(const platform::Place& place, main_program_(main_prog), global_scope_(global_scope), stream_analyzer_(place), - async_work_queue_(kHostNumThreads) { + async_work_queue_(kHostNumThreads, &main_thread_blocker_) { is_build_ = false; feed_names_ = feed_names; + exception_notifier_ = main_thread_blocker_.RegisterEvent( + kExceptionCaught, [this]() { return exception_holder_.IsCaught(); }); + // Step1: add feedop and fetchop to main_program AddFetch(fetch_names); @@ -118,6 +123,8 @@ void InterpreterCore::Convert() { temp_inst.input_index_ = vec_func_list_[i].input_index; temp_inst.output_index_ = vec_func_list_[i].output_index; temp_inst.type_ = vec_func_list_[i].type_; + temp_inst.no_data_transform_index_ = + vec_func_list_[i].no_data_transform_index; OpInOutInfo info; @@ -189,8 +196,6 @@ void InterpreterCore::Convert() { for (auto inst_id : filter_next) { dependecy_count_[inst_id]++; } - vec_instruction_[i].next_instruction_.all_next_ops_ = - std::move(filter_next); } for (size_t i = 0; i < vec_instruction_.size(); ++i) { @@ -356,65 +361,145 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) { void InterpreterCore::ExecuteInstructionList( const std::vector& vec_instr) { - auto atomic_deps = async_work_queue_.PrepareAtomicDeps(dependecy_count_); - auto atomic_var_ref = async_work_queue_.PrepareAtomicVarRef(vec_meta_info_); - std::atomic op_run_number{0}; + async_work_queue_.PrepareAtomicDeps(dependecy_count_); + async_work_queue_.PrepareAtomicVarRef(vec_meta_info_); + op_run_number_ = 0; + + exception_holder_.Clear(); for (size_t i = 0; i < dependecy_count_.size(); ++i) { if (dependecy_count_[i] == 0) { - async_work_queue_.AddTask(vec_instr[i].type_, [&, i]() { - RunInstructionAsync(i, &atomic_deps, &atomic_var_ref, &op_run_number); - }); + async_work_queue_.AddTask(vec_instr[i].type_, + [&, i] { RunInstructionAsync(i); }); } } - async_work_queue_.WaitEmpty(); + auto event_id = main_thread_blocker_.WaitEvent(); + VLOG(3) << "event_id " << event_id; + + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(4) << "Exception caught " << exception_holder_.Type(); + exception_holder_.ReThrow(); + } PADDLE_ENFORCE_EQ( - op_run_number.load(), vec_instr.size(), + op_run_number_.load(), vec_instr.size(), platform::errors::Fatal( "Required op_run_number == %d, but received op_run_number = %d.", - vec_instr.size(), op_run_number.load())); + vec_instr.size(), op_run_number_.load())); } -void InterpreterCore::RunInstructionAsync(size_t instr_id, - AtomicVectorSizeT* atomic_deps, - AtomicVectorSizeT* atomic_var_ref, - std::atomic* op_run_number) { - auto& instr_node = vec_instruction_[instr_id]; - platform::RecordEvent instruction_event( - instr_node.kernel_func_.operator_base_->Type()); - event_manager_.WaitEvent(instr_node, place_); - - RunInstruction(instr_node); +void 
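The interpreter core changes above replace "wait until the work queue is empty" with "block the main thread until some named event fires", where both queue drain and a caught exception are registered as events. The sketch below shows that control flow with a plain mutex and condition variable; SimpleEventsWaiter is a hypothetical stand-in, since the EventsWaiter added later in this patch is built on the lock-free EventCount rather than a condvar.

#include <atomic>
#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

class SimpleEventsWaiter {
 public:
  void RegisterEvent(std::string name, std::function<bool()> checker) {
    std::lock_guard<std::mutex> guard(mu_);
    events_.push_back({std::move(name), std::move(checker)});
  }
  void Notify() {
    std::lock_guard<std::mutex> guard(mu_);  // taking the lock avoids a lost wake-up
    cv_.notify_one();
  }
  // Blocks until any registered checker returns true, then returns its name.
  std::string WaitEvent() {
    std::unique_lock<std::mutex> lock(mu_);
    std::string fired;
    cv_.wait(lock, [&] {
      for (auto& e : events_) {
        if (e.checker()) {
          fired = e.name;
          return true;
        }
      }
      return false;
    });
    return fired;
  }

 private:
  struct Event {
    std::string name;
    std::function<bool()> checker;
  };
  std::mutex mu_;
  std::condition_variable cv_;
  std::vector<Event> events_;
};

int main() {
  SimpleEventsWaiter waiter;
  std::atomic<bool> exception_caught{false};
  waiter.RegisterEvent("ExceptionCaught",
                       [&] { return exception_caught.load(); });
  std::thread worker([&] {
    exception_caught.store(true);  // e.g. an op kernel threw on this thread
    waiter.Notify();
  });
  std::cout << "main thread woke up on: " << waiter.WaitEvent() << "\n";
  worker.join();
  return 0;
}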
InterpreterCore::RunNextInstructions( + const Instruction& instr, std::queue* reserved_next_ops) { + auto& next_instr = instr.next_instruction_; + auto& atomic_deps = async_work_queue_.AtomicDeps(); + auto IsReady = [&](size_t next_id) { + return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1; + }; - event_manager_.RecordEvent(instr_node, place_); - op_run_number->fetch_add(1, std::memory_order_relaxed); + if (instr.type_ == OpFuncType::kQueueAsync) { + // move all sync_ops into other threads + for (auto next_id : next_instr.synchronize_run_) { + if (IsReady(next_id)) { + async_work_queue_.AddTask( + vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + // keep all async_ops running in current thread + for (auto next_id : next_instr.direct_run_) { + if (IsReady(next_id)) { + reserved_next_ops->push(next_id); + } + } + for (auto next_id : next_instr.event_wait_run_) { + if (IsReady(next_id)) { + reserved_next_ops->push(next_id); + } + } + } else { + // move async_ops into async_thread + for (auto next_id : next_instr.event_wait_run_) { + if (IsReady(next_id)) { + async_work_queue_.AddTask( + vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + auto direct_run_ops = interpretercore::merge_vector( + next_instr.synchronize_run_, next_instr.direct_run_); + size_t first_op = 0; + for (auto next_id : direct_run_ops) { + if (IsReady(next_id)) { + // only keep one op running in current thread + if (first_op == 0) { + first_op = next_id; + continue; + } + // move rest ops into other threads + async_work_queue_.AddTask( + vec_instruction_[next_id].type_, + [&, next_id] { RunInstructionAsync(next_id); }); + } + } + if (first_op != 0) reserved_next_ops->push(first_op); + } +} - auto& next_instr = instr_node.next_instruction_.all_next_ops_; +void InterpreterCore::RunInstructionAsync(size_t instr_id) { + std::queue ready_ops; + ready_ops.push(instr_id); + while (!ready_ops.empty()) { + instr_id = ready_ops.front(); + ready_ops.pop(); + auto& instr_node = vec_instruction_[instr_id]; + auto* op = instr_node.kernel_func_.operator_base_; + platform::RecordEvent instruction_event(op->Type()); + event_manager_.WaitEvent(instr_node, place_); + + try { + RunInstruction(instr_node); + } catch (platform::EnforceNotMet& ex) { + framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); + exception_holder_.Catch(std::make_exception_ptr(std::move(ex))); + } catch (platform::EOFException&) { + exception_holder_.Catch(std::current_exception()); + } catch (std::exception& ex) { + LOG(WARNING) << op->Type() << " raises an exception " + << platform::demangle(typeid(ex).name()) << ", " + << ex.what(); + exception_holder_.Catch(std::current_exception()); + } catch (...) 
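The scheduling rule in RunNextInstructions above is: a successor becomes ready when its atomic dependency counter drops to zero (fetch_sub returning 1), the first ready successor stays in the current thread's local queue, and the remaining ready successors are handed to the work queue. A single-threaded, self-contained sketch of just that rule follows; Dispatch is a hypothetical stand-in that only prints where the real code would enqueue a task.

#include <atomic>
#include <cstdio>
#include <memory>
#include <queue>
#include <vector>

struct SchedulerSketch {
  std::vector<std::unique_ptr<std::atomic<int>>> deps;  // remaining deps per op
  std::vector<std::vector<int>> successors;             // op id -> next op ids

  void Dispatch(int id) { std::printf("dispatch %d to the work queue\n", id); }

  void RunFrom(int start) {
    std::queue<int> ready;
    ready.push(start);
    while (!ready.empty()) {
      int id = ready.front();
      ready.pop();
      std::printf("run %d locally\n", id);
      int first = -1;
      for (int next : successors[id]) {
        // fetch_sub returns the value *before* subtraction, so 1 means this
        // was the last unmet dependency.
        if (deps[next]->fetch_sub(1, std::memory_order_relaxed) == 1) {
          if (first == -1) {
            first = next;   // keep one ready op in the current thread
          } else {
            Dispatch(next); // move the rest to other threads
          }
        }
      }
      if (first != -1) ready.push(first);
    }
  }
};

int main() {
  SchedulerSketch s;
  // DAG: 0 -> 1, 0 -> 2, {1, 2} -> 3
  for (int d : {0, 1, 1, 2}) {
    s.deps.emplace_back(std::make_unique<std::atomic<int>>(d));
  }
  s.successors = {{1, 2}, {3}, {3}, {}};
  s.RunFrom(0);
  return 0;
}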
{ + LOG(WARNING) << op->Type() << " raises an unknown exception"; + exception_holder_.Catch(std::current_exception()); + } - for (auto next_i : next_instr) { - // fetch_sub return value before applying sub - bool is_ready = - atomic_deps->at(next_i)->fetch_sub(1, std::memory_order_relaxed) == 1; - if (is_ready) { - async_work_queue_.AddTask(vec_instruction_[next_i].type_, [=]() { - RunInstructionAsync(next_i, atomic_deps, atomic_var_ref, op_run_number); - }); + if (UNLIKELY(exception_holder_.IsCaught())) { + VLOG(4) << "Exception caught"; + if (exception_notifier_ != nullptr) { + exception_notifier_->NotifyEvent(); + } + return; } + + event_manager_.RecordEvent(instr_node, place_); + op_run_number_.fetch_add(1, std::memory_order_relaxed); + + // GC infomation + CheckGC(instr_id, instr_node.gc_check_var_list); + + RunNextInstructions(instr_node, &ready_ops); } - // GC infomation - CheckGC(instr_id, instr_node.gc_check_var_list, atomic_var_ref); } void InterpreterCore::CheckGC(size_t instr_id, - const std::vector& gc_check_list, - AtomicVectorSizeT* atomic_var_ref) { + const std::vector& gc_check_list) { auto& var_scope = *global_scope_; + auto& atomic_var_ref = async_work_queue_.AtomicVarRef(); for (auto var_id : gc_check_list) { - bool is_ready = atomic_var_ref->at(var_id)->fetch_sub( - 1, std::memory_order_relaxed) == 1; + bool is_ready = + atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1; if (is_ready && var_scope.vec_meta_info_[var_id].vardesc_ && !var_scope.vec_meta_info_[var_id].vardesc_->Persistable()) { gc_.Add(var_scope.var_list[var_id], gc_event_[instr_id], diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index e594f9ca8b54b5..9fba5f2cdce8b9 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -19,6 +19,7 @@ #include #include +#include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/new_executor/event_manager.h" #include "paddle/fluid/framework/new_executor/interpretercore_garbage_collector.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" @@ -26,6 +27,7 @@ #include "paddle/fluid/framework/new_executor/profiler.h" #include "paddle/fluid/framework/new_executor/stream_analyzer.h" #include "paddle/fluid/framework/new_executor/workqueue.h" +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" @@ -65,13 +67,11 @@ class InterpreterCore { void DryRunPrepare(const std::vector& feed_tensors); - void CheckGC(size_t instr_id, const std::vector& gc_check_list, - AtomicVectorSizeT* working_var_ref); + void CheckGC(size_t instr_id, const std::vector& gc_check_list); - void RunInstructionAsync(size_t instr_id, - AtomicVectorSizeT* working_dependecy_count, - AtomicVectorSizeT* working_var_ref, - std::atomic* op_run_number); + void RunInstructionAsync(size_t instr_id); + void RunNextInstructions(const Instruction& instr_id, + std::queue* reserved_next_ops); void AddFetch(const std::vector& fetch_names); void BuildSkipShareLoDInfo(); @@ -97,10 +97,14 @@ class InterpreterCore { InterpreterProfiler dry_run_profiler_; StreamAnalyzer stream_analyzer_; EventManager event_manager_; + EventsWaiter main_thread_blocker_; interpretercore::AsyncWorkQueue async_work_queue_; + details::ExceptionHolder exception_holder_; + std::shared_ptr 
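The try/catch chain above parks whatever an op kernel throws into an exception holder so the main thread can rethrow it after being woken up. The essential mechanism is std::exception_ptr; the sketch below keeps only that essence under the hypothetical name ExceptionHolderSketch, whereas Paddle's details::ExceptionHolder also records the exception type.

#include <exception>
#include <iostream>
#include <mutex>
#include <stdexcept>
#include <thread>

class ExceptionHolderSketch {
 public:
  void Catch(std::exception_ptr p) {
    std::lock_guard<std::mutex> guard(mu_);
    if (!captured_) captured_ = p;  // keep only the first exception
  }
  bool IsCaught() const { return static_cast<bool>(captured_); }
  void ReThrow() {
    if (captured_) std::rethrow_exception(captured_);
  }

 private:
  std::mutex mu_;
  std::exception_ptr captured_;
};

int main() {
  ExceptionHolderSketch holder;
  std::thread worker([&] {
    try {
      throw std::runtime_error("kernel failed");  // stands in for RunInstruction
    } catch (...) {
      holder.Catch(std::current_exception());
    }
  });
  worker.join();
  if (holder.IsCaught()) {
    try {
      holder.ReThrow();
    } catch (const std::exception& ex) {
      std::cout << "main thread sees: " << ex.what() << "\n";
    }
  }
  return 0;
}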
exception_notifier_{nullptr}; InterpreterCoreGarbageCollector gc_; std::vector gc_event_; + std::atomic op_run_number_{0}; }; } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 16df5d794f4d44..7bb0429c6228b2 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -12,31 +12,40 @@ // See the License for the specific language governing permissions and // limitations under the License. #include "paddle/fluid/framework/new_executor/interpretercore_util.h" +#include + #include "paddle/fluid/framework/executor_gc_helper.h" namespace paddle { namespace framework { namespace interpretercore { -AtomicVectorSizeT AsyncWorkQueue::PrepareAtomicDeps( +AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps( const std::vector& dependecy_count) { - AtomicVectorSizeT working_dependecy_count(dependecy_count.size()); + if (atomic_deps_.size() != dependecy_count.size()) { + atomic_deps_.clear(); + std::generate_n(std::back_inserter(atomic_deps_), dependecy_count.size(), + [] { return std::make_unique>(0); }); + } + for (size_t i = 0; i < dependecy_count.size(); ++i) { - working_dependecy_count[i] = - std::make_unique>(dependecy_count[i]); + atomic_deps_[i]->store(dependecy_count[i]); } - return working_dependecy_count; + return atomic_deps_; } -AtomicVectorSizeT AsyncWorkQueue::PrepareAtomicVarRef( +AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicVarRef( const std::vector& vec_meta_info) { - AtomicVectorSizeT working_var_ref(vec_meta_info.size()); + if (atomic_var_ref_.size() != vec_meta_info.size()) { + atomic_var_ref_.clear(); + std::generate_n(std::back_inserter(atomic_var_ref_), vec_meta_info.size(), + [] { return std::make_unique>(0); }); + } for (size_t i = 0; i < vec_meta_info.size(); ++i) { - working_var_ref[i] = - std::make_unique>(vec_meta_info[i].var_ref_count_); + atomic_var_ref_[i]->store(vec_meta_info[i].var_ref_count_); } - return working_var_ref; + return atomic_var_ref_; } bool var_can_be_deleted(const std::string& name, const BlockDesc& block) { @@ -269,6 +278,7 @@ void build_op_func_list(const platform::Place& place, // step 3. Insert memcpy_op if needed VariableValueMap& ins_map_temp = runtime_context.inputs; + std::unordered_set no_data_transform_index; for (auto& var_name_item : ins_map_temp) { for (size_t i = 0; i < var_name_item.second.size(); ++i) { auto var = var_name_item.second[i]; @@ -280,8 +290,14 @@ void build_op_func_list(const platform::Place& place, static_cast(op_base) ->GetKernelTypeForVar(var_name_item.first, *tensor_in, expected_kernel_key); - if (!platform::is_same_place(kernel_type_for_var.place_, - expected_kernel_key.place_)) { + if (platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + // record no need data transformer input var_id + auto& var_name = inputs_names[var_name_item.first][i]; + VLOG(3) << op->Type() << " found no data_transform var: " << var_name + << " with id: " << var_scope->name2id[var_name]; + no_data_transform_index.emplace(var_scope->name2id[var_name]); + } else { if (op_base->Type() == "fetch_v2") { op_base->SetAttr("deepcopy", false); } @@ -376,6 +392,7 @@ void build_op_func_list(const platform::Place& place, } } } + op_func_node.no_data_transform_index = std::move(no_data_transform_index); // step 4. 
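PrepareAtomicDeps and PrepareAtomicVarRef above stop rebuilding their vectors of atomics on every run: the vector is allocated once (atomics are neither copyable nor movable, hence the unique_ptr elements) and later runs only reset the stored values. A self-contained sketch of that reuse, with simplified names, is shown below.

#include <algorithm>
#include <atomic>
#include <iterator>
#include <memory>
#include <vector>

using AtomicVec = std::vector<std::unique_ptr<std::atomic<size_t>>>;

AtomicVec& PrepareAtomicDeps(AtomicVec& cache,
                             const std::vector<size_t>& dep_counts) {
  if (cache.size() != dep_counts.size()) {
    // First run (or a size change): allocate the atomics once.
    cache.clear();
    std::generate_n(std::back_inserter(cache), dep_counts.size(),
                    [] { return std::make_unique<std::atomic<size_t>>(0); });
  }
  // Every run: only reset the counters, no reallocation.
  for (size_t i = 0; i < dep_counts.size(); ++i) {
    cache[i]->store(dep_counts[i], std::memory_order_relaxed);
  }
  return cache;
}

int main() {
  AtomicVec cache;
  std::vector<size_t> deps = {0, 1, 2};
  PrepareAtomicDeps(cache, deps);  // allocates
  PrepareAtomicDeps(cache, deps);  // reuses the same atomics
  return cache[2]->load() == 2 ? 0 : 1;
}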
Run op kernel op_list->push_back(op_base); VLOG(3) << op_base->Type() diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 259f1c615533d9..b1e1c02ab9513b 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -33,6 +33,7 @@ #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/new_executor/new_executor_defs.h" #include "paddle/fluid/framework/new_executor/workqueue.h" +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -53,33 +54,43 @@ using AtomicVectorSizeT = std::vector>>; class AsyncWorkQueue { public: - explicit AsyncWorkQueue(size_t host_num_threads) + AsyncWorkQueue(size_t host_num_threads, EventsWaiter* waiter) : host_num_thread_(host_num_threads) { std::vector group_options; // for execute host Kernel group_options.emplace_back(/*num_threads*/ host_num_threads, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, + /*queue_empty_waiter*/ waiter); // for launch device Kernel group_options.emplace_back(/*num_threads*/ 1, - /*allow_spinning*/ true, /*track_task*/ true); + /*allow_spinning*/ true, + /*track_task*/ true, + /*queue_empty_waiter*/ waiter); queue_group_ = CreateWorkQueueGroup(group_options); } - AtomicVectorSizeT PrepareAtomicDeps( + AtomicVectorSizeT& PrepareAtomicDeps( const std::vector& dependecy_count); - AtomicVectorSizeT PrepareAtomicVarRef( + AtomicVectorSizeT& PrepareAtomicVarRef( const std::vector& vec_meta_info); - void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } + // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); } void AddTask(const OpFuncType& op_func_type, std::function fn) { queue_group_->AddTask(static_cast(op_func_type), std::move(fn)); } + void Cancel() { queue_group_->Cancel(); } + + AtomicVectorSizeT& AtomicDeps() { return atomic_deps_; } + AtomicVectorSizeT& AtomicVarRef() { return atomic_var_ref_; } + private: size_t host_num_thread_; std::unique_ptr queue_group_; + AtomicVectorSizeT atomic_deps_; + AtomicVectorSizeT atomic_var_ref_; }; std::string get_memcpy_type(const platform::Place& src_place, diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 9c0444b3157cb1..e6cff353a659d7 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -477,15 +477,10 @@ struct VariableScope { std::vector vec_meta_info_; }; -struct EventRun { - explicit EventRun(size_t op_id) : op_id_(op_id) {} - size_t op_id_; -}; struct NextInstruction { std::vector direct_run_; - std::vector event_wait_run_; - std::vector synchronize_run_; - std::vector all_next_ops_; + std::vector event_wait_run_; + std::vector synchronize_run_; }; struct EventInter { @@ -516,6 +511,8 @@ struct Instruction { std::map> input_index_; std::map> output_index_; + std::unordered_set no_data_transform_index_; + std::vector gc_check_var_list; NextInstruction next_instruction_; @@ -532,6 +529,7 @@ struct OpFuncNode { // int unsed; std::map> input_index; std::map> output_index; + std::unordered_set no_data_transform_index; OpKernelComputeFunc kernel_func_; platform::DeviceContext* dev_ctx_; // not owned diff --git 
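The AsyncWorkQueue in the header above owns two queues, a multi-threaded one for host kernels and a single-threaded one for device kernel launches, and AddTask routes a task by the op's function type. Below is a deliberately small stand-in (TinyQueue and AsyncWorkQueueSketch are hypothetical names) that shows only the routing structure; it has none of the spinning, cancellation, or waiter wiring of the real work queue.

#include <condition_variable>
#include <functional>
#include <iostream>
#include <mutex>
#include <queue>
#include <thread>
#include <vector>

enum class OpFuncType { kQueueSync = 0, kQueueAsync = 1 };

class TinyQueue {
 public:
  explicit TinyQueue(size_t num_threads) {
    for (size_t i = 0; i < num_threads; ++i) {
      workers_.emplace_back([this] { Loop(); });
    }
  }
  ~TinyQueue() {
    {
      std::lock_guard<std::mutex> g(mu_);
      done_ = true;
    }
    cv_.notify_all();
    for (auto& t : workers_) t.join();  // drains remaining tasks first
  }
  void AddTask(std::function<void()> fn) {
    {
      std::lock_guard<std::mutex> g(mu_);
      tasks_.push(std::move(fn));
    }
    cv_.notify_one();
  }

 private:
  void Loop() {
    for (;;) {
      std::function<void()> fn;
      {
        std::unique_lock<std::mutex> lock(mu_);
        cv_.wait(lock, [this] { return done_ || !tasks_.empty(); });
        if (tasks_.empty()) return;  // done_ is set and the queue is drained
        fn = std::move(tasks_.front());
        tasks_.pop();
      }
      fn();
    }
  }
  std::mutex mu_;
  std::condition_variable cv_;
  std::queue<std::function<void()>> tasks_;
  std::vector<std::thread> workers_;
  bool done_ = false;
};

struct AsyncWorkQueueSketch {
  TinyQueue host_queue{4};    // multi-threaded: host kernels
  TinyQueue device_queue{1};  // single thread: device kernel launches
  void AddTask(OpFuncType type, std::function<void()> fn) {
    (type == OpFuncType::kQueueSync ? host_queue : device_queue)
        .AddTask(std::move(fn));
  }
};

int main() {
  AsyncWorkQueueSketch q;
  q.AddTask(OpFuncType::kQueueSync, [] { std::cout << "host op\n"; });
  q.AddTask(OpFuncType::kQueueAsync, [] { std::cout << "device op\n"; });
  return 0;  // destructors drain the queues and join the workers
}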
a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h index 2997ce1fe2473a..6e56532456c6fd 100644 --- a/paddle/fluid/framework/new_executor/nonblocking_threadpool.h +++ b/paddle/fluid/framework/new_executor/nonblocking_threadpool.h @@ -19,9 +19,12 @@ namespace paddle { namespace framework { +template class TaskTracker { public: - TaskTracker() : wait_empty_cv_(1) {} + TaskTracker() = default; + + explicit TaskTracker(Notifier& notifier) : notifier_(¬ifier) {} TaskTracker(const TaskTracker&) = delete; @@ -33,32 +36,17 @@ class TaskTracker { void SubCounter() { if (1 == num_tasks_.fetch_sub(1, std::memory_order_relaxed)) { - wait_empty_cv_.Notify(true); + if (notifier_ != nullptr) { + notifier_->NotifyEvent(); + } } } - // only one user can wait at any time - void WaitTaskNumToZero() { - bool waiting = false; - if (!wait_empty_.compare_exchange_strong(waiting, true, - std::memory_order_seq_cst, - std::memory_order_relaxed)) { - abort(); - } - EventCount::Waiter* w = wait_empty_cv_.GetWaiter(0); - wait_empty_cv_.Prewait(); - if (num_tasks_.load(std::memory_order_relaxed) == 0) { - wait_empty_cv_.CancelWait(); - } else { - wait_empty_cv_.CommitWait(w); - } - wait_empty_.store(false); - } + uint64_t PendingTaskNum() { return num_tasks_.load(); } private: alignas(64) std::atomic num_tasks_{0}; - alignas(64) EventCount wait_empty_cv_; - alignas(64) std::atomic wait_empty_{false}; + Notifier* notifier_{nullptr}; }; template @@ -185,6 +173,12 @@ class ThreadPoolTempl { ec_.Notify(true); } + void WaitThreadsExit() { + for (size_t i = 0; i < thread_data_.size(); ++i) { + thread_data_[i].thread->WaitExit(); + } + } + size_t NumThreads() const { return num_threads_; } int CurrentThreadId() const { diff --git a/paddle/fluid/framework/new_executor/run_queue.h b/paddle/fluid/framework/new_executor/run_queue.h index 13035237ff8b48..e457b20a3c35d5 100644 --- a/paddle/fluid/framework/new_executor/run_queue.h +++ b/paddle/fluid/framework/new_executor/run_queue.h @@ -37,6 +37,8 @@ #include #include #include +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" +#include "paddle/fluid/memory/allocation/spin_lock.h" namespace paddle { namespace framework { @@ -101,7 +103,7 @@ class RunQueue { // PushBack adds w at the end of the queue. // If queue is full returns w, otherwise returns default-constructed Work. Work PushBack(Work w) { - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[(back - 1) & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -123,7 +125,7 @@ class RunQueue { return Work(); } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); Elem* e = &array_[back & kMask]; uint8_t s = e->state.load(std::memory_order_relaxed); @@ -145,7 +147,7 @@ class RunQueue { return 0; } - std::unique_lock lock(mutex_); + std::unique_lock lock(mutex_); unsigned back = back_.load(std::memory_order_relaxed); unsigned size = Size(); unsigned mid = back; @@ -213,7 +215,7 @@ class RunQueue { // modification counters. 
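After the change above, TaskTracker no longer owns any waiting logic: it only counts pending tasks and pings an injected notifier when the count returns to zero. A compilable sketch of that shape follows; PrintNotifier is a hypothetical stand-in for the EventsWaiter::EventNotifier used in the real code.

#include <atomic>
#include <cstdint>
#include <cstdio>

struct PrintNotifier {
  void NotifyEvent() { std::puts("queue empty"); }
};

template <typename Notifier>
class TaskTrackerSketch {
 public:
  TaskTrackerSketch() = default;
  explicit TaskTrackerSketch(Notifier& notifier) : notifier_(&notifier) {}

  void AddCounter() { num_tasks_.fetch_add(1, std::memory_order_relaxed); }

  void SubCounter() {
    // fetch_sub returns the previous value: 1 means the last pending task
    // just finished, so fire the "queue empty" notification.
    if (num_tasks_.fetch_sub(1, std::memory_order_relaxed) == 1 &&
        notifier_ != nullptr) {
      notifier_->NotifyEvent();
    }
  }

  uint64_t PendingTaskNum() const { return num_tasks_.load(); }

 private:
  std::atomic<uint64_t> num_tasks_{0};
  Notifier* notifier_{nullptr};
};

int main() {
  PrintNotifier notifier;
  TaskTrackerSketch<PrintNotifier> tracker(notifier);
  tracker.AddCounter();
  tracker.AddCounter();
  tracker.SubCounter();
  tracker.SubCounter();  // prints "queue empty"
  return 0;
}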
alignas(64) std::atomic front_; alignas(64) std::atomic back_; - std::mutex mutex_; + paddle::memory::SpinLock mutex_; Elem array_[kSize]; // SizeOrNotEmpty returns current queue size; if NeedSizeEstimate is false, diff --git a/paddle/fluid/framework/new_executor/stream_analyzer.cc b/paddle/fluid/framework/new_executor/stream_analyzer.cc index a9322d8fc88edb..ffc2da499e1f7b 100644 --- a/paddle/fluid/framework/new_executor/stream_analyzer.cc +++ b/paddle/fluid/framework/new_executor/stream_analyzer.cc @@ -38,7 +38,8 @@ std::vector StreamAnalyzer::ParseEventVarIds( std::vector new_event_var_ids; for (auto& item : next_instr.input_index_) { for (auto var_id : item.second) { - if (unique_var_ids.count(var_id) > 0) { + if (unique_var_ids.count(var_id) > 0 && + next_instr.no_data_transform_index_.count(var_id) == 0) { new_event_var_ids.push_back(var_id); } } diff --git a/paddle/fluid/framework/new_executor/thread_environment.h b/paddle/fluid/framework/new_executor/thread_environment.h index be936274186f4f..eb1ee4de90898d 100644 --- a/paddle/fluid/framework/new_executor/thread_environment.h +++ b/paddle/fluid/framework/new_executor/thread_environment.h @@ -25,7 +25,16 @@ struct StlThreadEnvironment { class EnvThread { public: explicit EnvThread(std::function f) : thr_(std::move(f)) {} - ~EnvThread() { thr_.join(); } + void WaitExit() { + if (thr_.joinable()) { + thr_.join(); + } + } + ~EnvThread() { + if (thr_.joinable()) { + thr_.join(); + } + } private: std::thread thr_; diff --git a/paddle/fluid/framework/new_executor/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue.cc index bc5a4e27dc528a..7607b3a297f843 100644 --- a/paddle/fluid/framework/new_executor/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue.cc @@ -13,13 +13,18 @@ namespace paddle { namespace framework { namespace { +using TaskTracker = TaskTracker; + class WorkQueueImpl : public WorkQueue { public: - explicit WorkQueueImpl(const WorkQueueOptions& options) - : WorkQueue(options), queue_(nullptr), tracker_(nullptr) { - if (options_.track_task) { + explicit WorkQueueImpl(const WorkQueueOptions& options) : WorkQueue(options) { + if (options_.track_task && options.queue_empty_waiter != nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); - tracker_ = new (storage) TaskTracker; + TaskTracker* tracker = reinterpret_cast(storage); + auto notifier = options.queue_empty_waiter->RegisterEvent( + kQueueEmptyEvent, + [tracker]() { return tracker->PendingTaskNum() == 0; }); + tracker_ = new (storage) TaskTracker(*notifier.get()); } queue_ = new NonblockingThreadPool(options_.num_threads, options_.allow_spinning); @@ -44,20 +49,16 @@ class WorkQueueImpl : public WorkQueue { queue_->AddTask(std::move(fn)); } - void WaitQueueEmpty() override { - if (tracker_ == nullptr) { - PADDLE_THROW( - platform::errors::Unavailable("set WorkQueueOptions.track_task = " - "true before call this interface.")); - } - tracker_->WaitTaskNumToZero(); + void Cancel() override { + queue_->Cancel(); + queue_->WaitThreadsExit(); } size_t NumThreads() const override { return queue_->NumThreads(); } private: - NonblockingThreadPool* queue_; - TaskTracker* tracker_; + NonblockingThreadPool* queue_{nullptr}; + TaskTracker* tracker_{nullptr}; }; class WorkQueueGroupImpl : public WorkQueueGroup { @@ -69,12 +70,12 @@ class WorkQueueGroupImpl : public WorkQueueGroup { void AddTask(size_t queue_idx, std::function fn) override; - void WaitQueueGroupEmpty() override; - size_t QueueNumThreads(size_t queue_idx) const override; 
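The RunQueue hunks above swap std::mutex for paddle::memory::SpinLock while keeping the std::unique_lock call sites unchanged, which works because a spinlock only needs to satisfy the BasicLockable requirements. The sketch below is a minimal such lock under the hypothetical name SpinLockSketch, assuming the real implementation adds smarter backoff than a bare yield.

#include <atomic>
#include <mutex>
#include <thread>
#include <vector>

class SpinLockSketch {
 public:
  void lock() {
    while (flag_.test_and_set(std::memory_order_acquire)) {
      std::this_thread::yield();  // crude backoff while spinning
    }
  }
  void unlock() { flag_.clear(std::memory_order_release); }

 private:
  std::atomic_flag flag_ = ATOMIC_FLAG_INIT;
};

int main() {
  SpinLockSketch lock;
  long counter = 0;
  std::vector<std::thread> threads;
  for (int t = 0; t < 4; ++t) {
    threads.emplace_back([&] {
      for (int i = 0; i < 100000; ++i) {
        std::lock_guard<SpinLockSketch> guard(lock);  // works like a mutex
        ++counter;
      }
    });
  }
  for (auto& t : threads) t.join();
  return counter == 400000 ? 0 : 1;
}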
size_t QueueGroupNumThreads() const override; + void Cancel() override; + private: std::vector queues_; NonblockingThreadPool* queues_storage_; @@ -92,9 +93,14 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( queues_storage_ = reinterpret_cast(buffer); for (size_t idx = 0; idx < num_queues; ++idx) { const auto& options = queues_options_[idx]; - if (options.track_task && tracker_ == nullptr) { + if (options.track_task && tracker_ == nullptr && + options.queue_empty_waiter != nullptr) { void* storage = AlignedMalloc(sizeof(TaskTracker), alignof(TaskTracker)); - tracker_ = new (storage) TaskTracker; + TaskTracker* tracker = reinterpret_cast(storage); + auto notifier = options.queue_empty_waiter->RegisterEvent( + kQueueEmptyEvent, + [tracker]() { return tracker->PendingTaskNum() == 0; }); + tracker_ = new (storage) TaskTracker(*notifier.get()); } queues_[idx] = new (&queues_storage_[idx]) NonblockingThreadPool(options.num_threads, options.allow_spinning); @@ -124,15 +130,6 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { queues_[queue_idx]->AddTask(std::move(fn)); } -void WorkQueueGroupImpl::WaitQueueGroupEmpty() { - if (nullptr == tracker_) { - PADDLE_THROW(platform::errors::Unavailable( - "set WorkQueueOptions.track_task = true for at least one of queues " - "before call this interface.")); - } - tracker_->WaitTaskNumToZero(); -} - size_t WorkQueueGroupImpl::QueueNumThreads(size_t queue_idx) const { assert(queue_idx < queues_.size()); return queues_.at(queue_idx)->NumThreads(); @@ -146,6 +143,15 @@ size_t WorkQueueGroupImpl::QueueGroupNumThreads() const { return total_num; } +void WorkQueueGroupImpl::Cancel() { + for (auto queue : queues_) { + queue->Cancel(); + } + for (auto queue : queues_) { + queue->WaitThreadsExit(); + } +} + } // namespace std::unique_ptr CreateSingleThreadedWorkQueue( @@ -166,7 +172,7 @@ std::unique_ptr CreateMultiThreadedWorkQueue( "WorkQueueOptions.num_threads must be " "greater than 1.")); std::unique_ptr ptr(new WorkQueueImpl(options)); - return ptr; + return std::move(ptr); } std::unique_ptr CreateWorkQueueGroup( @@ -176,7 +182,7 @@ std::unique_ptr CreateWorkQueueGroup( "For a WorkQueueGroup, the number of WorkQueueOptions " "must be greater than 1.")); std::unique_ptr ptr(new WorkQueueGroupImpl(queues_options)); - return ptr; + return std::move(ptr); } } // namespace framework diff --git a/paddle/fluid/framework/new_executor/workqueue.h b/paddle/fluid/framework/new_executor/workqueue.h index ead9d9949b7001..a299d0aaed7d29 100644 --- a/paddle/fluid/framework/new_executor/workqueue.h +++ b/paddle/fluid/framework/new_executor/workqueue.h @@ -21,15 +21,31 @@ namespace paddle { namespace framework { +constexpr const char* kQueueEmptyEvent = "QueueEmpty"; + +class EventsWaiter; + struct WorkQueueOptions { WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task) : num_threads(num_threads), allow_spinning(allow_spinning), track_task(track_task) {} + WorkQueueOptions(size_t num_threads, bool allow_spinning, bool track_task, + EventsWaiter* waiter) + : num_threads(num_threads), + allow_spinning(allow_spinning), + track_task(track_task), + queue_empty_waiter(waiter) {} + size_t num_threads; bool allow_spinning; + // If you need to blocking the calling thread to wait "queue empty", set + // track_task = true and set queue_empty_waiter. EventsWaiter::WaitEvent will + // block the calling thread until any of events (including "queue empty") + // occured. 
bool track_task; + EventsWaiter* queue_empty_waiter{nullptr}; // not owned }; class WorkQueue { @@ -44,12 +60,13 @@ class WorkQueue { virtual void AddTask(std::function fn) = 0; - // set WorkQueueOptions.track_task = true before call this - // interface, otherwise will abort() - virtual void WaitQueueEmpty() = 0; + // See WorkQueueOptions.track_task for details + // virtual void WaitQueueEmpty() = 0; virtual size_t NumThreads() const = 0; + virtual void Cancel() = 0; + protected: WorkQueueOptions options_; }; @@ -67,14 +84,15 @@ class WorkQueueGroup { virtual void AddTask(size_t queue_idx, std::function fn) = 0; - // set WorkQueueOptions.track_task = true for at least one of queues - // before call this interface, otherwise will abort() - virtual void WaitQueueGroupEmpty() = 0; + // See WorkQueueOptions.track_task for details + // virtual void WaitQueueGroupEmpty() = 0; virtual size_t QueueNumThreads(size_t queue_idx) const = 0; virtual size_t QueueGroupNumThreads() const = 0; + virtual void Cancel() = 0; + protected: std::vector queues_options_; }; diff --git a/paddle/fluid/framework/new_executor/workqueue_test.cc b/paddle/fluid/framework/new_executor/workqueue_test.cc index c229a84b145ab1..3ea0096b631e82 100644 --- a/paddle/fluid/framework/new_executor/workqueue_test.cc +++ b/paddle/fluid/framework/new_executor/workqueue_test.cc @@ -16,18 +16,21 @@ #include #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/framework/new_executor/workqueue_utils.h" TEST(WorkQueue, TestSingleThreadedWorkQueue) { VLOG(1) << "In Test"; using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueue; using paddle::framework::CreateSingleThreadedWorkQueue; + using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kLoopNum = 1000000; // CreateSingleThreadedWorkQueue + EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, &events_waiter); auto work_queue = CreateSingleThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 1u); @@ -42,7 +45,7 @@ TEST(WorkQueue, TestSingleThreadedWorkQueue) { }); // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - work_queue->WaitQueueEmpty(); + events_waiter.WaitEvent(); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum); } @@ -52,13 +55,15 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueue; using paddle::framework::CreateMultiThreadedWorkQueue; + using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; // CreateMultiThreadedWorkQueue + EventsWaiter events_waiter; WorkQueueOptions options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, &events_waiter); auto work_queue = CreateMultiThreadedWorkQueue(options); // NumThreads EXPECT_EQ(work_queue->NumThreads(), 10u); @@ -75,24 +80,28 @@ TEST(WorkQueue, TestMultiThreadedWorkQueue) { } // WaitQueueEmpty EXPECT_EQ(finished.load(), false); - work_queue->WaitQueueEmpty(); + events_waiter.WaitEvent(); EXPECT_EQ(finished.load(), true); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum); + // Cancel + work_queue->Cancel(); } TEST(WorkQueue, TestWorkQueueGroup) { using paddle::framework::WorkQueueOptions; using paddle::framework::WorkQueueGroup; using 
paddle::framework::CreateWorkQueueGroup; + using paddle::framework::EventsWaiter; std::atomic finished{false}; std::atomic counter{0}; constexpr unsigned kExternalLoopNum = 100; constexpr unsigned kLoopNum = 1000000; - // CreateMultiThreadedWorkQueue + // ThreadedWorkQueueGroup + EventsWaiter events_waiter; WorkQueueOptions sq_options(/*num_threads*/ 1, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, &events_waiter); WorkQueueOptions mq_options(/*num_threads*/ 10, /*allow_spinning*/ true, - /*track_task*/ true); + /*track_task*/ true, &events_waiter); auto queue_group = CreateWorkQueueGroup({sq_options, mq_options}); // NumThreads EXPECT_EQ(queue_group->QueueNumThreads(0), 1u); @@ -112,7 +121,9 @@ TEST(WorkQueue, TestWorkQueueGroup) { ++counter; } }); - // WaitQueueGroupEmpty() - queue_group->WaitQueueGroupEmpty(); + // WaitQueueGroupEmpty + events_waiter.WaitEvent(); EXPECT_EQ(counter.load(), kLoopNum * kExternalLoopNum + kLoopNum); + // Cancel + queue_group->Cancel(); } diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.cc b/paddle/fluid/framework/new_executor/workqueue_utils.cc index 2ea49e676a807a..2c81cffb49d827 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.cc +++ b/paddle/fluid/framework/new_executor/workqueue_utils.cc @@ -55,5 +55,62 @@ void AlignedFree(void* mem_ptr) { #endif } +constexpr EventsWaiter::EventId kEmptyEventId = -1; + +EventsWaiter::EventsWaiter() + : trigger_event_(kEmptyEventId), waiting_(false), cv_(1) {} + +std::shared_ptr EventsWaiter::RegisterEvent( + const std::string& name, EventChecker checker) { + names_.emplace_back(name); + checkers_.emplace_back(std::move(checker)); + EventId id = checkers_.size() - 1; + auto notifier = std::shared_ptr(new EventNotifier(id, this)); + notifiers_.emplace_back(notifier); + return notifier; +} + +std::string EventsWaiter::WaitEvent() { + // only one user can wait at any time + bool waiting = false; + if (!waiting_.compare_exchange_strong(waiting, true, + std::memory_order_seq_cst, + std::memory_order_relaxed)) { + PADDLE_THROW( + platform::errors::ResourceExhausted("Another thread is waiting.")); + } + EventId id = kEmptyEventId; + auto w = cv_.GetWaiter(0); + cv_.Prewait(); + int64_t event_num = checkers_.size(); + for (int64_t i = 0; id == kEmptyEventId && i < event_num; ++i) { + if (checkers_[i]()) { + id = i; + } + } + if (id != kEmptyEventId) { + cv_.CancelWait(); + } else { + cv_.CommitWait(w); + id = trigger_event_.load(std::memory_order_relaxed); + } + trigger_event_.store(kEmptyEventId, std::memory_order_relaxed); + waiting_.store(false); + return names_.at(id); +} + +void EventsWaiter::SetTriggerEvent(const EventId& id) { + trigger_event_.store(id, std::memory_order_relaxed); + cv_.Notify(true); +} + +std::string EventsWaiter::EventNotifier::GetEventName() { + return waiter_.names_.at(id_); +} + +void EventsWaiter::EventNotifier::NotifyEvent() { + waiter_.SetTriggerEvent(id_); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue_utils.h b/paddle/fluid/framework/new_executor/workqueue_utils.h index 6907f2f17da0db..a06d9f319dfeee 100644 --- a/paddle/fluid/framework/new_executor/workqueue_utils.h +++ b/paddle/fluid/framework/new_executor/workqueue_utils.h @@ -14,9 +14,15 @@ #pragma once +#include #include #include #include +#include +#include +#include +#include +#include "paddle/fluid/framework/new_executor/event_count.h" #include "paddle/fluid/platform/enforce.h" namespace paddle { @@ -63,5 +69,56 @@ 
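One detail of the EventsWaiter implementation above is that only a single thread may wait at a time: WaitEvent gates entry with a compare-and-swap on a waiting flag and rejects a second caller instead of silently corrupting state. A stripped-down, standalone version of just that gate (SingleWaiterGate is a hypothetical name) looks like this:

#include <atomic>
#include <iostream>
#include <stdexcept>

class SingleWaiterGate {
 public:
  void Enter() {
    bool expected = false;
    if (!waiting_.compare_exchange_strong(expected, true,
                                          std::memory_order_seq_cst,
                                          std::memory_order_relaxed)) {
      throw std::runtime_error("Another thread is waiting.");
    }
  }
  void Leave() { waiting_.store(false); }

 private:
  std::atomic<bool> waiting_{false};
};

int main() {
  SingleWaiterGate gate;
  gate.Enter();    // first waiter enters
  try {
    gate.Enter();  // second waiter is rejected
  } catch (const std::exception& ex) {
    std::cout << ex.what() << "\n";
  }
  gate.Leave();
  return 0;
}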
void* AlignedMalloc(size_t size, size_t alignment); void AlignedFree(void* memory_ptr); +// A multiplexing waiter, be able to wait multi events simultaneously. +// Blocking the calling thread to wait any of the registered events. +// Non-thread-safe. +class EventsWaiter { + public: + using EventId = int64_t; + + using EventChecker = std::function; + + class EventNotifier { + public: + void NotifyEvent(); + + EventId GetEventId() { return id_; } + + std::string GetEventName(); + + private: + friend EventsWaiter; + EventNotifier(EventId id, EventsWaiter* waiter) + : id_(id), waiter_(*waiter) {} + + EventId id_; + EventsWaiter& waiter_; + }; + + EventsWaiter(); + + EventsWaiter(const EventsWaiter&) = delete; + + EventsWaiter& operator=(const EventsWaiter&) = delete; + + // All the RegisterEvent functions must be called before any WaitEvent + std::shared_ptr RegisterEvent(const std::string& name, + EventChecker checker); + + // Wait any of the registered events + std::string WaitEvent(); + + private: + friend EventNotifier; + void SetTriggerEvent(const EventId& id); + + std::vector names_; + std::vector checkers_; + std::vector> notifiers_; + std::atomic trigger_event_; + std::atomic waiting_; + EventCount cv_; +}; + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 0eafbb027f0421..9470fd9b699330 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -164,7 +164,7 @@ class OpDesc { // Note: the identity only used as a key for referring to its // distributed attribute now. - uint64_t Id() { return id_; } + uint64_t Id() const { return id_; } private: template diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 670cb36dcc3aba..0cd17cdb10d55c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -76,6 +76,8 @@ static DDim GetDimsDebug(const Scope& scope, const std::string& name, } else { return var->Get().GetCompleteDims(); } + } else if (var->IsType()) { + return DDim({static_cast(var->Get().size())}); } else { return DDim({-1}); } @@ -106,6 +108,8 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { } else { return DataTypeToString(tensor.type()); } + } else if (var->IsType()) { + return "strings"; } else { return ""; } @@ -1589,14 +1593,15 @@ void OperatorWithKernel::ParseInputDataType( "not initialized.", Type(), name, ctx.InputNames(name).at(i))); proto::VarType::Type tmp = t->type(); - PADDLE_ENFORCE( - tmp == *data_type || *data_type == default_data_type, - platform::errors::InvalidArgument( - "The DataType of %s Op's duplicable Variable %s must be " - "consistent. The current variable type is (%s), but the " - "previous variable type is (%s).", - Type(), name, DataTypeToString(tmp), - DataTypeToString(*data_type))); + PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, + platform::errors::InvalidArgument( + "The DataType of %s Op's duplicable or different " + "slot Variable %s must be " + "consistent or reigster GetExpectedKernelType. 
The " + "current variable type is (%s), but the " + "previous variable type is (%s).", + Type(), name, DataTypeToString(tmp), + DataTypeToString(*data_type))); *data_type = tmp; } } diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index 68edb7c89dd872..ab812a30981f0d 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -15,8 +15,10 @@ limitations under the License. */ #pragma once #include +#include #include #include +#include "glog/logging.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt new file mode 100644 index 00000000000000..04931c7c4b35e1 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt @@ -0,0 +1,11 @@ +cc_library(cinn_cache_key SRCS cinn_cache_key.cc DEPS boost graph graph_helper lod_tensor proto_desc) +cc_library(build_cinn_pass SRCS build_cinn_pass.cc DEPS pass subgraph_detector cinn_compiler) +cc_library(transform_desc SRCS transform_desc.cc DEPS proto_desc cinn) +cc_library(cinn_graph_symbolization SRCS cinn_graph_symbolization.cc DEPS lod_tensor graph graph_helper transform_desc cinn) +cc_library(cinn_compiler SRCS cinn_compiler.cc DEPS graph lod_tensor cinn_cache_key cinn_graph_symbolization cinn) + +cc_test(cinn_cache_key_test SRCS cinn_cache_key_test.cc DEPS cinn_cache_key) +cc_test(build_cinn_pass_test SRCS build_cinn_pass_test.cc DEPS build_cinn_pass cinn_compiler) +cc_test(transform_desc_test SRCS transform_desc_test.cc DEPS transform_desc) +cc_test(cinn_graph_symbolization_test SRCS cinn_graph_symbolization_test.cc DEPS cinn_graph_symbolization) +cc_test(cinn_compiler_test SRCS cinn_compiler_test.cc DEPS cinn_compiler place proto_desc graph_viz_pass build_cinn_pass cinn) diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc new file mode 100644 index 00000000000000..0664a63c2b72b3 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.cc @@ -0,0 +1,392 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cinn/frontend/op_mapper_registry.h" +#include "cinn/frontend/op_mappers/use_op_mappers.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/ir/subgraph_detector.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using framework::ir::Graph; +using framework::ir::Node; + +using GraphNodeVec = std::vector; +using GraphNodeSet = std::unordered_set; + +// Deal with subgraph's feed input var node: +// create a new input var node and it's feed op node +void AddFeedOpAndVar(const std::unordered_set& feed_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : feed_vars) { + // create feed op + OpDesc desc; + desc.SetType("feed"); + desc.SetOutput("Out", {old_var->Name()}); + auto op = graph->CreateOpNode(&desc); + + // create new feed var node (SSAGraph) + auto var = graph->CreateVarNode(old_var->Var()); + + // link feed op and feed var + op->outputs = {var}; + var->inputs = {op}; + + // link feed var to cluster op + for (auto* old_op : old_var->outputs) { + if (cluster.count(old_op)) { + var->outputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->inputs.emplace_back(var); + } + // Do not need relink old op or old var here, they will be + // fixed in RemoveLinkFromCluster, here we just deal with + // new subgraph's node. + } + } +} + +// Deal with subgraph's parameter var node: +// create a new input var node, it's data will get by scope, +// so it don't need feed op +void AddParamVar(const std::unordered_set& param_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : param_vars) { + auto var = graph->CreateVarNode(old_var->Var()); + + for (auto* old_op : old_var->outputs) { + if (cluster.count(old_op)) { + var->outputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->inputs.emplace_back(var); + } + } + } +} + +// Deal with subgraph's outputs var node: +// create a new output var node and it's fetch op +void AddOutputVar(const std::unordered_set& output_vars, + const GraphNodeSet& cluster, + const std::unordered_map& old_op2new_op, + Graph* graph) { + for (auto* old_var : output_vars) { + auto var = graph->CreateVarNode(old_var->Var()); + + for (auto* old_op : old_var->inputs) { + if (cluster.count(old_op)) { + var->inputs.emplace_back(old_op2new_op.at(old_op)); + old_op2new_op.at(old_op)->outputs.emplace_back(var); + } + } + } +} + +// Create new subgraph with and op nodes are cluster nodes, and all +// var node are from internal nodes +std::unique_ptr CreateNewSubGraph(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_internals, + const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs) { + // Graph's constructor must has one parameter, and in our code, + // the ProgramDesc is useless, so here we pass a temporary object. 
+ auto subgraph = std::make_unique(framework::ProgramDesc()); + + std::unordered_map old_op2new_op; + for (auto* op : cluster) { + auto sub_node = subgraph->CreateOpNode(op->Op()); + old_op2new_op[op] = sub_node; + } + + std::unordered_map old_var2new_var; + for (auto* var : cluster_internals) { + Node* sub_node; + if (var->Var() == nullptr) { + sub_node = subgraph->CreateEmptyNode(var->Name(), var->NodeType()); + } else { + sub_node = subgraph->CreateVarNode(var->Var()); + } + old_var2new_var[var] = sub_node; + } + + std::unordered_set need_feed_vars; + std::unordered_set param_vars, output_vars; + // the subgraph is independently, so here we only need link + // to the node in new subgraph, and discard the link to + // out-graph. + for (auto* op : cluster) { + for (auto* var : op->inputs) { + if (cluster_internals.count(var)) { + old_op2new_op[op]->inputs.emplace_back(old_var2new_var[var]); + } else if (cluster_inputs.count(var) && var->Var() != nullptr) { + if (var->Var()->IsParameter()) { + // Parameters have been preserved in scope, compared to feed var, + // param just need add new var and don't need add feed op. + // The var is used for check whether we need preserve the tensor + // when transform paddle scope to CINN scope. + param_vars.insert(var); + } else { + // When the var is subgraph input and the var is not parameter, + // we need add a new feed op to feed the var. + need_feed_vars.insert(var); + } + } + } + for (auto* var : op->outputs) { + if (cluster_internals.count(var)) { + old_op2new_op[op]->outputs.emplace_back(old_var2new_var[var]); + } else if (cluster_outputs.count(var) && var->Var() != nullptr) { + // Create new output var node to guarantee the independency of + // subgraph. In other words, the subgraph has no connection with + // other graph, even the input graph. + output_vars.insert(var); + } + } + } + + AddFeedOpAndVar(need_feed_vars, cluster, old_op2new_op, subgraph.get()); + AddParamVar(param_vars, cluster, old_op2new_op, subgraph.get()); + AddOutputVar(output_vars, cluster, old_op2new_op, subgraph.get()); + + for (auto* var : cluster_internals) { + for (auto* op : var->inputs) { + if (cluster.count(op)) { + old_var2new_var[var]->inputs.emplace_back(old_op2new_op[op]); + } + } + for (auto* op : var->outputs) { + if (cluster.count(op)) { + old_var2new_var[var]->outputs.emplace_back(old_op2new_op[op]); + } + } + } + + return subgraph; +} + +// This interface is used to classify all variables involved in a cluster into +// three types: inputs, outputs, and internals. +// The input node is some subgraph op's input but not any subgraph op's output. +// The output node is some subgraph op's output and some out-graph op's input. +// Specially, the internal node is a node that only used by subgraph, and +// out-graph should not using this node at all. 
+// cluster_inputs & cluster_outputs & cluster_internals == NULL +// cluster_outputs | cluster_internals == all graph op's outputs node +void AnalyseClusterVariables(const GraphNodeSet& cluster, + GraphNodeSet* cluster_inputs, + GraphNodeSet* cluster_outputs, + GraphNodeSet* cluster_internals) { + // collecting all input and output of op + for (auto* op_node : cluster) { + for (auto* input_var_node : op_node->inputs) { + cluster_inputs->insert(input_var_node); + } + for (auto* output_var_node : op_node->outputs) { + cluster_outputs->insert(output_var_node); + } + } + // remove output node from cluster_inputs, + // and add cluster_internals node + for (auto* var_node : *cluster_outputs) { + if (cluster_inputs->count(var_node) > 0) { + // if a input node also exists in output list, remove + cluster_inputs->erase(var_node); + + // the internal node is must an output node of sub-graph, + // but not any input node of out-graph. + bool is_only_used_internal = true; + for (auto* next_op_node : var_node->outputs) { + is_only_used_internal &= (cluster.count(next_op_node) > 0); + } + if (is_only_used_internal) { + cluster_internals->insert(var_node); + } + } + } + + // if a output node also exists in internal list, remove. + for (auto* var_node : *cluster_internals) { + cluster_outputs->erase(var_node); + } +} + +Node* AddSpecialOpToGraph(const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs, + const std::string& compilation_key, Graph* graph) { + // add special cinn op + framework::OpDesc special_op_desc; + special_op_desc.SetType(kCinnLaunchOp); + std::vector input_names; + std::for_each(cluster_inputs.begin(), cluster_inputs.end(), + [&input_names](Node* n) { + if (n->Var() != nullptr) { + input_names.emplace_back(n->Name()); + } + }); + special_op_desc.SetInput("X", input_names); + std::vector output_names; + std::for_each(cluster_outputs.begin(), cluster_outputs.end(), + [&output_names](Node* n) { + if (n->Var() != nullptr) { + output_names.emplace_back(n->Name()); + } + }); + special_op_desc.SetOutput("Out", output_names); + special_op_desc.SetAttr(kCompilationKey, compilation_key); + special_op_desc.Flush(); + auto* special_op_node = graph->CreateOpNode(&special_op_desc); + special_op_node->inputs.assign(cluster_inputs.begin(), cluster_inputs.end()); + special_op_node->outputs.assign(cluster_outputs.begin(), + cluster_outputs.end()); + return special_op_node; +} + +void AddLinkToSpecialOp(const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs, + Node* special_op_node) { + // add new link from cluster_inputs to special_op_node + for (auto* var_node : cluster_inputs) { + var_node->outputs.push_back(special_op_node); + } + + // add new link from special_op_node to cluster_outputs + for (auto* var_node : cluster_outputs) { + var_node->inputs.push_back(special_op_node); + } +} + +void RemoveLinkFromCluster(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs) { + // remove all nodes in cluster + auto get_preserved_ops = [&cluster](const GraphNodeVec& ops) { + GraphNodeVec nodes; + for (auto* op_node : ops) { + if (cluster.find(op_node) == cluster.end()) { + nodes.emplace_back(op_node); + } + } + return nodes; + }; + + // removing useless link from cluster_inputs to cluster + for (auto* var_node : cluster_inputs) { + auto preserved_ops = get_preserved_ops(var_node->outputs); + var_node->outputs.assign(preserved_ops.begin(), preserved_ops.end()); + // According to SSA form, a var node must not be any two op's 
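The classification that AnalyseClusterVariables performs above can be shown on a toy graph: an "input" var feeds some cluster op but is produced outside the cluster, an "internal" var is produced and consumed only inside it, and whatever else a cluster op produces is an "output". The sketch below mirrors that logic with a hypothetical Node struct in place of framework::ir::Node; it is an illustration of the algorithm, not the pass itself.

#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

struct Node {
  std::string name;
  std::vector<Node*> inputs;   // for an op node: its input var nodes
  std::vector<Node*> outputs;  // for a var node: the ops consuming it
};

using NodeSet = std::unordered_set<Node*>;

void AnalyseClusterVariables(const NodeSet& cluster, NodeSet* ins,
                             NodeSet* outs, NodeSet* internals) {
  // Collect every var touched by a cluster op.
  for (Node* op : cluster) {
    for (Node* v : op->inputs) ins->insert(v);
    for (Node* v : op->outputs) outs->insert(v);
  }
  // A var that is both produced and consumed by the cluster is not an input;
  // if no consumer lies outside the cluster, it is internal.
  for (Node* v : *outs) {
    if (ins->count(v)) {
      ins->erase(v);
      bool only_used_inside = true;
      for (Node* consumer : v->outputs) {
        only_used_inside &= (cluster.count(consumer) > 0);
      }
      if (only_used_inside) internals->insert(v);
    }
  }
  // Internal vars are not outputs.
  for (Node* v : *internals) outs->erase(v);
}

int main() {
  // x -> op1 -> t -> op2 -> y, with {op1, op2} forming the cluster.
  Node x{"x"}, t{"t"}, y{"y"}, op1{"op1"}, op2{"op2"};
  op1.inputs = {&x};  op1.outputs = {&t};
  op2.inputs = {&t};  op2.outputs = {&y};
  x.outputs = {&op1}; t.inputs = {&op1}; t.outputs = {&op2}; y.inputs = {&op2};

  NodeSet cluster{&op1, &op2}, ins, outs, internals;
  AnalyseClusterVariables(cluster, &ins, &outs, &internals);
  std::cout << "inputs=" << ins.size() << " outputs=" << outs.size()
            << " internals=" << internals.size() << "\n";  // 1 1 1
  return 0;
}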
output, + // and the cluster_inputs var nodes is defined as an out-graph op's + // output, so the cluster_inputs var nodes are not any subgraph op's + // output. Do not reassign input list here. + } + + // removing useless link from cluster to cluster_outputs + for (auto* var_node : cluster_outputs) { + auto preserved_ops = get_preserved_ops(var_node->inputs); + var_node->inputs.assign(preserved_ops.begin(), preserved_ops.end()); + + // Note that cluster_outputs var node maybe some subgraph op's input, + // here we need remove them. + preserved_ops = get_preserved_ops(var_node->outputs); + var_node->outputs.assign(preserved_ops.begin(), preserved_ops.end()); + } +} + +// Removing cluster node and internals node from Graph +void RemoveSubGraphFromGraph(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_internals, + Graph* graph) { + for (auto* op_node : cluster) { + graph->RemoveNode(op_node); + } + for (auto* var_node : cluster_internals) { + graph->RemoveNode(var_node); + } +} + +// Replacing Cinn subgraph to a special op node, whose op_type is +// kCinnLaunchOp, and inputs ares cluster_inputs and outputs are +// cluster_outputs. +// Meanwhile, move all links of cluster to the special op. +void ReplaceSubGraphWithSpecialOpNode(const GraphNodeSet& cluster, + const GraphNodeSet& cluster_inputs, + const GraphNodeSet& cluster_outputs, + const GraphNodeSet& cluster_internals, + const std::string& compilation_key, + Graph* graph) { + // First, add the special op node whose name is "kCinnLaunchOp" into graph + auto special_op_node = AddSpecialOpToGraph(cluster_inputs, cluster_outputs, + compilation_key, graph); + // Second, remove all graph's links which are from or to cluster nodes + RemoveLinkFromCluster(cluster, cluster_inputs, cluster_outputs); + // Third, add new links from or to the the special op node + AddLinkToSpecialOp(cluster_inputs, cluster_outputs, special_op_node); + // Finally, remove the cinn sub graph from graph + RemoveSubGraphFromGraph(cluster, cluster_internals, graph); +} + +// Search all subgraphs which all op node supported by CINN, +// Here we using SubgraphDetector to detecte the subgraph that +// all of op node supported by CINN. We using OpMapperRegistry +// to check whether the op node supported by CINN. +void SearchAllSubgraphs(Graph* graph) { + auto teller = [](const Node* node) { + return ::cinn::frontend::OpMapperRegistry::Global()->Find(node->Name()) != + nullptr; + }; + std::vector clusters = + framework::ir::SubgraphDetector(graph, teller)(); + + auto* cinn_compiler = CinnCompiler::GetInstance(); + for (const auto& node_vec : clusters) { + // Classify var node to inputs, outputs, and internals. 
+ GraphNodeSet cluster_set(node_vec.begin(), node_vec.end()); + + GraphNodeSet cluster_inputs, cluster_outputs, cluster_internals; + AnalyseClusterVariables(cluster_set, &cluster_inputs, &cluster_outputs, + &cluster_internals); + // Create a new subgraph according to the found cluster and + // save it in CinnCompiler + std::string compilation_key = cinn_compiler->AddGraph(CreateNewSubGraph( + cluster_set, cluster_internals, cluster_inputs, cluster_outputs)); + // Replace the found cluster to a new special op node + ReplaceSubGraphWithSpecialOpNode(cluster_set, cluster_inputs, + cluster_outputs, cluster_internals, + compilation_key, graph); + } +} + +void BuildCinnPass::ApplyImpl(Graph* graph) const { SearchAllSubgraphs(graph); } + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle + +REGISTER_PASS(build_cinn_pass, paddle::framework::paddle2cinn::BuildCinnPass); diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h new file mode 100644 index 00000000000000..556ff228915e4d --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +constexpr char kCinnLaunchOp[] = "CinnLaunchOp"; +constexpr char kCompilationKey[] = "compilation_key"; + +// A pass named BuildCinnPass, the function of this pass is: +// +// a) Detect the subgraphs that can be compiled by the CINN compiler. We call a +// detected subgraph a cluster, which is consisted of several op nodes. +// +// b) Call the CINN compiler to compile each original cluster and get the +// compiled cluster, which is consisted of several kCinnLaunchOp. +// +// c) Replace the original cluster with corresponding compiled cluster on the +// original graph. +// +// In this pass, some questions are handled with cautions: +// +// a) How to determine whether two op nodes can be divided into a cluster? +// Firstly, both op nodes should be compile supported. +// Secondly, there should be a direct path between the two op nodes through a +// var node. +// Thirdly, there should be no extra path between the two op nodes through +// unsupported op nodes. +// Lastly, if op nodes a and b can be divied into a cluster, op nodes b and c +// can be divided into a cluster, a and c can also be divided into a cluster. +// The implementation of cluster detection is encapsulated in the +// SubGraphDetector +// class. +// +// b) How to deal with the links between the var nodes in global graph and the +// op nodes in a cluster? +// We first add links between the var nodes in global graph and the op nodes in +// the compiled cluster, and then remove useless links between the var nodes in +// global graph and the op nodes in the original cluster. 
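+//
+// A minimal usage sketch, assuming the pass has been registered through
+// REGISTER_PASS(build_cinn_pass, ...) and `g` is a framework::ir::Graph:
+//
+//   auto pass = paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass");
+//   pass->Apply(g.get());
+//   // Every detected cluster is now a single kCinnLaunchOp node whose
+//   // kCompilationKey attribute can be passed to CinnCompiler::FindGraph
+//   // or CinnCompiler::Compile.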
+class BuildCinnPass : public framework::ir::Pass { + protected: + void ApplyImpl(framework::ir::Graph* graph) const override; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc new file mode 100644 index 00000000000000..79a27dccb4b00c --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/build_cinn_pass_test.cc @@ -0,0 +1,526 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" + +#include +#include +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/details/build_strategy.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using framework::ir::Graph; +using framework::ir::Node; + +inline bool CheckNodeExisted(const std::unordered_set& nodes, + const std::string& op_name) { + return std::find_if(nodes.begin(), nodes.end(), [&op_name](const Node* node) { + return node->Name() == op_name; + }) != nodes.end(); +} + +inline int CountNode(const std::unordered_set& nodes, + const std::string& op_name) { + return std::count_if( + nodes.begin(), nodes.end(), + [&op_name](const Node* node) { return node->Name() == op_name; }); +} + +inline Node* GetNode(const std::unordered_set& nodes, + const std::string& op_name) { + return *std::find_if(nodes.begin(), nodes.end(), + [&op_name](const Node* node) { + return node->Name().find(op_name) != std::string::npos; + }); +} + +inline bool CheckGraphIndependence(const std::unordered_set& nodes) { + auto check_node_ok = [&nodes](Node* n1, Node* n2) -> bool { + if (n1->IsOp() && !n2->IsVar()) { + return false; + } + if (n1->IsVar() && !n2->IsOp()) { + return false; + } + if (nodes.count(n2) == 0) { + return false; + } + return true; + }; + + for (auto node : nodes) { + for (auto in : node->inputs) { + if (!check_node_ok(node, in)) { + return false; + } + } + for (auto out : node->outputs) { + if (!check_node_ok(node, out)) { + return false; + } + } + } + return true; +} + +// Get compilation_key values +std::vector GetCompilationKeys(const Graph& graph) { + std::vector compilation_keys; + for (auto& node : graph.Nodes()) { + if (node->IsOp() && node->Name() == kCinnLaunchOp) { + compilation_keys.emplace_back( + BOOST_GET_CONST(std::string, node->Op()->GetAttr(kCompilationKey))); + } + } + return compilation_keys; +} + +std::unique_ptr BuildNoCinnSubgraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + // var1 -- + // | --> fake1 --> var3 --> fake2 --> var4 + // var2 -- + OpDesc fake1_op; + fake1_op.SetType("fake1"); + OpDesc fake2_op; + 
fake2_op.SetType("fake2"); + + VarDesc var1("var1"); + VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); + VarDesc var3("var3"); + VarDesc var4("var4"); + + ir::Node* fake1 = g->CreateOpNode(&fake1_op); + ir::Node* fake2 = g->CreateOpNode(&fake2_op); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + + // fill op node + fake1->inputs = {v1, v2}; + fake1->outputs = {v3}; + fake2->inputs = {v3}; + fake2->outputs = {v4}; + + // fill variable node + v1->outputs = {fake1}; + v2->outputs = {fake1}; + + v3->inputs = {fake1}; + v3->outputs = {fake2}; + + v4->inputs = {fake2}; + + return g; +} + +TEST(BuildCinnPassTest, NoCinnSubgraph) { + auto g = BuildNoCinnSubgraph(); + auto previous_nodes = g->Nodes(); + + auto pass = + paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); + pass->Apply(g.get()); + + // After search, origin graph should no change + ASSERT_EQ(previous_nodes, g->Nodes()); + ASSERT_TRUE(CheckGraphIndependence(g->Nodes())); + + // After search, there should be no cinn subgraph + ASSERT_TRUE(GetCompilationKeys(*g).empty()); +} + +std::unique_ptr BuildAllOpSupportCinnGraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + + // v1 -- + // | --> mul --> v3 -- + // v2 -- | --> add --> v5 --> relu --> v6 + // v4 -- + + OpDesc add_op; + add_op.SetType("add"); + OpDesc mul_op; + mul_op.SetType("mul"); + OpDesc relu_op; + relu_op.SetType("relu"); + + VarDesc var1("var1"); + VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); + VarDesc var3("var3"); + VarDesc var4("var4"); + VarDesc var5("var5"); + VarDesc var6("var6"); + + ir::Node* add = g->CreateOpNode(&add_op); + ir::Node* mul = g->CreateOpNode(&mul_op); + ir::Node* relu = g->CreateOpNode(&relu_op); + + ir::Node* v0 = g->CreateEmptyNode("var0", Node::Type::kVariable); + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + ir::Node* v5 = g->CreateVarNode(&var5); + ir::Node* v6 = g->CreateVarNode(&var6); + ir::Node* v7 = g->CreateControlDepVar(); + + // fill op node + mul->inputs = {v0, v1, v2}; + mul->outputs = {v3}; + add->inputs = {v3, v4}; + add->outputs = {v5}; + relu->inputs = {v5}; + relu->outputs = {v6, v7}; + + // fill variable node + v0->outputs = {mul}; + v1->outputs = {mul}; + v2->outputs = {mul}; + + v3->inputs = {mul}; + v3->outputs = {add}; + + v4->outputs = {add}; + + v5->inputs = {add}; + v5->outputs = {relu}; + + v6->inputs = {relu}; + v7->inputs = {relu}; + + return g; +} + +TEST(BuildCinnPassTest, AllOpSupportCinn) { + auto g = BuildAllOpSupportCinnGraph(); + + auto pass = + paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); + pass->Apply(g.get()); + + // After search, the graph should as following + // v0 --| + // v1 --| |--> v6 + // v2 --| --> kCinnLaunchOp |--> v7 + // v4 --| + const auto& nodes = g->Nodes(); + ASSERT_EQ(nodes.size(), static_cast(7)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); + + // A new op named kCinnLaunchOp should be added + ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); + auto* cinn_op = GetNode(nodes, kCinnLaunchOp); + auto* v0 = GetNode(nodes, "var0"); + auto* v1 = GetNode(nodes, "var1"); + auto* v2 = GetNode(nodes, "var2"); + auto* v4 = GetNode(nodes, "var4"); + auto* v6 = GetNode(nodes, "var6"); + auto* v7 = GetNode(nodes, 
Node::kControlDepVarName); + + ASSERT_EQ( + std::unordered_set(cinn_op->inputs.begin(), cinn_op->inputs.end()), + std::unordered_set({v0, v1, v2, v4})); + ASSERT_EQ(cinn_op->outputs, std::vector({v6, v7})); + ASSERT_EQ(v1->outputs, std::vector({cinn_op})); + ASSERT_EQ(v6->inputs, std::vector({cinn_op})); + + // previous op (mul, add, relu) should all removed + ASSERT_FALSE(CheckNodeExisted(nodes, "mul")); + ASSERT_FALSE(CheckNodeExisted(nodes, "add")); + ASSERT_FALSE(CheckNodeExisted(nodes, "relu")); + + // After search, there should has just one cinn subgraph + // feed --> v1 -- + // | --> mul --> v3 -- + // v2 -- | --> add --> v5 --> relu --> v6 + // feed --> v4 -- + auto compilation_keys = GetCompilationKeys(*g); + ASSERT_EQ(compilation_keys.size(), static_cast(1)); + auto* cinn_compiler = CinnCompiler::GetInstance(); + const auto& subgraph = cinn_compiler->FindGraph(compilation_keys[0]); + + const auto& subnodes = subgraph.Nodes(); + ASSERT_EQ(subnodes.size(), static_cast(11)); + ASSERT_TRUE(CheckGraphIndependence(subnodes)); + + ASSERT_TRUE(CheckNodeExisted(subnodes, "mul")); + ASSERT_TRUE(CheckNodeExisted(subnodes, "add")); + ASSERT_TRUE(CheckNodeExisted(subnodes, "relu")); + ASSERT_EQ(CountNode(subnodes, "feed"), 2); + + // No-parameter input should has feed op + auto new_v1 = GetNode(subnodes, "var1"); + ASSERT_EQ(new_v1->inputs.size(), static_cast(1)); + ASSERT_EQ(new_v1->outputs.size(), static_cast(1)); + ASSERT_EQ(new_v1->inputs[0]->Name(), "feed"); + ASSERT_EQ(new_v1->outputs[0]->Name(), "mul"); + + // Parameter input should not has feed op + auto new_v2 = GetNode(subnodes, "var2"); + ASSERT_TRUE(new_v2->inputs.empty()); + ASSERT_EQ(new_v2->outputs.size(), static_cast(1)); + ASSERT_EQ(new_v2->outputs[0]->Name(), "mul"); +} + +std::unique_ptr BuildGraphWithOneCinnSubgraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + + // fake1 --> v1 -- + // | --> mul --> v3 --> relu --> v4 --> fake2 + // v2 -- + + OpDesc fake1_op; + fake1_op.SetType("fake1"); + OpDesc mul_op; + mul_op.SetType("mul"); + OpDesc relu_op; + relu_op.SetType("relu"); + OpDesc fake2_op; + fake2_op.SetType("fake2"); + + VarDesc var1("var1"); + VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); + VarDesc var3("var3"); + VarDesc var4("var4"); + + ir::Node* fake1 = g->CreateOpNode(&fake1_op); + ir::Node* mul = g->CreateOpNode(&mul_op); + ir::Node* relu = g->CreateOpNode(&relu_op); + ir::Node* fake2 = g->CreateOpNode(&fake2_op); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + + // fill op node + fake1->outputs = {v1}; + mul->inputs = {v2, v1}; + mul->outputs = {v3}; + relu->inputs = {v3}; + relu->outputs = {v4}; + fake2->inputs = {v4}; + + // fill variable node + v2->outputs = {mul}; + + v1->inputs = {fake1}; + v1->outputs = {mul}; + + v3->inputs = {mul}; + v3->outputs = {relu}; + + v4->inputs = {relu}; + v4->outputs = {fake2}; + + return g; +} + +TEST(BuildCinnPassTest, OneCinnSubgraph) { + auto g = BuildGraphWithOneCinnSubgraph(); + + auto pass = + paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); + pass->Apply(g.get()); + + // After search, the graph should as following + // fake1 --> v1 -- + // | --> kCinnLaunchOp --> v4 --> fake2 + // v2 -- + const auto& nodes = g->Nodes(); + ASSERT_EQ(nodes.size(), static_cast(6)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); + + // A new op named kCinnLaunchOp should be added + 
ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); + + // previous op (mul, add, relu) should be removed + ASSERT_FALSE(CheckNodeExisted(nodes, "mul")); + ASSERT_FALSE(CheckNodeExisted(nodes, "relu")); + + // previous op (fake1, fake2) should be preserved + ASSERT_TRUE(CheckNodeExisted(nodes, "fake1")); + ASSERT_TRUE(CheckNodeExisted(nodes, "fake2")); + + // After search, there should has just one cinn subgraph + // feed --> v1 -- + // | --> mul --> v3 --> relu --> v4 + // v2 -- + auto compilation_keys = GetCompilationKeys(*g); + ASSERT_EQ(compilation_keys.size(), static_cast(1)); + auto* cinn_compiler = CinnCompiler::GetInstance(); + const auto& subgraph = cinn_compiler->FindGraph(compilation_keys[0]); + + const auto& subnodes = subgraph.Nodes(); + ASSERT_EQ(subnodes.size(), static_cast(7)); + ASSERT_TRUE(CheckGraphIndependence(subnodes)); + + ASSERT_TRUE(CheckNodeExisted(subnodes, "mul")); + ASSERT_TRUE(CheckNodeExisted(subnodes, "relu")); + ASSERT_EQ(CountNode(subnodes, "feed"), 1); +} + +std::unique_ptr BuildGraphWithMultiCinnSubgraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + + // fake1 --> v1 -- + // | --> mul --> v3 --> fake2 --> v4 --> relu --> v5 --> fake3 + // v2 -- + + OpDesc fake1_op; + fake1_op.SetType("fake1"); + OpDesc mul_op; + mul_op.SetType("mul"); + OpDesc relu_op; + relu_op.SetType("relu"); + OpDesc fake2_op; + fake2_op.SetType("fake2"); + OpDesc fake3_op; + fake3_op.SetType("fake3"); + + VarDesc var1("var1"); + VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); + VarDesc var3("var3"); + VarDesc var4("var4"); + VarDesc var5("var5"); + + ir::Node* fake1 = g->CreateOpNode(&fake1_op); + ir::Node* mul = g->CreateOpNode(&mul_op); + ir::Node* relu = g->CreateOpNode(&relu_op); + ir::Node* fake2 = g->CreateOpNode(&fake2_op); + ir::Node* fake3 = g->CreateOpNode(&fake3_op); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + ir::Node* v5 = g->CreateVarNode(&var5); + + // fill op node + fake1->outputs = {v1}; + mul->inputs = {v2, v1}; + mul->outputs = {v3}; + fake2->inputs = {v3}; + fake2->outputs = {v4}; + relu->inputs = {v4}; + relu->outputs = {v5}; + fake3->inputs = {v5}; + + // fill variable node + v2->outputs = {mul}; + + v1->inputs = {fake1}; + v1->outputs = {mul}; + + v3->inputs = {mul}; + v3->outputs = {fake2}; + + v4->inputs = {fake2}; + v4->outputs = {relu}; + + v5->inputs = {relu}; + v5->outputs = {fake3}; + + return g; +} + +TEST(BuildCinnPassTest, MultiCinnSubgraph) { + auto g = BuildGraphWithMultiCinnSubgraph(); + + auto pass = + paddle::framework::ir::PassRegistry::Instance().Get("build_cinn_pass"); + pass->Apply(g.get()); + + // After search, the graph should as following + // fake1 -> v1 - + // | -> CinnOp -> v3 -> fake2 -> v4 -> CinnOp ->v5 -> fake3 + // v2 - + const auto& nodes = g->Nodes(); + ASSERT_EQ(nodes.size(), static_cast(10)); + ASSERT_TRUE(CheckGraphIndependence(nodes)); + + // A new op named kCinnLaunchOp should be added + ASSERT_TRUE(CheckNodeExisted(nodes, kCinnLaunchOp)); + ASSERT_EQ(CountNode(nodes, kCinnLaunchOp), 2); + + // previous op (mul, add, relu) should be removed + ASSERT_FALSE(CheckNodeExisted(nodes, "mul")); + ASSERT_FALSE(CheckNodeExisted(nodes, "relu")); + + // previous op (fake1, fake2) should be preserved + ASSERT_TRUE(CheckNodeExisted(nodes, "fake1")); + ASSERT_TRUE(CheckNodeExisted(nodes, "fake2")); + ASSERT_TRUE(CheckNodeExisted(nodes, "fake3")); + + // After 
search, there should has two cinn subgraphs, + // and each of subgraphs just has one node. + auto compilation_keys = GetCompilationKeys(*g); + ASSERT_EQ(compilation_keys.size(), static_cast(2)); + + // subgraph1: + // feed --> v4 --> relu --> v5 + // subgraph2: + // feed --> v1 -- + // | --> mul --> v3 + // v2 -- + auto* cinn_compiler = CinnCompiler::GetInstance(); + const auto& subgraph1 = cinn_compiler->FindGraph(compilation_keys[0]); + const auto& subnodes1 = subgraph1.Nodes(); + ASSERT_TRUE(CheckGraphIndependence(subnodes1)); + + const auto& subgraph2 = cinn_compiler->FindGraph(compilation_keys[1]); + const auto& subnodes2 = subgraph2.Nodes(); + ASSERT_TRUE(CheckGraphIndependence(subnodes2)); + + if (CheckNodeExisted(subnodes1, "relu")) { + ASSERT_EQ(subnodes1.size(), static_cast(4)); + ASSERT_EQ(subnodes2.size(), static_cast(5)); + } else { + ASSERT_EQ(subnodes2.size(), static_cast(4)); + ASSERT_EQ(subnodes1.size(), static_cast(5)); + } +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle + +USE_PASS(build_cinn_pass); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc new file mode 100644 index 00000000000000..923282c59e2d4a --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.cc @@ -0,0 +1,94 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
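+
+// Note: CinnCacheKey::Hash::hash_combine below follows the boost::hash_combine
+// recipe, i.e. seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)), where
+// 0x9e3779b9 is the 32-bit golden-ratio constant used to spread the bits.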
+ +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" + +#include +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +CinnCacheKey::CinnCacheKey( + const ir::Graph& graph, + const std::map& input_tensors, + const std::string& arch_str) { + this->SetKey(graph, input_tensors, arch_str); +} + +CinnCacheKey::CinnCacheKey(const ir::Graph& graph, + const std::map& input_shapes, + const std::string& arch_str) { + this->SetKey(graph, input_shapes, arch_str); +} + +void CinnCacheKey::SetKey( + const ir::Graph& graph, + const std::map& input_tensors, + const std::string& arch_str) { + ProgramDesc program; + GraphToProgram(graph, &program); + program.Proto()->SerializeToString(&graph_serialize_str_); + for (const auto& name_tensor : input_tensors) { + input_shapes_[name_tensor.first] = name_tensor.second->dims(); + } + arch_str_ = arch_str; +} + +void CinnCacheKey::SetKey(const ir::Graph& graph, + const std::map& input_shapes, + const std::string& arch_str) { + ProgramDesc program; + GraphToProgram(graph, &program); + program.Proto()->SerializeToString(&graph_serialize_str_); + input_shapes_ = input_shapes; + arch_str_ = arch_str; +} + +bool CinnCacheKey::operator!=(const CinnCacheKey& other) const { + return !this->operator==(other); +} + +bool CinnCacheKey::operator==(const CinnCacheKey& other) const { + return graph_serialize_str_ == other.graph_serialize_str_ && + input_shapes_ == other.input_shapes_ && arch_str_ == other.arch_str_; +} + +size_t CinnCacheKey::Hash::hash_combine(size_t seed, size_t value) { + return seed ^ (value + 0x9e3779b9 + (seed << 6) + (seed >> 2)); +} + +size_t CinnCacheKey::Hash::operator()(const CinnCacheKey& key) const { + std::size_t ret = 0; + + std::hash string_hasher; + for (const auto& name_shape : key.input_shapes_) { + ret = hash_combine(ret, string_hasher(name_shape.first)); + ret = hash_combine(ret, string_hasher(name_shape.second.to_str())); + } + + ret = hash_combine(ret, string_hasher(key.graph_serialize_str_)); + ret = hash_combine(ret, string_hasher(key.arch_str_)); + return ret; +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h new file mode 100644 index 00000000000000..02b152a681c446 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key.h @@ -0,0 +1,68 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +// Class to store the keys for compiling CINN. 
+// +// CINN cannot handle changable shape now, so CinnCompiler keeps a cache mapping +// from CinnCacheKey to CinnCompiledObject. +// +// The CinnCacheKey contains a graph serialized string and the input tensor +// shapes. +class CinnCacheKey { + public: + CinnCacheKey(const ir::Graph& graph, + const std::map& input_tensors, + const std::string& arch_str); + CinnCacheKey(const ir::Graph& graph, + const std::map& input_shapes, + const std::string& arch_str); + + ~CinnCacheKey() {} + + void SetKey(const ir::Graph& graph, + const std::map& input_tensors, + const std::string& arch_str); + void SetKey(const ir::Graph& graph, + const std::map& input_shapes, + const std::string& arch_str); + + bool operator==(const CinnCacheKey& other) const; + bool operator!=(const CinnCacheKey& other) const; + + struct Hash { + static size_t hash_combine(size_t seed, size_t value); + size_t operator()(const CinnCacheKey& key) const; + }; + + private: + std::string graph_serialize_str_; + std::map input_shapes_; + std::string arch_str_; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc new file mode 100644 index 00000000000000..f13f44998211f4 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_cache_key_test.cc @@ -0,0 +1,103 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
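+
+// CinnCacheKey is meant to be used as the key of an unordered container, e.g.
+// (a sketch of the cache kept by CinnCompiler):
+//
+//   std::unordered_map<CinnCacheKey, std::unique_ptr<CinnCompiledObject>,
+//                      CinnCacheKey::Hash> cache;
+//
+// so the test below exercises operator==, operator!= and Hash together.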
+ +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +TEST(CinnCacheKeyTest, TestAsUnorderedKey) { + std::unordered_set test_set; + + ProgramDesc empty_program; + ir::Graph empty_graph(empty_program); + + ProgramDesc program; + auto *global_block = program.MutableBlock(0); + auto *x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + ir::Graph graph(program); + + LoDTensor tensor; + tensor.Resize({1, 2, 3}); + const LoDTensor *tensor_pointer = &tensor; + std::map feed_tensors = { + {"X", tensor_pointer}}; + + DDim ddim = paddle::framework::make_ddim({1, 2, 3}); + std::map feed_shapes = {{"X", ddim}}; + + CinnCacheKey cache_key0(empty_graph, feed_tensors, "x86"); + CinnCacheKey cache_key1(empty_graph, feed_shapes, "x86"); + EXPECT_EQ(cache_key0, cache_key1); + + CinnCacheKey cache_key2(graph, feed_shapes, "x86"); + CinnCacheKey cache_key3(graph, feed_shapes, "nvgpu"); + CinnCacheKey cache_key4(graph, feed_tensors, "nvgpu"); + EXPECT_NE(cache_key2, cache_key3); + EXPECT_EQ(cache_key3, cache_key4); + + CinnCacheKey cache_key5(empty_graph, + std::map(), "unk"); + CinnCacheKey cache_key6(empty_graph, std::map(), "unk"); + EXPECT_EQ(cache_key5, cache_key6); + + EXPECT_NE(cache_key1, cache_key3); + EXPECT_NE(cache_key4, cache_key2); + + EXPECT_NE(cache_key3, cache_key5); + EXPECT_NE(cache_key6, cache_key4); + + EXPECT_NE(cache_key5, cache_key1); + EXPECT_NE(cache_key2, cache_key6); + + test_set.insert(cache_key0); + test_set.insert(cache_key1); + test_set.insert(cache_key3); + test_set.insert(cache_key4); + test_set.insert(cache_key5); + test_set.insert(cache_key6); + EXPECT_EQ(test_set.size(), 3U); + + auto iter = test_set.find(cache_key0); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 2U); + EXPECT_EQ(test_set.find(cache_key1), test_set.end()); + + iter = test_set.find(cache_key3); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 1U); + EXPECT_EQ(test_set.find(cache_key4), test_set.end()); + + iter = test_set.find(cache_key5); + EXPECT_NE(iter, test_set.end()); + test_set.erase(iter); + EXPECT_EQ(test_set.size(), 0U); + EXPECT_EQ(test_set.find(cache_key6), test_set.end()); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc new file mode 100644 index 00000000000000..44cea60bdcb8e4 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
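+
+// Typical call sequence, as a sketch (`sub_graph`, `input_tensors` and `target`
+// are provided by the caller):
+//
+//   auto* compiler = CinnCompiler::GetInstance();
+//   auto key = compiler->AddGraph(std::move(sub_graph));  // done by BuildCinnPass
+//   const auto& obj = compiler->Compile(key, input_tensors, target);
+//   // A second Compile with the same graph, input shapes and target hits the
+//   // cache and returns the same CinnCompiledObject without recompiling.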
+ +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" + +#include +#include +#include + +#include "cinn/common/target.h" +#include "cinn/common/type.h" +#include "cinn/frontend/decomposer/use_decomposer.h" +#include "cinn/frontend/net_builder.h" // need to remove after +#include "cinn/frontend/pass/use_program_pass.h" +#include "cinn/frontend/program_pass.h" +#include "cinn/frontend/syntax.h" +#include "cinn/hlir/framework/graph.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "cinn/hlir/framework/pass.h" +#include "cinn/hlir/pass/use_pass.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; +using ::cinn::common::Target; +using ::cinn::common::Float; +using ::cinn::hlir::framework::GraphCompiler; +using ::cinn::hlir::framework::BuildScope; +using ::cinn::frontend::ProgramPass; +using ::cinn::hlir::framework::ApplyPass; + +CinnCompiler* CinnCompiler::GetInstance() { + static CinnCompiler instance; + return &instance; +} + +std::string CinnCompiler::AddGraph(std::unique_ptr graph) { + std::string graph_key; + ProgramDesc program; + GraphToProgram(*graph, &program); + program.Proto()->SerializeToString(&graph_key); + if (!graphs_.count(graph_key)) { + graphs_[graph_key] = std::move(graph); + } else { + LOG(WARNING) + << "The graph being added is already in CinnCompiler. Its key is:\n" + << graph_key; + } + return graph_key; +} + +const Graph& CinnCompiler::FindGraph(const std::string& graph_key) const { + PADDLE_ENFORCE_NE( + graphs_.count(graph_key), 0, + platform::errors::InvalidArgument("Can not find the target graph: %s", + graph_key.c_str())); + return *graphs_.at(graph_key); +} + +const CinnCompiledObject& CinnCompiler::Compile( + const Graph& graph, + const std::map& input_tensors, + const Target& target) { + CinnCacheKey cur_key(graph, input_tensors, target.arch_str()); + if (!cache_.count(cur_key)) { + real_compiled_num_++; + cache_[cur_key] = CompileGraph(graph, input_tensors, target); + } + return *cache_[cur_key]; +} + +const CinnCompiledObject& CinnCompiler::Compile( + const std::string& compilation_key, + const std::map& input_tensors, + const Target& target) { + const auto& graph = FindGraph(compilation_key); + return Compile(graph, input_tensors, target); +} + +std::unique_ptr CinnCompiler::CompileGraph( + const ir::Graph& graph, + const std::map& input_tensors, + const Target& target) const { + CinnGraphSymbolization symbol{real_compiled_num_, graph, target, + input_tensors}; + auto frontend_program = symbol(); + ProgramPass::Apply(&frontend_program, target, {"Decomposer"}); + auto cinn_graph = std::make_shared<::cinn::hlir::framework::Graph>( + frontend_program, target); + VLOG(4) << "The " << real_compiled_num_ << "-th compilation (" + << target.arch_str() << "), and its related graph:\n" + << cinn_graph->Visualize(); + ApplyPass(cinn_graph.get(), "OpFusion"); + auto scope = BuildScope(target, cinn_graph); + GraphCompiler graph_compiler(target, scope, cinn_graph); + GraphCompiler::CompileOptions options; + options.with_instantiate_variables = false; + auto compiled_res = graph_compiler.Build(options); + auto compiled_obj = std::make_unique(); + *compiled_obj 
= {std::move(compiled_res.runtime_program), scope, + symbol.var_model_to_program_map()}; + return compiled_obj; +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler.h b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h new file mode 100644 index 00000000000000..3b0fb5cf6965f4 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler.h @@ -0,0 +1,88 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "cinn/common/target.h" +#include "cinn/hlir/framework/graph_compiler.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/cinn_cache_key.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +struct CinnCompiledObject { + std::unique_ptr<::cinn::hlir::framework::Program> runtime_program; + std::shared_ptr<::cinn::hlir::framework::Scope> scope; + std::unordered_map paddle2cinn_varmap; +}; + +// Entrance to use CINN. +// +// CINN cannot handle changable shape now, so CinnCompiler keeps a cache mapping +// from CinnCacheKey to CinnCompiledObject. If cache hits, we will re-use cache +// stored CinnCompiledObject, otherwise we will compile again and put into +// cache. +class CinnCompiler { + public: + // Singleton + static CinnCompiler* GetInstance(); + + const CinnCompiledObject& Compile( + const ir::Graph& graph, + const std::map& input_tensors, + const ::cinn::common::Target& target); + + const CinnCompiledObject& Compile( + const std::string& compilation_key, + const std::map& input_tensors, + const ::cinn::common::Target& target); + + std::string AddGraph(std::unique_ptr graph); + + const ir::Graph& FindGraph(const std::string& key) const; + + std::int64_t real_compiled_num() const { return real_compiled_num_; } + + ~CinnCompiler() = default; + + private: + CinnCompiler() = default; + std::unique_ptr CompileGraph( + const ir::Graph& graph, + const std::map& input_tensors, + const ::cinn::common::Target& target) const; + + std::unordered_map> graphs_; + std::unordered_map, + CinnCacheKey::Hash> + cache_; + std::atomic_int64_t real_compiled_num_{0}; + + DISABLE_COPY_AND_ASSIGN(CinnCompiler); +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc new file mode 100644 index 00000000000000..22792e0f8c359a --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_compiler_test.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/paddle2cinn/cinn_compiler.h" + +#include +#include +#include + +#include "cinn/common/target.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/paddle2cinn/build_cinn_pass.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/place.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; +using ::cinn::common::Target; + +// X - +// | -> mul -> MUL_OUT - +// Y - | -> elementwise_add -> ADD_OUT -> relu -> RELU_OUT +// Z - +std::unique_ptr CreateGraph() { + ProgramDesc program; + auto* global_block = program.MutableBlock(0); + // mul + auto* x = global_block->Var("X"); + x->SetType(proto::VarType::LOD_TENSOR); + x->SetLoDLevel(0); + x->SetDataType(proto::VarType::FP32); + x->SetShape({1000, 784}); + + auto* y = global_block->Var("Y"); + y->SetType(proto::VarType::LOD_TENSOR); + y->SetLoDLevel(0); + y->SetDataType(proto::VarType::FP32); + y->SetShape({784, 100}); + y->SetPersistable(true); + y->SetIsParameter(true); + + auto* mul_op = global_block->AppendOp(); + mul_op->SetType("mul"); + mul_op->SetInput("X", {x->Name()}); + mul_op->SetInput("Y", {y->Name()}); + + auto* mul_out = global_block->Var("MUL_OUT"); + mul_out->SetType(proto::VarType::LOD_TENSOR); + mul_op->SetOutput("Out", {mul_out->Name()}); + + // add + auto* z = global_block->Var("Z"); + z->SetType(proto::VarType::LOD_TENSOR); + z->SetLoDLevel(0); + z->SetDataType(proto::VarType::FP32); + z->SetShape({100}); + z->SetPersistable(true); + z->SetIsParameter(true); + + auto* add_op = global_block->AppendOp(); + add_op->SetType("elementwise_add"); + add_op->SetInput("X", {mul_out->Name()}); + add_op->SetInput("Y", {z->Name()}); + + auto* add_out = global_block->Var("ADD_OUT"); + add_out->SetType(proto::VarType::LOD_TENSOR); + add_op->SetOutput("Out", {add_out->Name()}); + + // relu + auto* relu_op = global_block->AppendOp(); + relu_op->SetType("relu"); + relu_op->SetInput("X", {add_out->Name()}); + + auto* relu_out = global_block->Var("RELU_OUT"); + relu_out->SetType(proto::VarType::LOD_TENSOR); + relu_op->SetOutput("Out", {relu_out->Name()}); + program.Flush(); + return std::make_unique(program); +} + +TEST(CinnCompilerTest, Compile) { + auto viz_pass = ir::PassRegistry::Instance().Get("graph_viz_pass"); + auto cinn_pass = ir::PassRegistry::Instance().Get("build_cinn_pass"); + auto viz_graph = [&viz_pass](const std::string& viz_path, Graph* graph) { + viz_pass->Erase("graph_viz_path"); + viz_pass->Set("graph_viz_path", new std::string(viz_path)); + viz_pass->Apply(graph); + }; + + // create a graph + auto graph = CreateGraph(); + viz_graph("origin_graph.dot", graph.get()); + // apply build_cinn_pass + cinn_pass->Apply(graph.get()); + 
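+  // The supported ops (mul, elementwise_add, relu) are now folded into one
+  // kCinnLaunchOp node; dump the transformed graph for inspection.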
viz_graph("processed_graph.dot", graph.get()); + // get the compilation_key + std::vector compilation_keys; + for (auto& node : graph->Nodes()) { + if (node->IsOp() && node->Name() == kCinnLaunchOp) { + compilation_keys.emplace_back( + BOOST_GET_CONST(std::string, node->Op()->GetAttr(kCompilationKey))); + } + } + ASSERT_EQ(compilation_keys.size(), 1); + + const auto& compilation_key = compilation_keys[0]; + auto* cinn_compiler = CinnCompiler::GetInstance(); + const auto& compiling_graph = cinn_compiler->FindGraph(compilation_key); + // viz_graph("compiling_graph.dot", const_cast(&compiling_graph)); + + EXPECT_THROW(cinn_compiler->FindGraph("no_existed"), + paddle::platform::EnforceNotMet); + + LoDTensor tensor1, tensor2, tensor3; + tensor1.Resize({1000, 784}); + tensor2.Resize({784, 100}); + tensor3.Resize({100}); + tensor1.mutable_data(platform::CPUPlace()); + tensor2.mutable_data(platform::CPUPlace()); + tensor3.mutable_data(platform::CPUPlace()); + std::map input_tensors = { + {"X", &tensor1}, {"Y", &tensor2}, {"Z", &tensor3}}; + + auto compile_fn = [&](const Target& target) { + const auto& compiled_obj = + cinn_compiler->Compile(compiling_graph, input_tensors, target); + ASSERT_NE(compiled_obj.runtime_program, nullptr); + ASSERT_NE(compiled_obj.scope, nullptr); + ASSERT_FALSE(compiled_obj.paddle2cinn_varmap.empty()); + const auto& cached_obj = + cinn_compiler->Compile(compilation_key, input_tensors, target); + ASSERT_EQ(reinterpret_cast(&compiled_obj), + reinterpret_cast(&cached_obj)); + }; + + // GPU Compilation + compile_fn(::cinn::common::DefaultNVGPUTarget()); + ASSERT_EQ(cinn_compiler->real_compiled_num(), 1); + // CPU Compilation + compile_fn(::cinn::common::DefaultHostTarget()); + ASSERT_EQ(cinn_compiler->real_compiled_num(), 2); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle + +USE_PASS(build_cinn_pass); +USE_PASS(graph_viz_pass); diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc new file mode 100644 index 00000000000000..e4e16498b8440c --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.cc @@ -0,0 +1,172 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/paddle2cinn/transform_desc.h" +#include "paddle/fluid/framework/variable.h" + +#include "cinn/frontend/op_mappers/use_op_mappers.h" +#include "cinn/frontend/var_type_utils.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; +using ir::Node; +using CinnTensor = ::cinn::hlir::framework::Tensor; +using OpMapperContext = CinnGraphSymbolization::OpMapperContext; +using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; +using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; + +namespace utils { + +OpMapperContext::FeedInfo GetCinnFeedInfoFromTensor(const Tensor& tensor) { + OpMapperContext::FeedInfo info; + const auto& dim = tensor.dims(); + for (int i = 0; i < dim.size(); i++) { + info.shape.emplace_back(static_cast(dim[i])); + } + + auto cinn_var_type = TransformVarDataTypeToCinn(tensor.type()); + info.type = ::cinn::frontend::utils::CppVarType2CommonType(cinn_var_type); + return info; +} +} // namespace utils + +FeedInfoMap CinnGraphSymbolization::GetFeedInfoMapFromInput() const { + FeedInfoMap feed_map; + for (auto& feed_pair : input_tensors_) { + const auto& feed_name = feed_pair.first; + const auto* tensor = feed_pair.second; + + feed_map[feed_name] = utils::GetCinnFeedInfoFromTensor(*tensor); + } + return feed_map; +} + +// get the graph's op input Parameter var name set +std::unordered_set +CinnGraphSymbolization::GetGraphInputParameterNames() const { + std::unordered_set names; + + for (auto* node : graph_.Nodes()) { + if (node->IsOp()) { + for (auto* var : node->inputs) { + if (var->Var()->IsParameter()) { + // Only need preserve the input parameter var of graph, + // others do not. + names.insert(var->Name()); + } + } + } + } + + return names; +} + +// Transform paddle scope to cinn, note that we only preserve the graph’s +// input parameter variable and ignore others. +std::shared_ptr<::cinn::hlir::framework::Scope> +CinnGraphSymbolization::CreateCinnScope(const FeedInfoMap& feed_map) const { + auto cinn_scope = ::cinn::hlir::framework::Scope::Create(); + + // get the graph's input parameter variable name list + auto parameter_names = GetGraphInputParameterNames(); + + for (const auto& param_name : parameter_names) { + VLOG(4) << "add param var [" << param_name << "] info scope"; + // if cannot find var in graph input, skip. + // scope accepte the CINN format name, so here we need transform + // paddle format name to CINN format. 
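+    // (TransValidVarName is assumed to rewrite the Paddle variable name into
+    // an identifier that is valid in CINN, e.g. by replacing special
+    // characters.)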
+ auto* cinn_var = cinn_scope->Var( + ::cinn::utils::TransValidVarName(param_name)); + + auto& cinn_tensor = absl::get(*cinn_var); + // here we only need preserve dtype and shape, do not need preserve data + auto feed_info = feed_map.at(param_name); + cinn_tensor->set_type(feed_info.type); + cinn_tensor->Resize(::cinn::hlir::framework::Shape(feed_info.shape)); + } + + return cinn_scope; +} + +std::vector> +CinnGraphSymbolization::TransformAllGraphOpToCinn() const { + std::vector> cinn_op_descs; + + const auto& sorted_ops = ir::TopologySortOperations(graph_); + for (auto* node : sorted_ops) { + cinn_op_descs.emplace_back(std::make_unique()); + auto& cinn_desc = cinn_op_descs.back(); + + TransformOpDescToCinn(node->Op(), cinn_desc.get()); + } + return cinn_op_descs; +} + +void CinnGraphSymbolization::RunOp(const CinnOpDesc& op_desc, + const OpMapperContext& ctx) const { + const auto& op_type = op_desc.Type(); + auto* kernel = ::cinn::frontend::OpMapperRegistry::Global()->Find(op_type); + PADDLE_ENFORCE_NE(kernel, nullptr, + platform::errors::NotFound( + "Op %s is Not Supported by CINN, please register" + " this op in the CINN repo.", + op_type.c_str())); + VLOG(4) << "Running Op " << op_type; + kernel->Run(op_desc, ctx); +} + +void CinnGraphSymbolization::RunGraph(const OpMapperContext& ctx) const { + auto cinn_op_descs = TransformAllGraphOpToCinn(); + // run the CINN op one by one, note that all ops + // have been sorted at constructor. + for (auto& op_desc : cinn_op_descs) { + RunOp(*op_desc, ctx); + } +} + +::cinn::frontend::Program CinnGraphSymbolization::operator()() { + std::string builder_name = "NetBuilder_of_graph_" + std::to_string(graph_id_); + VLOG(4) << "NetBuilder Name " << builder_name; + + ::cinn::frontend::NetBuilder builder(builder_name); + + auto feed_map = GetFeedInfoMapFromInput(); + auto cinn_scope = CreateCinnScope(feed_map); + + OpMapperContext ctx(*cinn_scope, target_, &builder, &var_map_, + &var_model_to_program_map_); + // add all tensor's feed info into context + for (auto& feed_pair : feed_map) { + ctx.AddFeedInfo(feed_pair.first, feed_pair.second); + VLOG(4) << "add feed var [" << feed_pair.first << "] info context"; + } + RunGraph(ctx); + + return builder.Build(); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h new file mode 100644 index 00000000000000..b6b4b24c6ee3db --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h @@ -0,0 +1,128 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" + +#include "cinn/frontend/net_builder.h" +#include "cinn/frontend/op_mapper_registry.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +// An executor accept subgraph which is generated by BuildCinnPass, +// run each op's CINN Op Mapper, finally return a frontend::Program object +// corresponding to the subgraph. +// +// Parameter: +// 1. graph_id: +// the unique graph id, used for generating unique NetBuilder name. +// 2. graph: +// the CINN subgraph whose op are all supported by CINN, and the +// graph is independently of other graph. +// 3. input_tensors: +// all input var nodes of CINN subgraph, they are necessary for +// we need pass the shape and data type into CINN, otherwise the +// NetBuilder may error for the shape not meet the precondition. +// +// Describe: +// The main function is operator(), it will run all op function by CINN +// OpMapper and finally return a program object. +// The executor operator() consisted by the following step: +// 1. create a NetBuilder, it's name is unique for each graph; +// 2. create OpMapperContext, contain scope, target, local var_map and +// local var_model_to_program_map; +// 3. add all feed var into OpMapperContext to pass the shape and type +// into CINN; +// 4. topological sorting graph op nodes; +// 5. transform all op from paddle opdesc format to cinn opdesc format; +// 5. run the CINN op in graph one by one. Note that the graph have been +// topo sorted; +// 6. return the NetBuilder.Build() after all op run. +class CinnGraphSymbolization { + public: + CinnGraphSymbolization( + int64_t graph_id, const ir::Graph& graph, + const ::cinn::common::Target& target, + const std::map& input_tensors) + : graph_id_(graph_id), + graph_(graph), + target_(target), + input_tensors_(input_tensors) {} + + // run all CINN op in graph by topo sorting then return its NetBuilder + ::cinn::frontend::Program operator()(); + + // return the internal variable map + const std::unordered_map& var_map() + const { + return var_map_; + } + + // return the map from the variable name in paddle model to cinn program. + const std::unordered_map& var_model_to_program_map() + const { + return var_model_to_program_map_; + } + + using OpMapperContext = ::cinn::frontend::OpMapperContext; + using FeedInfoMap = + std::unordered_map; + using CinnOpDesc = ::cinn::frontend::paddle::cpp::OpDesc; + + private: + const int64_t graph_id_; + const ir::Graph& graph_; + const ::cinn::common::Target& target_; + const std::map& input_tensors_; + + // preserve local variable map + std::unordered_map var_map_; + std::unordered_map var_model_to_program_map_; + + // transform all paddle var desc in feed list into cinn_var_descs_ + FeedInfoMap GetFeedInfoMapFromInput() const; + + // transform all paddle op desc in graph into cinn op desc + std::vector> TransformAllGraphOpToCinn() const; + + // RunOp accept OpDesc and global run context then run + // it's kernel registered in OpMapper. + // called in RunGraph. + void RunOp(const CinnOpDesc& op_desc, const OpMapperContext& ctx) const; + + // preserve var desc, run the op one by one. 
+ void RunGraph(const OpMapperContext& ctx) const; + + // create cinn scope and add parameter's feed info into scope + std::shared_ptr<::cinn::hlir::framework::Scope> CreateCinnScope( + const FeedInfoMap& feed_map) const; + + // get the graph op's input persistable var name set + std::unordered_set GetGraphInputParameterNames() const; + + friend class CinnGraphSymbolizationForTest; +}; + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc new file mode 100644 index 00000000000000..940228314a1d45 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization_test.cc @@ -0,0 +1,299 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/paddle2cinn/cinn_graph_symbolization.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using ir::Graph; +using ir::Node; +using ::cinn::frontend::NetBuilder; +using CinnTensor = ::cinn::hlir::framework::Tensor; +using OpMapperContext = CinnGraphSymbolization::OpMapperContext; +using CinnOpDesc = CinnGraphSymbolization::CinnOpDesc; +using FeedInfoMap = CinnGraphSymbolization::FeedInfoMap; + +// only used for test CinnGraphSymbolization class +class CinnGraphSymbolizationForTest { + public: + explicit CinnGraphSymbolizationForTest(CinnGraphSymbolization* cinn_symbol) + : cinn_symbol_(cinn_symbol) {} + + std::unordered_set GetGraphInputParameterNames() { + return cinn_symbol_->GetGraphInputParameterNames(); + } + + std::shared_ptr<::cinn::hlir::framework::Scope> CreateCinnScope( + const FeedInfoMap& feed_map) { + return cinn_symbol_->CreateCinnScope(feed_map); + } + + OpMapperContext CreateNewContext(NetBuilder* builder, + const FeedInfoMap& feed_map) { + return OpMapperContext(*cinn_symbol_->CreateCinnScope(feed_map), + cinn_symbol_->target_, builder, + &cinn_symbol_->var_map_, + &cinn_symbol_->var_model_to_program_map_); + } + + FeedInfoMap GetFeedInfoMapFromInput() { + return cinn_symbol_->GetFeedInfoMapFromInput(); + } + + std::vector> TransformAllGraphOpToCinn() { + return cinn_symbol_->TransformAllGraphOpToCinn(); + } + + void RunOp(const CinnOpDesc& op_desc, const OpMapperContext& ctx) { + cinn_symbol_->RunOp(op_desc, ctx); + } + + private: + CinnGraphSymbolization* cinn_symbol_; +}; + +class CinnGraphSymbolizationTest : public ::testing::Test { + public: + CinnGraphSymbolizationTest() { + int64_t graph_id = 100; + graph_ = BuildAllOpSupportCinnGraph(); + target_ = CreateDefaultTarget(); + feed_tensors_ = CreateFeedTensors(); + feed_targets_ = ConvertFeedType(feed_tensors_); + symbol_ = std::make_unique(graph_id, *graph_, + target_, feed_targets_); + builder_ = std::make_unique("NetBuilder_of_graph_" + + std::to_string(graph_id)); + test_ = std::make_unique(symbol_.get()); + feed_map_ = test_->GetFeedInfoMapFromInput(); + } + + 
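+  // objects shared by the test cases below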
std::unique_ptr symbol_; + std::unique_ptr test_; + std::map feed_targets_; + + OpMapperContext CreateNewContext() { + return test_->CreateNewContext(builder_.get(), feed_map_); + } + + std::shared_ptr<::cinn::hlir::framework::Scope> CreateCinnScope() { + return test_->CreateCinnScope(feed_map_); + } + + private: + std::unique_ptr graph_; + ::cinn::common::Target target_; + std::map feed_tensors_; + std::unique_ptr builder_; + FeedInfoMap feed_map_; + + std::unique_ptr BuildAllOpSupportCinnGraph() { + ProgramDesc prog; + auto g = std::make_unique(prog); + + // v1 -- + // | --> mul --> v3 -- + // v2 -- | --> add --> v5 --> relu --> v6 + // v4 -- + + OpDesc add_op; + add_op.SetType("add"); + add_op.SetInput("X", {"var3"}); + add_op.SetInput("Y", {"var4"}); + add_op.SetOutput("Out", {"var5"}); + + OpDesc mul_op; + mul_op.SetType("mul"); + mul_op.SetInput("X", {"var1"}); + mul_op.SetInput("Y", {"var2"}); + mul_op.SetOutput("Out", {"var3"}); + + OpDesc relu_op; + relu_op.SetType("relu"); + relu_op.SetInput("X", {"var5"}); + relu_op.SetOutput("Out", {"var6"}); + + OpDesc feed_var1; + feed_var1.SetType("feed"); + feed_var1.SetOutput("Out", {"var1"}); + + OpDesc feed_var4; + feed_var4.SetType("feed"); + feed_var4.SetOutput("Out", {"var4"}); + + VarDesc var1("var1"); + VarDesc var2("var2"); + var2.SetPersistable(true); + var2.SetIsParameter(true); + VarDesc var3("var3"); + VarDesc var4("var4"); + VarDesc var5("var5"); + VarDesc var6("var6"); + + ir::Node* add = g->CreateOpNode(&add_op); + ir::Node* mul = g->CreateOpNode(&mul_op); + ir::Node* relu = g->CreateOpNode(&relu_op); + + ir::Node* feed1 = g->CreateOpNode(&feed_var1); + ir::Node* feed4 = g->CreateOpNode(&feed_var4); + + ir::Node* v1 = g->CreateVarNode(&var1); + ir::Node* v2 = g->CreateVarNode(&var2); + ir::Node* v3 = g->CreateVarNode(&var3); + ir::Node* v4 = g->CreateVarNode(&var4); + ir::Node* v5 = g->CreateVarNode(&var5); + ir::Node* v6 = g->CreateVarNode(&var6); + + // fill op node + feed1->outputs = {v1}; + feed4->outputs = {v4}; + mul->inputs = {v1, v2}; + mul->outputs = {v3}; + add->inputs = {v3, v4}; + add->outputs = {v5}; + relu->inputs = {v5}; + relu->outputs = {v6}; + + // fill variable node + v1->inputs = {feed1}; + v1->outputs = {mul}; + + v2->outputs = {mul}; + + v3->inputs = {mul}; + v3->outputs = {add}; + + v4->inputs = {feed4}; + v4->outputs = {add}; + + v5->inputs = {add}; + v5->outputs = {relu}; + + v6->inputs = {relu}; + + return g; + } + + ::cinn::common::Target CreateDefaultTarget(bool use_gpu = false) { +#ifdef PADDLE_WITH_CUDA + if (use_gpu) { + return ::cinn::common::DefaultNVGPUTarget(); + } +#endif + return ::cinn::common::DefaultHostTarget(); + } + + std::map CreateFeedTensors() { + std::map feed_targets; + + auto create_tensor = []() { + LoDTensor tensor; + DDim dims = {256, 1024}; + tensor.Resize(dims); + tensor.mutable_data(platform::CPUPlace(), proto::VarType::FP32); + return tensor; + }; +#define FillFeedList(Name) feed_targets[#Name] = create_tensor(); + FillFeedList(var1); + FillFeedList(var2); + FillFeedList(var3); + FillFeedList(var4); + FillFeedList(var5); + FillFeedList(var6); +#undef FillFeedList + DDim y_dim = {1024, 1024}; + feed_targets["var2"].Resize(y_dim); + + return feed_targets; + } + + std::map ConvertFeedType( + const std::map& feed_targets) { + std::map res; + for (auto& feed_pair : feed_targets) { + res[feed_pair.first] = &feed_pair.second; + } + return res; + } +}; + +TEST_F(CinnGraphSymbolizationTest, feed_map) { + auto feed_map = test_->GetFeedInfoMapFromInput(); + auto ctx = 
CreateNewContext(); + + ASSERT_TRUE(feed_map.count("var1")); + ASSERT_TRUE(feed_map.count("var2")); + + auto feed_info = feed_map.at("var1"); + ASSERT_EQ(feed_info.shape, std::vector({256, 1024})); + ASSERT_EQ(feed_info.type, ::cinn::common::F32()); +} + +TEST_F(CinnGraphSymbolizationTest, scope) { + auto prame_names = test_->GetGraphInputParameterNames(); + ASSERT_EQ(prame_names, std::unordered_set({"var2"})); + + auto cinn_scope = CreateCinnScope(); + + auto* var1 = cinn_scope->FindVar("var1"); + ASSERT_EQ(var1, nullptr); + auto* var2 = cinn_scope->FindVar("var2"); + ASSERT_NE(var2, nullptr); + + auto& cinn_tensor = absl::get(*var2); + ASSERT_EQ(cinn_tensor->shape().data(), std::vector({1024, 1024})); + ASSERT_EQ(cinn_tensor->type(), ::cinn::common::F32()); +} + +TEST_F(CinnGraphSymbolizationTest, sortgraph) { + auto cinn_op_descs = test_->TransformAllGraphOpToCinn(); + ASSERT_FALSE(cinn_op_descs.empty()); + std::vector sort_names; + for (auto& desc : cinn_op_descs) { + sort_names.emplace_back(desc->Type()); + } + ASSERT_EQ(sort_names, + std::vector({"feed", "mul", "feed", "add", "relu"})); +} + +TEST_F(CinnGraphSymbolizationTest, runop) { + auto cinn_op_descs = test_->TransformAllGraphOpToCinn(); + auto feed_map = test_->GetFeedInfoMapFromInput(); + + auto ctx = CreateNewContext(); + // add all tensor's feed info into context + for (auto& feed_pair : feed_map) { + ctx.AddFeedInfo(feed_pair.first, feed_pair.second); + } + + ASSERT_NO_THROW(test_->RunOp(*cinn_op_descs[0], ctx)); + + CinnOpDesc desc; + desc.SetType("fake"); + ASSERT_ANY_THROW(test_->RunOp(desc, ctx)); +} + +TEST_F(CinnGraphSymbolizationTest, basic) { + ASSERT_NO_THROW((*symbol_)()); + ASSERT_FALSE(symbol_->var_map().empty()); + ASSERT_FALSE(symbol_->var_model_to_program_map().empty()); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc.cc b/paddle/fluid/framework/paddle2cinn/transform_desc.cc new file mode 100644 index 00000000000000..52b1395c732ace --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/transform_desc.cc @@ -0,0 +1,348 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
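+
+// Conversions between Paddle's protobuf-backed descriptors (VarDesc, OpDesc,
+// BlockDesc, ProgramDesc) and CINN's plain C++ descriptors in
+// ::cinn::frontend::paddle::cpp, in both directions.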
+ +#include "paddle/fluid/framework/paddle2cinn/transform_desc.h" + +#include +#include +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using PbVarType = framework::proto::VarType; +namespace cpp = ::cinn::frontend::paddle::cpp; + +::cinn::frontend::paddle::cpp::VarDescAPI::Type TransformVarTypeToCinn( + const ::paddle::framework::proto::VarType::Type &type) { +#define SET_TYPE_CASE_ITEM(type__) \ + case ::paddle::framework::proto::VarType::type__: \ + return ::cinn::frontend::paddle::cpp::VarDescAPI::Type::type__; \ + break; + + switch (type) { + SET_TYPE_CASE_ITEM(LOD_TENSOR); + SET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY); + SET_TYPE_CASE_ITEM(LOD_RANK_TABLE); + SET_TYPE_CASE_ITEM(SELECTED_ROWS); + SET_TYPE_CASE_ITEM(FEED_MINIBATCH); + SET_TYPE_CASE_ITEM(FETCH_LIST); + SET_TYPE_CASE_ITEM(STEP_SCOPES); + SET_TYPE_CASE_ITEM(PLACE_LIST); + SET_TYPE_CASE_ITEM(READER); + default: + PADDLE_THROW(platform::errors::NotFound("Cannot found var type")); + } +#undef SET_TYPE_CASE_ITEM +} + +::paddle::framework::proto::VarType::Type TransformVarTypeFromCinn( + const ::cinn::frontend::paddle::cpp::VarDescAPI::Type &type) { +#define SET_TYPE_CASE_ITEM(type__) \ + case ::cinn::frontend::paddle::cpp::VarDescAPI::Type::type__: \ + return ::paddle::framework::proto::VarType::type__; \ + break; + + switch (type) { + SET_TYPE_CASE_ITEM(LOD_TENSOR); + SET_TYPE_CASE_ITEM(LOD_TENSOR_ARRAY); + SET_TYPE_CASE_ITEM(LOD_RANK_TABLE); + SET_TYPE_CASE_ITEM(SELECTED_ROWS); + SET_TYPE_CASE_ITEM(FEED_MINIBATCH); + SET_TYPE_CASE_ITEM(FETCH_LIST); + SET_TYPE_CASE_ITEM(STEP_SCOPES); + SET_TYPE_CASE_ITEM(PLACE_LIST); + SET_TYPE_CASE_ITEM(READER); + default: + PADDLE_THROW(platform::errors::NotFound("Cannot found var type")); + } +#undef SET_TYPE_CASE_ITEM +} + +::cinn::frontend::paddle::cpp::VarDescAPI::Type TransformVarDataTypeToCinn( + const ::paddle::framework::proto::VarType::Type &type) { +#define SET_DATA_TYPE_CASE_ITEM(type__) \ + case ::paddle::framework::proto::VarType::type__: \ + return ::cinn::frontend::paddle::cpp::VarDescAPI::Type::type__; \ + break; + + switch (type) { + SET_DATA_TYPE_CASE_ITEM(BOOL); + SET_DATA_TYPE_CASE_ITEM(SIZE_T); + SET_DATA_TYPE_CASE_ITEM(UINT8); + SET_DATA_TYPE_CASE_ITEM(INT8); + SET_DATA_TYPE_CASE_ITEM(INT16); + SET_DATA_TYPE_CASE_ITEM(INT32); + SET_DATA_TYPE_CASE_ITEM(INT64); + SET_DATA_TYPE_CASE_ITEM(FP16); + SET_DATA_TYPE_CASE_ITEM(FP32); + SET_DATA_TYPE_CASE_ITEM(FP64); + default: + PADDLE_THROW(platform::errors::NotFound("Cannot found var data type")); + } +#undef SET_DATA_TYPE_CASE_ITEM +} + +::paddle::framework::proto::VarType::Type TransformVarDataTypeFromCpp( + const ::cinn::frontend::paddle::cpp::VarDescAPI::Type &type) { +#define SET_DATA_TYPE_CASE_ITEM(type__) \ + case ::cinn::frontend::paddle::cpp::VarDescAPI::Type::type__: \ + return ::paddle::framework::proto::VarType::type__; \ + break; + + switch (type) { + SET_DATA_TYPE_CASE_ITEM(BOOL); + SET_DATA_TYPE_CASE_ITEM(SIZE_T); + SET_DATA_TYPE_CASE_ITEM(UINT8); + SET_DATA_TYPE_CASE_ITEM(INT8); + SET_DATA_TYPE_CASE_ITEM(INT16); + SET_DATA_TYPE_CASE_ITEM(INT32); + SET_DATA_TYPE_CASE_ITEM(INT64); + SET_DATA_TYPE_CASE_ITEM(FP16); + SET_DATA_TYPE_CASE_ITEM(FP32); + SET_DATA_TYPE_CASE_ITEM(FP64); + default: + PADDLE_THROW(platform::errors::NotFound("Cannot found var data type")); + } +#undef SET_DATA_TYPE_CASE_ITEM +} + +void TransformVarDescToCinn(framework::VarDesc *pb_desc, + cpp::VarDesc *cpp_desc) { + cpp_desc->SetName(pb_desc->Name()); + 
cpp_desc->SetType(TransformVarTypeToCinn(pb_desc->GetType())); + cpp_desc->SetPersistable(pb_desc->Persistable()); + if (pb_desc->Name() != "feed" && pb_desc->Name() != "fetch") { + cpp_desc->SetDataType(TransformVarDataTypeToCinn(pb_desc->GetDataType())); + cpp_desc->SetShape(pb_desc->GetShape()); + } +} + +void TransformVarDescFromCinn(const cpp::VarDesc &cpp_desc, + framework::VarDesc *pb_desc) { + pb_desc->Proto()->Clear(); + pb_desc->SetName(cpp_desc.Name()); + pb_desc->SetType(TransformVarTypeFromCinn(cpp_desc.GetType())); + pb_desc->SetPersistable(cpp_desc.Persistable()); + if (cpp_desc.Name() != "feed" && cpp_desc.Name() != "fetch") { + pb_desc->SetShape(cpp_desc.GetShape()); + pb_desc->SetDataType(TransformVarDataTypeFromCpp(cpp_desc.GetDataType())); + } +} + +/// For OpDesc transform +void OpInputsToCinn(framework::OpDesc *pb_desc, cpp::OpDesc *cpp_desc) { + for (const std::string ¶m : pb_desc->InputNames()) { + cpp_desc->SetInput(param, pb_desc->Input(param)); + } +} + +void OpInputsFromCinn(const cpp::OpDesc &cpp_desc, framework::OpDesc *pb_desc) { + pb_desc->MutableInputs()->clear(); + for (const std::string ¶m : cpp_desc.InputArgumentNames()) { + pb_desc->SetInput(param, cpp_desc.Input(param)); + } +} + +void OpOutputsToCinn(framework::OpDesc *pb_desc, cpp::OpDesc *cpp_desc) { + for (const std::string ¶m : pb_desc->OutputNames()) { + cpp_desc->SetOutput(param, pb_desc->Output(param)); + } +} + +void OpOutputsFromCinn(const cpp::OpDesc &cpp_desc, + framework::OpDesc *pb_desc) { + pb_desc->MutableOutputs()->clear(); + for (const std::string ¶m : cpp_desc.OutputArgumentNames()) { + pb_desc->SetOutput(param, cpp_desc.Output(param)); + } +} + +void OpAttrsToCinn(framework::OpDesc *pb_desc, cpp::OpDesc *cpp_desc) { + using AttrType = framework::proto::AttrType; + auto set_attr = [&](const std::string &name, AttrType type) { + switch (type) { +#define IMPL_ONE(type__, T) \ + case AttrType::type__: \ + cpp_desc->SetAttr(name, pb_desc->GetAttrIfExists(name)); \ + break; + IMPL_ONE(INT, int32_t); + IMPL_ONE(FLOAT, float); + IMPL_ONE(STRING, std::string); + IMPL_ONE(STRINGS, std::vector); + IMPL_ONE(FLOATS, std::vector); + IMPL_ONE(INTS, std::vector); + IMPL_ONE(BOOLEAN, bool); + IMPL_ONE(LONG, int64_t); + IMPL_ONE(LONGS, std::vector); + case AttrType::BLOCK: { + auto i = pb_desc->GetAttrIfExists(name); + cpp_desc->SetAttr(name, i); + break; + } + default: + PADDLE_THROW(platform::errors::NotFound( + "Unsupported attr type %d found ", static_cast(type))); + } + }; +#undef IMPL_ONE + + for (const auto &attr_name : pb_desc->AttrNames()) { + auto type = pb_desc->GetAttrType(attr_name); + set_attr(attr_name, type); + } +} + +void OpAttrsFromCinn(const cpp::OpDesc &cpp_desc, framework::OpDesc *pb_desc) { + pb_desc->MutableAttrMap()->clear(); + using AttrType = cpp::OpDescAPI::AttrType; + auto set_attr = [&](const std::string &name, AttrType type) { + switch (type) { +#define IMPL_ONE(type__, T) \ + case AttrType::type__: \ + pb_desc->SetAttr(name, cpp_desc.GetAttr(name)); \ + break; + IMPL_ONE(INT, int32_t); + IMPL_ONE(FLOAT, float); + IMPL_ONE(STRING, std::string); + IMPL_ONE(STRINGS, std::vector); + IMPL_ONE(FLOATS, std::vector); + IMPL_ONE(INTS, std::vector); + IMPL_ONE(BOOLEAN, bool); + IMPL_ONE(LONG, int64_t); + IMPL_ONE(LONGS, std::vector); + default: + PADDLE_THROW(platform::errors::NotFound( + "Unsupported attr type %d found ", static_cast(type))); + } + }; +#undef IMPL_ONE + + for (const auto &attr_name : cpp_desc.AttrNames()) { + auto type = cpp_desc.GetAttrType(attr_name); + 
set_attr(attr_name, type); + } +} + +void TransformOpDescToCinn(framework::OpDesc *pb_desc, cpp::OpDesc *cpp_desc) { + cpp_desc->SetType(pb_desc->Type()); + OpInputsToCinn(pb_desc, cpp_desc); + OpOutputsToCinn(pb_desc, cpp_desc); + OpAttrsToCinn(pb_desc, cpp_desc); +} + +void TransformOpDescFromCinn(const cpp::OpDesc &cpp_desc, + framework::OpDesc *pb_desc) { + pb_desc->Proto()->Clear(); + pb_desc->SetType(cpp_desc.Type()); + OpInputsFromCinn(cpp_desc, pb_desc); + OpOutputsFromCinn(cpp_desc, pb_desc); + OpAttrsFromCinn(cpp_desc, pb_desc); +} + +/// For BlockDesc transform +void TransformBlockDescToCinn(framework::BlockDesc *pb_desc, + cpp::BlockDesc *cpp_desc) { + cpp_desc->SetIdx(pb_desc->ID()); + cpp_desc->SetParentIdx(pb_desc->Parent()); + cpp_desc->SetForwardBlockIdx(pb_desc->ForwardBlockID()); + + cpp_desc->ClearOps(); + const auto &all_ops = pb_desc->AllOps(); + for (const auto &op : all_ops) { + auto *cpp_op_desc = cpp_desc->AddOp(); + TransformOpDescToCinn(op, cpp_op_desc); + } + + cpp_desc->ClearVars(); + const auto &all_vars = pb_desc->AllVars(); + for (const auto &var : all_vars) { + auto *cpp_var_desc = cpp_desc->AddVar(); + TransformVarDescToCinn(var, cpp_var_desc); + } +} + +void TransformBlockDescFromCinn(const cpp::BlockDesc &cpp_desc, + framework::BlockDesc *pb_desc) { + pb_desc->Proto()->Clear(); + + pb_desc->Proto()->set_idx(cpp_desc.Idx()); + pb_desc->Proto()->set_parent_idx(cpp_desc.ParentIdx()); + pb_desc->Proto()->set_forward_block_idx(cpp_desc.ForwardBlockIdx()); + + for (size_t i = 0; i < cpp_desc.OpsSize(); ++i) { + const auto &cpp_op_desc = + cpp_desc.template GetConstOp(static_cast(i)); + auto *pb_op_desc = pb_desc->AppendOp(); + TransformOpDescFromCinn(cpp_op_desc, pb_op_desc); + } + + for (size_t i = 0; i < cpp_desc.VarsSize(); ++i) { + const auto &cpp_var_desc = + cpp_desc.template GetConstVar(static_cast(i)); + auto *pb_var_desc = pb_desc->Var(cpp_var_desc.Name()); + TransformVarDescFromCinn(cpp_var_desc, pb_var_desc); + } +} + +/// For ProgramDesc transform +void TransformProgramDescToCinn(framework::ProgramDesc *pb_desc, + cpp::ProgramDesc *cpp_desc) { + if (pb_desc->Proto()->version().has_version()) { + cpp_desc->SetVersion(pb_desc->Version()); + } + + cpp_desc->ClearBlocks(); + for (size_t i = 0; i < pb_desc->Size(); ++i) { + auto *pb_block_desc = pb_desc->MutableBlock(i); + auto *cpp_block_desc = cpp_desc->AddBlock(); + TransformBlockDescToCinn(pb_block_desc, cpp_block_desc); + } +} + +void TransformProgramDescFromCinn(const cpp::ProgramDesc &cpp_desc, + framework::ProgramDesc *pb_desc) { + pb_desc->Proto()->Clear(); + + if (cpp_desc.HasVersion()) { + pb_desc->SetVersion(cpp_desc.Version()); + } + + // For paddle proto program, the only way to add block is invoke + // AppendBlock(), + // the AppendBlock need one necessary parameter: const BlockDesc &parent, + // but the only function of parent is set the block's parent_idx value. + // Meanwhile a program has at least one block, so we set block0 to all + // sub-block's parent in initial and cannot remove. + // Don't worry, it will be change in "TransformBlockDescFromCinn". 
+ auto *block0 = pb_desc->MutableBlock(0); + + for (size_t i = 0; i < cpp_desc.BlocksSize(); ++i) { + const auto &cpp_block_desc = cpp_desc.GetConstBlock(i); + framework::BlockDesc *pb_block_desc = nullptr; + if (i < pb_desc->Size()) { + pb_block_desc = pb_desc->MutableBlock(i); + } else { + pb_block_desc = pb_desc->AppendBlock(*block0); + } + TransformBlockDescFromCinn(cpp_block_desc, pb_block_desc); + } +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc.h b/paddle/fluid/framework/paddle2cinn/transform_desc.h new file mode 100644 index 00000000000000..76a4f812730dfa --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/transform_desc.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/var_desc.h" + +#include "cinn/frontend/paddle/cpp/block_desc.h" +#include "cinn/frontend/paddle/cpp/desc_api.h" +#include "cinn/frontend/paddle/cpp/op_desc.h" +#include "cinn/frontend/paddle/cpp/program_desc.h" +#include "cinn/frontend/paddle/cpp/var_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +::cinn::frontend::paddle::cpp::VarDescAPI::Type TransformVarTypeToCinn( + const ::paddle::framework::proto::VarType::Type& type); + +::paddle::framework::proto::VarType::Type TransformVarTypeFromCinn( + const ::cinn::frontend::paddle::cpp::VarDescAPI::Type& type); + +::cinn::frontend::paddle::cpp::VarDescAPI::Type TransformVarDataTypeToCinn( + const ::paddle::framework::proto::VarType::Type& type); + +::paddle::framework::proto::VarType::Type TransformVarDataTypeFromCpp( + const ::cinn::frontend::paddle::cpp::VarDescAPI::Type& type); + +// Why use framework::VarDesc* rather than const framework::VarDesc& here? +// framework::VarDesc lack of many API like clear(), etc. 
On the other hand, +// the paddle node return framework::Desc* even if the node is const +void TransformVarDescToCinn(framework::VarDesc* pb_desc, + ::cinn::frontend::paddle::cpp::VarDesc* cpp_desc); + +void TransformVarDescFromCinn( + const ::cinn::frontend::paddle::cpp::VarDesc& cpp_desc, + framework::VarDesc* pb_desc); + +void TransformOpDescToCinn(framework::OpDesc* pb_desc, + ::cinn::frontend::paddle::cpp::OpDesc* cpp_desc); + +void TransformOpDescFromCinn( + const ::cinn::frontend::paddle::cpp::OpDesc& cpp_desc, + framework::OpDesc* pb_desc); + +void TransformBlockDescToCinn( + framework::BlockDesc* pb_desc, + ::cinn::frontend::paddle::cpp::BlockDesc* cpp_desc); + +void TransformBlockDescFromCinn( + const ::cinn::frontend::paddle::cpp::BlockDesc& cpp_desc, + framework::BlockDesc* pb_desc); + +void TransformProgramDescToCinn( + framework::ProgramDesc* pb_desc, + ::cinn::frontend::paddle::cpp::ProgramDesc* cpp_desc); + +void TransformProgramDescFromCinn( + const ::cinn::frontend::paddle::cpp::ProgramDesc& cpp_desc, + framework::ProgramDesc* pb_desc); + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc new file mode 100644 index 00000000000000..ba324295cad723 --- /dev/null +++ b/paddle/fluid/framework/paddle2cinn/transform_desc_test.cc @@ -0,0 +1,236 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
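+
+// Round-trip tests for the Paddle <-> CINN descriptor transforms declared in
+// transform_desc.h.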
+ +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/paddle2cinn/transform_desc.h" + +namespace paddle { +namespace framework { +namespace paddle2cinn { + +using PbVarType = framework::proto::VarType; +namespace cpp = ::cinn::frontend::paddle::cpp; + +// check VarDesc +cpp::VarDesc CreateCppVarDesc() { + cpp::VarDesc var("test"); + var.SetType(cpp::VarDescAPI::Type::LOD_TENSOR); + var.SetPersistable(true); + var.SetDataType(cpp::VarDescAPI::Type::FP32); + var.SetShape({100, 200, 300}); + return var; +} + +framework::VarDesc CreatePbVarDesc() { + framework::VarDesc var("test"); + var.SetType(PbVarType::LOD_TENSOR); + var.SetPersistable(true); + var.SetDataType(PbVarType::FP32); + var.SetShape({100, 200, 300}); + return var; +} + +TEST(TransformVarDesc, cpp2pb) { + auto cpp_var = CreateCppVarDesc(); + framework::VarDesc pb_var("init"); + TransformVarDescFromCinn(cpp_var, &pb_var); + + auto correct_var = CreatePbVarDesc(); + ASSERT_EQ(pb_var.Name(), correct_var.Name()); + ASSERT_EQ(pb_var.GetType(), correct_var.GetType()); + ASSERT_EQ(pb_var.Persistable(), correct_var.Persistable()); + ASSERT_EQ(pb_var.GetDataType(), correct_var.GetDataType()); + ASSERT_EQ(pb_var.GetShape(), correct_var.GetShape()); +} + +TEST(TransformVarDesc, pb2cpp) { + auto pb_var = CreatePbVarDesc(); + cpp::VarDesc cpp_var; + TransformVarDescToCinn(&pb_var, &cpp_var); + + auto correct_var = CreateCppVarDesc(); + ASSERT_EQ(cpp_var.Name(), correct_var.Name()); + ASSERT_EQ(cpp_var.GetType(), correct_var.GetType()); + ASSERT_EQ(cpp_var.Persistable(), correct_var.Persistable()); + ASSERT_EQ(cpp_var.GetDataType(), correct_var.GetDataType()); + ASSERT_EQ(cpp_var.GetShape(), correct_var.GetShape()); +} + +// check OpDesc +cpp::OpDesc CreateCppOpDesc() { + cpp::OpDesc op; + op.SetType("test"); + op.SetInput("X", {"x1"}); + op.SetInput("Y", {"y1", "y2"}); + op.SetOutput("Out", {"out1"}); + op.SetAttr("attr_f", 0.1f); + op.SetAttr("attr_str", "test_attr"); + return op; +} + +framework::OpDesc CreatePbOpDesc() { + framework::OpDesc op; + op.SetType("test"); + op.SetInput("X", {"x1"}); + op.SetInput("Y", {"y1", "y2"}); + op.SetOutput("Out", {"out1"}); + op.SetAttr("attr_f", 0.1f); + op.SetAttr("attr_str", std::string("test_attr")); + return op; +} + +TEST(TransformOpDesc, cpp2pb) { + auto cpp_op = CreateCppOpDesc(); + framework::OpDesc pb_op; + TransformOpDescFromCinn(cpp_op, &pb_op); + + auto correct_op = CreatePbOpDesc(); + ASSERT_EQ(pb_op.Type(), correct_op.Type()); + ASSERT_EQ(pb_op.Inputs(), correct_op.Inputs()); + ASSERT_EQ(pb_op.Outputs(), correct_op.Outputs()); + ASSERT_EQ(pb_op.AttrNames(), correct_op.AttrNames()); + + for (const auto &attr_name : pb_op.AttrNames()) { + ASSERT_EQ(pb_op.GetAttrType(attr_name), correct_op.GetAttrType(attr_name)); + } + ASSERT_EQ(pb_op.GetAttrIfExists("attr_f"), + correct_op.GetAttrIfExists("attr_f")); + ASSERT_EQ(pb_op.GetAttrIfExists("attr_str"), + correct_op.GetAttrIfExists("attr_str")); +} + +TEST(TransformOpDesc, pb2cpp) { + auto pb_op = CreatePbOpDesc(); + cpp::OpDesc cpp_op; + TransformOpDescToCinn(&pb_op, &cpp_op); + + auto correct_op = CreateCppOpDesc(); + ASSERT_EQ(cpp_op.Type(), correct_op.Type()); + ASSERT_EQ(cpp_op.inputs(), correct_op.inputs()); + ASSERT_EQ(cpp_op.outputs(), correct_op.outputs()); + ASSERT_EQ(cpp_op.AttrNames(), correct_op.AttrNames()); + ASSERT_EQ(cpp_op.attr_types(), correct_op.attr_types()); + + ASSERT_EQ(cpp_op.GetAttr("attr_f"), + correct_op.GetAttr("attr_f")); + ASSERT_EQ(cpp_op.GetAttr("attr_str"), + correct_op.GetAttr("attr_str")); +} 
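+
+// Illustrative sketch (not part of the test suite): each To/From pair is
+// expected to round-trip, e.g. for an OpDesc:
+//
+//   framework::OpDesc pb_op = CreatePbOpDesc();
+//   cpp::OpDesc cpp_op;
+//   TransformOpDescToCinn(&pb_op, &cpp_op);      // Paddle -> CINN
+//   framework::OpDesc restored;
+//   TransformOpDescFromCinn(cpp_op, &restored);  // CINN -> Paddle
+//   // restored now carries the same type, inputs, outputs and attributes
+//   // as pb_op, which is what the two tests above assert field by field.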
+ +// check BlockDesc +// framework::BlockDesc is DISABLE_COPY_AND_ASSIGN, so can not return +void CreateCppBlockDesc(cpp::BlockDesc *block) { + block->SetIdx(42); + block->SetParentIdx(4); + block->SetForwardBlockIdx(32); + + auto *op = block->AddOp(); + *op = CreateCppOpDesc(); + + auto *var = block->AddVar(); + *var = CreateCppVarDesc(); +} + +void CreatePbBlockDesc(framework::BlockDesc *block) { + block->Proto()->set_idx(42); + block->Proto()->set_parent_idx(4); + block->Proto()->set_forward_block_idx(32); + + auto *op = block->AppendOp(); + *op = CreatePbOpDesc(); + + auto *var = block->Var("init"); + *var = CreatePbVarDesc(); +} + +TEST(TransformBlockDesc, cpp2pb) { + cpp::BlockDesc cpp_block; + CreateCppBlockDesc(&cpp_block); + + framework::ProgramDesc pb_prog; + auto *pb_block = pb_prog.MutableBlock(0); + TransformBlockDescFromCinn(cpp_block, pb_block); + + framework::ProgramDesc correct_prog; + auto *correct_block = correct_prog.MutableBlock(0); + CreatePbBlockDesc(correct_block); + ASSERT_EQ(pb_block->ID(), correct_block->ID()); + ASSERT_EQ(pb_block->Parent(), correct_block->Parent()); + ASSERT_EQ(pb_block->ForwardBlockID(), correct_block->ForwardBlockID()); + ASSERT_EQ(pb_block->OpSize(), correct_block->OpSize()); + ASSERT_EQ(pb_block->AllVars().size(), correct_block->AllVars().size()); +} + +TEST(TransformBlockDesc, pb2cpp) { + framework::ProgramDesc pb_prog; + auto *pb_block = pb_prog.MutableBlock(0); + CreatePbBlockDesc(pb_block); + + cpp::BlockDesc cpp_block; + TransformBlockDescToCinn(pb_block, &cpp_block); + + cpp::BlockDesc correct_block; + CreateCppBlockDesc(&correct_block); + ASSERT_EQ(cpp_block.Idx(), correct_block.Idx()); + ASSERT_EQ(cpp_block.ParentIdx(), correct_block.ParentIdx()); + ASSERT_EQ(cpp_block.ForwardBlockIdx(), correct_block.ForwardBlockIdx()); + ASSERT_EQ(cpp_block.OpsSize(), correct_block.OpsSize()); + ASSERT_EQ(cpp_block.VarsSize(), correct_block.VarsSize()); +} + +// check ProgramDesc +cpp::ProgramDesc CreateCppProgramDesc() { + cpp::ProgramDesc prog; + prog.SetVersion(22); + + auto *block = prog.AddBlock(); + CreateCppBlockDesc(block); + + return prog; +} + +framework::ProgramDesc CreatePbProgramDesc() { + framework::ProgramDesc prog; + prog.SetVersion(22); + + auto *block = prog.MutableBlock(0); + CreatePbBlockDesc(block); + return prog; +} + +TEST(TransformProgramDesc, cpp2pb) { + auto cpp_prog = CreateCppProgramDesc(); + framework::ProgramDesc pb_prog; + TransformProgramDescFromCinn(cpp_prog, &pb_prog); + + auto correct_prog = CreatePbProgramDesc(); + ASSERT_EQ(pb_prog.Version(), correct_prog.Version()); + ASSERT_EQ(pb_prog.Size(), correct_prog.Size()); +} + +TEST(TransformProgramDesc, pb2cpp) { + auto pb_prog = CreatePbProgramDesc(); + cpp::ProgramDesc cpp_prog; + TransformProgramDescToCinn(&pb_prog, &cpp_prog); + + auto correct_prog = CreateCppProgramDesc(); + ASSERT_EQ(cpp_prog.Version(), correct_prog.Version()); + ASSERT_EQ(cpp_prog.BlocksSize(), correct_prog.BlocksSize()); +} + +} // namespace paddle2cinn +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index adbbfb380bc45f..d19ac0b65f4d1e 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -27,6 +27,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" +#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph_helper.h" @@ -34,6 +35,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" @@ -43,6 +45,10 @@ limitations under the License. */ DECLARE_double(eager_delete_tensor_gb); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +DECLARE_bool(sync_nccl_allreduce); +#endif + #ifdef WITH_GPERFTOOLS #include "gperftools/profiler.h" #endif @@ -669,6 +675,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, // ncclOp std::vector async_graphs = CompileGraphWithBuildStrategy(graph, &graphs, loss_var_name); + PrepareForCUDAGraphCapture(graph); graph = member_->ApplyMemoryOptimizePass(graph); async_graphs[0] = graph; @@ -882,6 +889,23 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { VLOG(3) << "enter ParallelExecutor Run"; +#ifdef PADDLE_WITH_CUDA + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ(fetch_tensors.empty(), true, + platform::errors::InvalidArgument( + "Cannot fetch data when using CUDA Graph.")); + PADDLE_ENFORCE_EQ( + member_->build_strategy_.allow_cuda_graph_capture_, true, + platform::errors::InvalidArgument( + "You must turn on build_strategy.allow_cuda_graph_capture = True " + "to enable CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + member_->places_[0], platform::CUDAGraphCapturingPlace(), + platform::errors::InvalidArgument("The place to capture CUDAGraph is " + "not the same as the place to run.")); + } +#endif + #ifdef WITH_GPERFTOOLS if (gProfileStarted) { ProfilerFlush(); @@ -932,6 +956,16 @@ void ParallelExecutor::SkipMemoryReuse( void ParallelExecutor::FeedTensorsIntoLocalScopes( const std::vector> &tensors) { + if (platform::IsCUDAGraphCapturing()) { + for (auto &tensor : tensors) { + PADDLE_ENFORCE_EQ( + tensor.empty(), true, + platform::errors::PermissionDenied( + "Feeding data is not permitted when capturing CUDA Graph.")); + } + return; + } + if (!member_->AllowPartialFeed()) { PADDLE_ENFORCE_EQ(tensors.size(), member_->local_scopes_.size(), platform::errors::Unimplemented( @@ -987,6 +1021,14 @@ void ParallelExecutor::FeedTensorsIntoLocalScopes( void ParallelExecutor::FeedAndSplitTensorIntoLocalScopes( const std::unordered_map &tensors) { + if (platform::IsCUDAGraphCapturing()) { + PADDLE_ENFORCE_EQ( + tensors.empty(), true, + platform::errors::PermissionDenied( + "Feeding data is not permitted when capturing CUDA Graph.")); + return; + } + size_t num_places = member_->places_.size(); bool allow_partial_feed = member_->AllowPartialFeed(); @@ -1568,6 +1610,107 @@ const ir::Graph &ParallelExecutor::Graph() const { return member_->executor_->Graph(); } +void ParallelExecutor::PrepareForCUDAGraphCapture(ir::Graph *graph) { + const auto &build_strategy = 
member_->build_strategy_; + if (!build_strategy.allow_cuda_graph_capture_) return; +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE_EQ( + build_strategy.async_mode_, false, + platform::errors::InvalidArgument( + "Async Executor does not support CUDA Graph capturing.")); + PADDLE_ENFORCE_EQ( + platform::IsCUDAGraphCapturing(), false, + platform::errors::PermissionDenied("CUDA Graph is not allowed to capture " + "when running the first batch.")); + PADDLE_ENFORCE_EQ( + member_->places_.size(), 1, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when one GPU device is running.")); + PADDLE_ENFORCE_EQ(platform::is_gpu_place(member_->places_[0]), true, + platform::errors::InvalidArgument( + "CUDA Graph is only supported on NVIDIA GPU device.")); + PADDLE_ENFORCE_EQ(FLAGS_sync_nccl_allreduce, false, + platform::errors::InvalidArgument( + "FLAGS_sync_nccl_allreduce must be False to support " + "CUDA Graph capturing.")); + + std::unordered_map> all_vars; + for (auto &node : graph->Nodes()) { + if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { + auto *var_desc = node->Var(); + all_vars[var_desc->Name()].emplace_back(var_desc); + } + } + + auto mark_var_as_persistable = [&all_vars](const std::string &name) { + auto iter = all_vars.find(name); + if (iter != all_vars.end()) { + for (auto *var_desc : iter->second) { + var_desc->SetPersistable(true); + } + } + }; + + // Step 1: All fused vars must be persistable. + if (graph->Has(details::kFusedVars)) { + auto &fused_vars = graph->Get(details::kFusedVars); + for (auto &fused_var : fused_vars) { + fused_var.second.persistable_ = true; + mark_var_as_persistable(fused_var.first); + } + } + + // Step 2: All pinned vars must be persistable. + if (graph->Has(details::kPinnedVars)) { + auto &pinned_vars = graph->Get(details::kPinnedVars); + for (auto &pinned_var : pinned_vars) { + mark_var_as_persistable(pinned_var); + } + } + + // Step 3: Move all main programs to startup programs to make sure that + // the main programs would only be run once. + if (graph->Has(details::kProgramDescs)) { + auto &startup_programs = + graph->GetOrInit(details::kStartupProgramDescs); + auto &main_programs = + graph->Get(details::kProgramDescs); + for (auto &main_program : main_programs) { + startup_programs.emplace_back(main_program); + } + graph->Erase(details::kProgramDescs); + } + + // Step 4: Mark all vars in startup programs to be persistable. + if (graph->Has(details::kStartupProgramDescs)) { + auto &startup_programs = + graph->GetOrInit(details::kStartupProgramDescs); + for (auto &startup_program : startup_programs) { + for (auto &op_desc : startup_program.Block(0).AllOps()) { + for (auto &output : op_desc->OutputArgumentNames()) { + mark_var_as_persistable(output); + } + } + } + } + + // Step 5: ScaleLossGrad must be run beforehand to avoid H2D copy. 
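+  // Running the scale-loss-grad ops once here and then marking them with
+  // SetSkipRunning(true) keeps that host-to-device copy out of the captured
+  // graph.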
+ auto ops = ir::FilterByNodeWrapper(*graph); + auto *scope = member_->local_scopes_[0]; + for (auto *op : ops) { + auto *loss_grad_op = dynamic_cast(op); + if (loss_grad_op == nullptr) continue; + auto loss_grad_name = loss_grad_op->LossGradName(); + mark_var_as_persistable(loss_grad_name); + loss_grad_op->RunOnVar(scope->Var(loss_grad_name)); + loss_grad_op->SetSkipRunning(true); + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "CUDA Graph is only supported on NVIDIA GPU device.")); +#endif +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 6c871a8d858156..78774f04896389 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -144,6 +144,8 @@ class ParallelExecutor { void SetReaderOpDeviceInfoOfGraphs( const std::vector &final_graphs); + void PrepareForCUDAGraphCapture(ir::Graph *graph); + ParallelExecutorPrivate *member_; std::vector> async_graphs_; std::vector var_infos_; diff --git a/paddle/fluid/framework/ps_gpu_trainer.cc b/paddle/fluid/framework/ps_gpu_trainer.cc index 8b16b6a5d007ff..dc7b86d344d771 100644 --- a/paddle/fluid/framework/ps_gpu_trainer.cc +++ b/paddle/fluid/framework/ps_gpu_trainer.cc @@ -29,9 +29,12 @@ namespace framework { void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, Dataset* dataset) { - dataset_ = dataset; + SetDataset(dataset); thread_num_ = trainer_desc.thread_num(); param_ = trainer_desc.downpour_param(); + ParseDumpConfig(trainer_desc); + mpi_rank_ = trainer_desc.mpi_rank(); + mpi_size_ = trainer_desc.mpi_size(); for (int i = 0; i < param_.dense_table_size(); ++i) { uint64_t table_id = static_cast(param_.dense_table(i).table_id()); auto table = param_.dense_table(i); @@ -44,6 +47,8 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, int place_num = trainer_desc.worker_places_size(); const std::vector readers = dataset->GetReaders(); + dump_file_num_ = trainer_desc.dump_file_num(); + user_define_dump_filename_ = trainer_desc.user_define_dump_filename(); std::vector dev_ids; for (int i = 0; i < place_num; ++i) { int num = trainer_desc.worker_places(i); @@ -64,6 +69,11 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, workers_[i] = DeviceWorkerFactory::CreateDeviceWorker( trainer_desc.device_worker_name()); workers_[i]->SetDeviceIndex(i); + workers_[i]->SetNeedDumpField(need_dump_field_); + workers_[i]->SetNeedDumpParam(need_dump_param_); + workers_[i]->SetDumpFieldVector(dump_fields_); + workers_[i]->SetDumpParamVector(dump_param_); + workers_[i]->InitRandomDumpConfig(trainer_desc); workers_[i]->SetDataFeed(readers[i]); workers_[i]->Initialize(trainer_desc); workers_[i]->SetWorkerNum(place_num); @@ -71,7 +81,14 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc, return; } -void PSGPUTrainer::DumpWork(int tid) {} +std::string PSGPUTrainer::GetDumpPath(int tid) { + if (user_define_dump_filename_ != "") { + return string::format_string("%s/part-%s-%05d", dump_fields_path_.c_str(), + user_define_dump_filename_.c_str(), tid); + } + return string::format_string("%s/part-%03d-%05d", dump_fields_path_.c_str(), + mpi_rank_, tid); +} void PSGPUTrainer::RegisterHeterCallback() { /* @@ -124,7 +141,28 @@ void PSGPUTrainer::InitTrainerEnv(const ProgramDesc& main_program, return; } +void PSGPUTrainer::InitDumpEnv() { + queue_ = paddle::framework::MakeChannel(); + for (size_t i = 0; i < places_.size(); ++i) { + 
workers_[i]->SetChannelWriter(queue_.get()); + } + dump_thread_num_ = 1; + if (dump_file_num_ > mpi_size_) { + dump_thread_num_ = dump_file_num_ / mpi_size_; + if (dump_file_num_ % mpi_size_ > mpi_rank_) { + dump_thread_num_ += 1; + } + } + for (int i = 0; i < dump_thread_num_; i++) { + dump_thread_.push_back( + std::thread(std::bind(&TrainerBase::DumpWork, this, i))); + } +} + void PSGPUTrainer::InitOtherEnv(const ProgramDesc& main_program) { + if (need_dump_field_ || need_dump_param_) { + InitDumpEnv(); + } VLOG(3) << "init other env done."; } @@ -204,6 +242,9 @@ void PSGPUTrainer::Finalize() { } } MergeDenseParam(); + if (need_dump_field_ || need_dump_param_) { + FinalizeDumpEnv(); + } root_scope_->DropKids(); } } // namespace framework diff --git a/paddle/fluid/framework/ps_gpu_worker.cc b/paddle/fluid/framework/ps_gpu_worker.cc index 66d8a40dda1607..e41768810c6d2c 100644 --- a/paddle/fluid/framework/ps_gpu_worker.cc +++ b/paddle/fluid/framework/ps_gpu_worker.cc @@ -34,11 +34,6 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); mpi_rank_ = desc.mpi_rank(); trainer_desc_ = desc; - /* - for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) { - send_var_list_.push_back(trainer_desc_.xpu_recv_list(i)); - } - */ for (int i = 0; i < param_.sparse_table_size(); ++i) { uint64_t table_id = static_cast(param_.sparse_table(i).table_id()); @@ -89,19 +84,7 @@ void PSGPUWorker::Initialize(const TrainerDesc& desc) { no_cvm_ = desc.no_cvm(); scale_datanorm_ = desc.scale_datanorm(); dump_slot_ = desc.dump_slot(); - dump_fields_.resize(desc.dump_fields_size()); - for (int i = 0; i < desc.dump_fields_size(); ++i) { - dump_fields_[i] = desc.dump_fields(i); - } adjust_ins_weight_config_ = desc.adjust_ins_weight_config(); - need_dump_param_ = false; - dump_param_.resize(desc.dump_param_size()); - for (int i = 0; i < desc.dump_param_size(); ++i) { - dump_param_[i] = desc.dump_param(i); - } - if (desc.dump_param_size() != 0) { - need_dump_param_ = true; - } for (int i = 0; i < desc.check_nan_var_names_size(); ++i) { check_nan_var_names_.push_back(desc.check_nan_var_names(i)); } @@ -134,12 +117,6 @@ void PSGPUWorker::SetChannelWriter(ChannelObject* queue) { writer_.Reset(queue); } -void PSGPUWorker::SetNeedDump(bool need_dump_field) { - need_dump_field_ = need_dump_field; -} - -void PSGPUWorker::DumpParam() {} - void PSGPUWorker::TrainFiles() { platform::SetNumThreads(1); platform::Timer timeline; @@ -150,6 +127,7 @@ void PSGPUWorker::TrainFiles() { // how to accumulate fetched values here device_reader_->Start(); int cur_batch; + int batch_cnt = 0; while ((cur_batch = device_reader_->Next()) > 0) { total_ins_num += cur_batch; for (auto& op : ops_) { @@ -164,9 +142,19 @@ void PSGPUWorker::TrainFiles() { op->Run(*thread_scope_, place_); } } + if (need_dump_field_) { + DumpField(*thread_scope_, dump_mode_, dump_interval_); + } + if (need_dump_param_ && thread_id_ == 0) { + DumpParam(*thread_scope_, batch_cnt); + } PrintFetchVars(); thread_scope_->DropKids(); + ++batch_cnt; + } + if (need_dump_field_ || need_dump_param_) { + writer_.Flush(); } timeline.Pause(); VLOG(1) << "GpuPs worker " << thread_id_ << " train cost " diff --git a/paddle/fluid/framework/string_array.cc b/paddle/fluid/framework/string_array.cc new file mode 100755 index 00000000000000..3071e6bf4cff33 --- /dev/null +++ b/paddle/fluid/framework/string_array.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include + +#include "glog/logging.h" +#include "paddle/fluid/framework/string_array.h" + +namespace paddle { +namespace framework { + +std::wstring_convert> kConverter; + +// Convert the std::string type to the std::wstring type. +bool ConvertStrToWstr(const std::string& src, std::wstring* res) { + try { + *res = kConverter.from_bytes(src); + } catch (std::range_error& e) { + VLOG(3) << "The string " << src << " was converted to unicode failedly! "; + return false; + } + return true; +} + +// Convert the std::wstring type to the std::string type. +void ConvertWstrToStr(const std::wstring& src, std::string* res) { + *res = kConverter.to_bytes(src); +} + +// Normalization Form Canonical Decomposition. +void NFD(const std::string& s, std::string* ret) { + *ret = ""; + char* result = reinterpret_cast( + utf8proc_NFD(reinterpret_cast(s.c_str()))); + if (result) { + *ret = std::move(std::string(result)); + free(result); + } +} + +// Write the data which is type of +// std::unordered_map to ostream. +void StringMapToStream(std::ostream& os, + const std::unordered_map& data) { + { + // firstly write the data size. + size_t t = data.size(); + os.write(reinterpret_cast(&t), sizeof(t)); + } + { + // then write the data + for (auto it = data.begin(); it != data.end(); ++it) { + std::string token = it->first; + int32_t token_id = it->second; + // write the token + size_t length = token.size(); + os.write(reinterpret_cast(&length), sizeof(length)); + os.write(token.c_str(), length); + // write the token_id + os.write(reinterpret_cast(&token_id), sizeof(token_id)); + } + } +} + +// Read the data which is type of +// std::unordered_map from istream. +void StringMapFromStream(std::istream& is, + std::unordered_map* data) { + // first read the map size + size_t map_size; + is.read(reinterpret_cast(&map_size), sizeof(map_size)); + data->reserve(map_size); + // then read the data + for (size_t i = 0; i < map_size; ++i) { + // read the token + size_t token_length; + is.read(reinterpret_cast(&token_length), sizeof(token_length)); + char* tmp = new char[token_length]; + is.read(tmp, token_length); + std::string token(tmp, tmp + token_length); + delete[] tmp; + // read the token_id + int32_t token_id; + is.read(reinterpret_cast(&token_id), sizeof(token_id)); + + data->emplace(token, token_id); + } +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/string_array.h b/paddle/fluid/framework/string_array.h new file mode 100755 index 00000000000000..b874fbac4c9e7c --- /dev/null +++ b/paddle/fluid/framework/string_array.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace paddle { +namespace framework { + +using String = std::string; +using Strings = std::vector; +using Vocab = std::unordered_map; + +// Convert the std::string type to the std::string type. +bool ConvertStrToWstr(const std::string& src, std::wstring* res); +// Convert the std::wstring type to the std::string type. +void ConvertWstrToStr(const std::wstring& src, std::string* res); +// Normalization Form Canonical Decomposition. +void NFD(const std::string& s, std::string* ret); + +// Write the data which is type of +// std::unordered_map to ostream. +void StringMapToStream(std::ostream& os, + const std::unordered_map& data); + +// Read the data which is type of +// std::unordered_map from istream. +void StringMapFromStream(std::istream& is, + std::unordered_map* data); +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 4f6eb803d1c26e..fbd7aa588d49a8 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -29,14 +29,16 @@ void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL(holder_, platform::errors::PreconditionNotMet( "Tensor holds no memory. " "Call Tensor::mutable_data firstly.")); + size_t size = numel() * SizeOfType(type()); + PADDLE_ENFORCE_LE( - numel() * SizeOfType(type()), memory_size(), + size, memory_size(), platform::errors::PreconditionNotMet( "Tensor's dimension is out of bound." "Tensor's dimension must be equal or less than the size of its " "memory." "But received Tensor's dimension is d%, memory's size is %d.", - numel() * SizeOfType(type()), memory_size())); + size, memory_size())); } Tensor::Tensor(const proto::VarType::Type& dtype) diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 15021b6267b656..1c43219330bfe7 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/tensor_util.h" - #include #include #include @@ -22,6 +20,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/profiler.h" #ifdef PADDLE_WITH_MKLDNN @@ -1065,6 +1064,9 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, if (type.code == kDLFloat) return static_cast( dst->mutable_data(dst_place)); + if (type.code == kDLBfloat) + return static_cast( + dst->mutable_data(dst_place)); PADDLE_THROW(platform::errors::Unimplemented( "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", type.code, type.bits)); @@ -1081,6 +1083,16 @@ void* GetDstPtrByDLDataType(DLDataType type, framework::Tensor* dst, return static_cast(dst->mutable_data(dst_place)); if (type.code == kDLFloat) return static_cast(dst->mutable_data(dst_place)); + if (type.code == kDLComplex) + return static_cast( + dst->mutable_data>(dst_place)); + PADDLE_THROW(platform::errors::Unimplemented( + "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", + type.code, type.bits)); + case 128: + if (type.code == kDLComplex) + return static_cast( + dst->mutable_data>(dst_place)); PADDLE_THROW(platform::errors::Unimplemented( "DLDataType code <%d> is illegal when DLDataType.bits is <%d>.", type.code, type.bits)); @@ -1107,15 +1119,15 @@ void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst) { auto src_ptr = static_cast(dl_tensor.data); auto size = paddle::framework::product(vddim) * type.bits / 8; - if (dl_tensor.ctx.device_type == kDLCPU) { + if (dl_tensor.device.device_type == kDLCPU) { memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size); } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - if (dl_tensor.ctx.device_type == kDLGPU) { + if (dl_tensor.device.device_type == kDLGPU) { platform::CUDAPlace dst_place = - platform::CUDAPlace(dl_tensor.ctx.device_id); + platform::CUDAPlace(dl_tensor.device.device_id); platform::CUDAPlace src_place = - platform::CUDAPlace(dl_tensor.ctx.device_id); + platform::CUDAPlace(dl_tensor.device.device_id); dst_ptr = GetDstPtrByDLDataType(type, dst, dst_place); auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(dst_place); memory::Copy( diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index f4bbbaa2e70cf5..73829898be961d 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -13,11 +13,17 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once +#include +#include +#include +#include +#include #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" #ifdef PADDLE_WITH_ASCEND_CL @@ -48,6 +54,14 @@ class PrintOptions { PrintOptions() {} }; +void TensorToStream(std::ostream& os, const Tensor& tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx); +void TensorFromStream(std::istream& is, Tensor* tensor, + const platform::DeviceContext& dev_ctx, + const size_t& seek, const std::vector& shape); + // NOTE(zcd): Because TensorCopy is an async operation, when the src_place // and dst_place are two different GPU, to ensure that the operation can // be carried out correctly, there is a src_ctx wait operation in TensorCopy. diff --git a/paddle/fluid/framework/trainer.h b/paddle/fluid/framework/trainer.h index 0f34c84549f2b9..f6e274e6257e4c 100644 --- a/paddle/fluid/framework/trainer.h +++ b/paddle/fluid/framework/trainer.h @@ -258,13 +258,12 @@ class PSGPUTrainer : public TrainerBase { virtual void Run(); virtual void Finalize(); virtual void RegisterHeterCallback(); - virtual void DumpWork(int tid); virtual Scope* GetWorkerScope(int thread_id); virtual void CacheProgram(const ProgramDesc& main_program) { new (&program_) ProgramDesc(main_program); } - virtual std::string GetDumpPath(int tid) { return ""; } - virtual void InitDumpEnv() {} + virtual std::string GetDumpPath(int tid); + virtual void InitDumpEnv() override; virtual void MergeDenseParam(); template @@ -286,6 +285,9 @@ class PSGPUTrainer : public TrainerBase { std::vector threads_; int use_ps_gpu_; int thread_num_; + int mpi_rank_; + int mpi_size_; + int dump_file_num_; }; #endif diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index c3bdd6ae7f135c..41fe9fbbc0396e 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -209,6 +209,10 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { return desc_.type().lod_tensor().tensor(); case proto::VarType::LOD_TENSOR_ARRAY: return desc_.type().tensor_array().tensor(); + case proto::VarType::STRINGS: + return desc_.type().strings(); + case proto::VarType::VOCAB: + return desc_.type().vocab(); default: PADDLE_THROW(platform::errors::Unavailable( "Getting 'tensor_desc' is not supported by the %s type variable.", @@ -249,6 +253,10 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { return desc_.mutable_type()->mutable_lod_tensor()->mutable_tensor(); case proto::VarType::LOD_TENSOR_ARRAY: return desc_.mutable_type()->mutable_tensor_array()->mutable_tensor(); + case proto::VarType::STRINGS: + return desc_.mutable_type()->mutable_strings(); + case proto::VarType::VOCAB: + return desc_.mutable_type()->mutable_vocab(); default: PADDLE_THROW( platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not " diff --git a/paddle/fluid/framework/var_desc.h b/paddle/fluid/framework/var_desc.h index d1a1757d5309b6..a6f56ad4458348 100644 --- a/paddle/fluid/framework/var_desc.h +++ b/paddle/fluid/framework/var_desc.h @@ -160,7 +160,7 @@ class VarDesc { // Note: the identity only used as a key for referring to its // distributed attribute now. 
- uint64_t Id() { return id_; } + uint64_t Id() const { return id_; } private: const proto::VarType::TensorDesc &tensor_desc() const; diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 473df85aa0421e..c8c3cf364e0fc0 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -18,10 +18,12 @@ #include #include #include +#include #include #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" #ifdef PADDLE_WITH_CUDA #include @@ -162,8 +164,8 @@ struct VarTypeRegistryImpl { // Paddle would generate unique Ids for each registered variable types. using VarTypeRegistry = detail::VarTypeRegistryImpl< Tensor, LoDTensor, SelectedRows, std::vector, LoDRankTable, - LoDTensorArray, platform::PlaceList, ReaderHolder, std::string, Scope *, - operators::reader::LoDTensorBlockingQueueHolder, FetchList, + Strings, LoDTensorArray, platform::PlaceList, ReaderHolder, String, Scope *, + operators::reader::LoDTensorBlockingQueueHolder, FetchList, FeedList, operators::reader::OrderedMultiDeviceLoDTensorBlockingQueueHolder, #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -177,8 +179,7 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< #if defined(PADDLE_WITH_XPU_BKCL) BKCLUniqueId, platform::BKCLCommunicator, #endif - int, float>; - + int, float, Vocab>; template struct VarTypeTrait { static_assert(VarTypeRegistry::IsRegistered(), "Must be registered type"); @@ -208,9 +209,13 @@ REG_PROTO_VAR_TYPE_TRAIT(LoDRankTable, proto::VarType::LOD_RANK_TABLE); REG_PROTO_VAR_TYPE_TRAIT(LoDTensorArray, proto::VarType::LOD_TENSOR_ARRAY); REG_PROTO_VAR_TYPE_TRAIT(platform::PlaceList, proto::VarType::PLACE_LIST); REG_PROTO_VAR_TYPE_TRAIT(ReaderHolder, proto::VarType::READER); +REG_PROTO_VAR_TYPE_TRAIT(FeedList, proto::VarType::FEED_LIST); REG_PROTO_VAR_TYPE_TRAIT(FetchList, proto::VarType::FETCH_LIST); REG_PROTO_VAR_TYPE_TRAIT(int, proto::VarType::INT32); REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); +REG_PROTO_VAR_TYPE_TRAIT(Vocab, proto::VarType::VOCAB); +REG_PROTO_VAR_TYPE_TRAIT(String, proto::VarType::STRING); +REG_PROTO_VAR_TYPE_TRAIT(Strings, proto::VarType::STRINGS); /** End of variable type registration */ diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index bdcdd4e64e3314..37ec5d7bc83bda 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -21,6 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/place.h" namespace paddle { @@ -41,6 +42,10 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::LOD_TENSOR_ARRAY) { var->GetMutable(); + } else if (var_type == proto::VarType::STRINGS) { + var->GetMutable(); + } else if (var_type == proto::VarType::VOCAB) { + var->GetMutable(); } else if (var_type == proto::VarType::PLACE_LIST) { var->GetMutable(); } else if (var_type == proto::VarType::READER) { diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index 48e5e430b136a5..f2ea692ad08808 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -24,6 +24,17 @@ namespace imperative { class VarBase; +AutoCastGuard::AutoCastGuard(std::shared_ptr tracer, AmpLevel level) + : tracer_(tracer) { + pre_amp_level_ = tracer_->GetAmpLevel(); + + if (pre_amp_level_ != level) { + tracer_->SetAmpLevel(level); + } +} + +AutoCastGuard::~AutoCastGuard() { tracer_->SetAmpLevel(pre_amp_level_); } + AmpOperators::AmpOperators() : allow_ops_(new std::unordered_set()), block_ops_(new std::unordered_set()), @@ -117,7 +128,7 @@ static inline std::shared_ptr CastToType( imperative::NameVarBaseMap outs = {{"Out", {out}}}; { - AutoCastGuard guard(tracer, 0); + AutoCastGuard guard(tracer, AmpLevel::O0); tracer->TraceOp("cast", ins, outs, std::move(attrs)); } @@ -180,6 +191,14 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, continue; } + if ((op_type == "fused_attention" || op_type == "fused_feedforward")) { + if (pair.first == "LnScale" || pair.first == "LnBias" || + pair.first == "Ln2Scale" || pair.first == "Ln2Bias" || + pair.first == "Ln1Scale" || pair.first == "Ln1Bias") { + continue; + } + } + VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " << GetDtypeStr(*pair.second.cbegin()) << " to float16"; for (auto& var : pair.second) { @@ -212,6 +231,14 @@ NameVarBaseMap AutoCastInputs(const std::string& op_type, pair.first == "X" && dst_type == framework::proto::VarType::FP32) { continue; } + if ((op_type == "fused_attention" || op_type == "fused_feedforwad") && + dst_type == framework::proto::VarType::FP32) { + if (pair.first != "LnScale" && pair.first != "LnBias" && + pair.first != "Ln2Scale" && pair.first != "Ln2Bias" && + pair.first != "Ln1Scale" && pair.first != "Ln1Bias") { + continue; + } + } VLOG(5) << "Op(" << op_type << "): Cast " << pair.first << " from " << GetDtypeStr(*pair.second.cbegin()) << " to " << framework::DataTypeToString(dst_type); diff --git a/paddle/fluid/imperative/amp_auto_cast.h b/paddle/fluid/imperative/amp_auto_cast.h index 79bc83a777aa90..903e2652888d85 100644 --- a/paddle/fluid/imperative/amp_auto_cast.h +++ b/paddle/fluid/imperative/amp_auto_cast.h @@ -19,15 +19,22 @@ #include #include -#include "paddle/fluid/imperative/tracer.h" #include "paddle/fluid/imperative/type_defs.h" namespace paddle { namespace imperative { -// Singleton implementation with C++ 11 +// NOTE(zhiqiu): only O1 and O2 are valid now +enum class AmpLevel { + O0 = 0, // fp32 + O1, // amp, mixed fp32-fp16 + O2, // almost fp16 + O3, // fp16 +}; + class Tracer; +// Singleton implementation with C++ 11 class AmpOperators { public: ~AmpOperators(); @@ -63,16 +70,9 @@ std::ostream& operator<<(std::ostream& os, 
AmpOperators& ops); // NOTE(zhiqiu): AutoCastGuard is used for RAII. class AutoCastGuard { public: - AutoCastGuard(std::shared_ptr tracer, int guard_level) - : tracer_(tracer) { - pre_amp_level_ = tracer_->AMPLevel(); - - if (pre_amp_level_ != guard_level) { - tracer_->SetAMPLevel(guard_level); - } - } + AutoCastGuard(std::shared_ptr tracer, AmpLevel guard_level); - ~AutoCastGuard() { tracer_->SetAMPLevel(pre_amp_level_); } + ~AutoCastGuard(); // forbid copy and operator= AutoCastGuard(const AutoCastGuard& guard) = delete; @@ -80,7 +80,7 @@ class AutoCastGuard { private: std::shared_ptr tracer_; - int pre_amp_level_; + AmpLevel pre_amp_level_; }; NameVarBaseMap AutoCastInputs(const std::string& op_type, diff --git a/paddle/fluid/imperative/gloo_context.cc b/paddle/fluid/imperative/gloo_context.cc index d7df6ec3c11641..ef1bf0d158787e 100644 --- a/paddle/fluid/imperative/gloo_context.cc +++ b/paddle/fluid/imperative/gloo_context.cc @@ -18,6 +18,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { @@ -52,23 +53,49 @@ void GLOOParallelContext::InitWithRingID(int ring_id) { platform::errors::OutOfRange("Still not implement InitWithRingID")); } -#define GLOO_CASE(type, T, gw) \ - case type: { \ - VLOG(4) << "Use the gloo all reduce to sync. SRC:" << src_tensor; \ - std::vector send_vector##T; \ - framework::TensorToVector(src_tensor, &send_vector##T); \ - auto recv_vector##T = gw->AllReduce(send_vector##T); \ - framework::TensorFromVector(recv_vector##T, dst_tensor); \ - VLOG(4) << "DST:" << *dst_tensor; \ - break; \ +#define GLOO_CASE(type, T, gw) \ + case type: { \ + std::vector send_vector##T; \ + framework::TensorToVector(src_tensor, &send_vector##T); \ + auto recv_vector##T = gw->AllReduce(send_vector##T); \ + framework::TensorFromVector(recv_vector##T, dst_tensor); \ + break; \ } void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, framework::Variable *dst, int ring_id, bool use_calc_stream) { // AllReduce(src, dst, strategy_, ring_id, use_calc_stream); - auto src_tensor = src.Get(); - auto *dst_tensor = dst->GetMutable(); + if (src.IsType()) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable()); + } else if (src.IsType()) { + if (&src != dst) { + if (!dst->IsType()) { + dst->Clear(); + } + AllReduce(src.Get(), + dst->GetMutable()); + } else { + // SelectedRows cannot be allreduce in-place + framework::Variable tmp_dst; + AllReduce(src.Get(), + tmp_dst.GetMutable()); + *dst = std::move(tmp_dst); + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported variable type %s for imperative allreduce, only " + "LoDTensor and SelectedRows are supported.", + platform::demangle(framework::ToTypeName(src.Type())))); + } +} + +void GLOOParallelContext::AllReduce(const framework::Tensor &src_tensor, + framework::Tensor *dst_tensor) { auto gloo_wrapper = framework::GlooWrapper::GetInstance(); dst_tensor->Resize(src_tensor.dims()); switch (src_tensor.type()) { @@ -84,6 +111,71 @@ void GLOOParallelContext::AllReduceByStream(const framework::Variable &src, gloo_wrapper->Barrier(); } +#define GLOO_ALL_GATHER_CASE(type, T, gw) \ + case type: { \ + const auto *src_tensor_ptr = src_tensor.data(); \ + gw->AllGatherVector(const_cast(src_tensor_ptr), \ + reinterpret_cast(dst_tensor_ptr), \ + element_nums); \ + break; \ + } + +void 
GLOOParallelContext::AllReduce(const framework::SelectedRows &src, + framework::SelectedRows *dst) { + // auto ; + // int local_rank = strategy_.local_rank_; + int nranks = strategy_.nranks_; + VLOG(3) << "SelectedRows AllReduce start"; + const auto &src_tensor = src.value(); + const auto &place = src_tensor.place(); + auto dtype = src_tensor.type(); + // 1. Gather rows number from all workers. Here use ncclAllGather to do this, + // but we can use other ways to implement is in the future + const auto &src_rows = src.rows(); + auto gloo_wrapper = framework::GlooWrapper::GetInstance(); + size_t local_row_num = src_rows.size(); + std::vector rows_num_vector = + gloo_wrapper->AllGather(local_row_num); + const auto *cpu_rows_num_ptr = rows_num_vector.data(); + auto rows_num = std::accumulate(cpu_rows_num_ptr, cpu_rows_num_ptr + nranks, + static_cast(0)); + dst->set_height(src.height()); + VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',') + << ", total rows number: " << rows_num + << ", height: " << src.height(); + auto *dst_rows = dst->mutable_rows(); + dst_rows->resize(rows_num); + auto *dst_rows_ptr = dst_rows->MutableData(place); + const int64_t *src_rows_ptr = src_rows.Data(place); + + auto *dst_tensor = dst->mutable_value(); + auto dims = src_tensor.dims(); + dims[0] = rows_num; + auto feature_size = framework::product(dims) / dims[0]; + dst_tensor->Resize(dims); + + std::vector element_nums = rows_num_vector; + std::for_each(element_nums.begin(), element_nums.end(), + [feature_size](size_t &x) { x = x * feature_size; }); + + auto *dst_tensor_ptr = dst_tensor->mutable_data(place, dtype); + gloo_wrapper->AllGatherVector(const_cast(src_rows_ptr), + static_cast(dst_rows_ptr), + rows_num_vector); + + switch (dtype) { + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP32, float, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::FP64, double, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT32, int, gloo_wrapper); + GLOO_ALL_GATHER_CASE(framework::proto::VarType::INT64, int64_t, + gloo_wrapper); + default: { + PADDLE_THROW( + platform::errors::InvalidArgument("Invalid datatype for allreduce")); + } + } +} + paddle::platform::DeviceContext *GLOOParallelContext::GetDeviceContext( int ring_id) { // return the CPUDeviceContext diff --git a/paddle/fluid/imperative/gloo_context.h b/paddle/fluid/imperative/gloo_context.h index f54dc1a406a92f..305a75a881153f 100644 --- a/paddle/fluid/imperative/gloo_context.h +++ b/paddle/fluid/imperative/gloo_context.h @@ -16,6 +16,9 @@ #include #include #include +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/parallel_context.h" #include "paddle/fluid/platform/device_context.h" @@ -52,6 +55,11 @@ class GLOOParallelContext : public ParallelContext { void SynchronizeCompute() override; + private: + void AllReduce(const framework::Tensor& src, framework::Tensor* dst); + void AllReduce(const framework::SelectedRows& src, + framework::SelectedRows* dst); + private: std::unique_ptr device_; }; diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index fbc5453f82146a..fd6a070c3fc529 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -87,9 +87,17 @@ class TensorAddFunctor : public boost::static_visitor<> { #ifdef PADDLE_WITH_XPU void operator()(const platform::XPUPlace& 
place) { + using XPUType = typename XPUTypeTrait::Type; platform::XPUDeviceContext* ctx = dynamic_cast( platform::DeviceContextPool::Instance().Get(place)); - xpu::add(ctx->x_context(), x_, y_, y_, static_cast(numel_)); + int r = xpu::add( + ctx->x_context(), reinterpret_cast(x_), + reinterpret_cast(y_), reinterpret_cast(y_), + static_cast(numel_)); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU add kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } #else void operator()(const platform::XPUPlace& place) { @@ -154,6 +162,24 @@ class TensorAddFunctor : public boost::static_visitor<> { T* y_; }; +#ifdef PADDLE_WITH_XPU +template +void XPUTensorAddFunctor(const platform::Place& place, + const framework::Tensor& src, framework::Tensor* dst) { + using XPUType = typename XPUTypeTrait::Type; + platform::XPUDeviceContext* ctx = dynamic_cast( + platform::DeviceContextPool::Instance().Get(place)); + const XPUType* x = reinterpret_cast(src.data()); + XPUType* y = reinterpret_cast(dst->mutable_data(place)); + int r = xpu::add(ctx->x_context(), x, y, y, + static_cast(src.numel())); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU add kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); +} +#endif + template void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst, const platform::Place& place) { @@ -226,7 +252,26 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { return; } #endif + +#ifdef PADDLE_WITH_XPU + if (platform::is_xpu_place(place)) { + if (data_type == framework::DataTypeTrait::DataType()) { + XPUTensorAddFunctor(place, src_tensor, dst_tensor); + } else if (data_type == + framework::DataTypeTrait::DataType()) { + XPUTensorAddFunctor(place, src_tensor, dst_tensor); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Gradient accumulation of data type (%s) on place (%s) is not " + "supported in imperative mode", + framework::DataTypeToString(data_type), place)); + } + return; + } +#endif + PADDLE_TENSOR_ADD(float); + #ifndef PADDLE_WITH_XPU // NOTE(phlrain): xpu only support float PADDLE_TENSOR_ADD(double); diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc index c1ec675a557070..45756083c9047f 100644 --- a/paddle/fluid/imperative/partial_grad_engine.cc +++ b/paddle/fluid/imperative/partial_grad_engine.cc @@ -307,7 +307,15 @@ static void FillConstantLike(const VariableWrapper &ref_var, auto *dst_tensor = dst_var->MutableVar()->GetMutable(); auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); dst_tensor->Resize(ref_tensor.dims()); - dst_tensor->mutable_data(place, ref_var.DataType()); + // TOOD(jiabin): Ugly fix here we have fwd_data_type_ and data_type, since in + // grad mission + // we can't get data_type_ directly. We need to check if we can only use + // default data_type for now. 
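// In short, the fix below prefers the data type recorded during the forward
// pass and only falls back to the grad variable's own data type when no
// forward dtype is available. Roughly (an illustrative condensation of the
// lines that follow, not additional code):
//
//   auto dtype = (ref_var.ForwardDataType() != -1) ? ref_var.ForwardDataType()
//                                                  : ref_var.DataType();
//   dst_tensor->mutable_data(place, dtype);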
+ if (ref_var.ForwardDataType() != -1) { + dst_tensor->mutable_data(place, ref_var.ForwardDataType()); + } else { + dst_tensor->mutable_data(place, ref_var.DataType()); + } operators::math::set_constant(*dev_ctx, dst_tensor, value); } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 8f45cd0fa6ea14..c31464bf20acc9 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/platform/xpu/xpu_op_list.h" #endif DECLARE_bool(check_nan_inf); +DECLARE_bool(benchmark); namespace paddle { namespace imperative { @@ -208,6 +209,19 @@ static void PreparedOpRunImpl( op.Type(), outs, dev_ctx->GetPlace()); } + /*For profiling/benchmark only*/ + if (FLAGS_benchmark) { + dev_ctx->Wait(); +#if defined(PADDLE_WITH_CUDA) + PADDLE_ENFORCE_CUDA_SUCCESS(cudaGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif +#if defined(PADDLE_WITH_HIP) + PADDLE_ENFORCE_CUDA_SUCCESS(hipGetLastError()); + VLOG(4) << "Operator(" << op.Type() << "): context wait and get last error"; +#endif + } + /** * [ Why need handle complex gradient to real gradient? ] * diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 49e079c58caf3c..0f363d0ea1bff8 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -176,10 +176,10 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, : attr_checker->GetDefaultAttrMap(); NameVarBaseMap new_ins = ins; - if (amp_level_ == 1) { + if (amp_level_ == AmpLevel::O1) { VLOG(5) << "Auto mixed precision run operator: " << type; new_ins = AutoCastInputs(type, ins); - } else if (amp_level_ == 2) { + } else if (amp_level_ == AmpLevel::O2) { VLOG(5) << "Pure fp16 run operator: " << type; new_ins = CastPureFp16Inputs(type, ins); } diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index e77623d7a46092..93f68f2054b9a8 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -23,6 +23,7 @@ #include #include "ThreadPool.h" #include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/imperative/amp_auto_cast.h" #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" @@ -31,6 +32,8 @@ namespace paddle { namespace imperative { +enum class AmpLevel; + using GarbageCollectorMap = std::map>; @@ -105,9 +108,12 @@ class Tracer { void SetHasGrad(bool has_grad) { has_grad_ = has_grad; } - void SetAMPLevel(int level) { amp_level_ = level; } + void SetAmpLevel(AmpLevel level) { + VLOG(4) << "set amp_level to " << static_cast(level); + amp_level_ = level; + } - int AMPLevel() const { return amp_level_; } + AmpLevel GetAmpLevel() const { return amp_level_; } paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists( const platform::Place& place); @@ -120,7 +126,7 @@ class Tracer { platform::Place expected_place_; GarbageCollectorMap gcs_; static thread_local bool has_grad_; - int amp_level_{0}; + AmpLevel amp_level_{AmpLevel::O0}; }; // To access static variable current_tracer diff --git a/paddle/fluid/imperative/variable_wrapper.h b/paddle/fluid/imperative/variable_wrapper.h index 5fa8b89a396d9b..9fbbe7d06f8ad8 100644 --- a/paddle/fluid/imperative/variable_wrapper.h +++ b/paddle/fluid/imperative/variable_wrapper.h @@ -20,6 +20,7 @@ #include #include 
"paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/imperative/hooks.h" #include "paddle/fluid/imperative/op_base.h" @@ -153,6 +154,15 @@ class VariableWrapper { tensor = &(var_.Get()); } else if (type_ == framework::proto::VarType::SELECTED_ROWS) { tensor = &(var_.Get().value()); + } else if (type_ == framework::proto::VarType::VOCAB) { + const framework::Vocab* data = nullptr; + data = &(var_.Get()); + if (data && data->size() != 0) { + VLOG(6) << "The tensor of variable " << name_ + << " is not initialized"; + return data_type_; + } + return framework::proto::VarType::VOCAB; } else { VLOG(6) << "Variable " << name_ << " is not initialized"; return data_type_; @@ -162,6 +172,7 @@ class VariableWrapper { return tensor->type(); } else { VLOG(6) << "The tensor of variable " << name_ << " is not initialized"; + return data_type_; } } diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index cda6dc31126d9c..ad96a4e3437beb 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -238,6 +238,7 @@ struct Argument { DECL_ARGUMENT_FIELD(xpu_autotune_file, XpuAutotuneFile, std::string); DECL_ARGUMENT_FIELD(xpu_precision, XpuPrecision, std::string); DECL_ARGUMENT_FIELD(xpu_adaptive_seqlen, XpuAdaptiveSeqlen, bool); + DECL_ARGUMENT_FIELD(xpu_device_id, XpuDeviceId, int); DECL_ARGUMENT_FIELD(use_nnadapter, UseNNAdapter, bool); DECL_ARGUMENT_FIELD(nnadapter_model_cache_dir, NNAdapterModelCacheDir, diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index 4fdd963b6abff9..dcbbee97a772cc 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -56,10 +56,18 @@ void IRPassManager::CreatePasses(Argument *argument, auto pass = framework::ir::PassRegistry::Instance().Get(pass_name); if (pass_name == "graph_viz_pass") { - std::string dot_file_path = std::to_string(pass_num) + "_ir_" + - (pre_pass.empty() ? "origin" : pre_pass) + - ".dot"; + std::string optim_cache_dir = argument->optim_cache_dir(); + std::string dot_file_path; + if (optim_cache_dir.empty()) { + dot_file_path = std::to_string(pass_num) + "_ir_" + + (pre_pass.empty() ? "origin" : pre_pass) + ".dot"; + } else { + dot_file_path = optim_cache_dir + "/" + std::to_string(pass_num) + + "_ir_" + (pre_pass.empty() ? 
"origin" : pre_pass) + + ".dot"; + } pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); + pass->Set("optim_cache_dir", new std::string(std::move(optim_cache_dir))); pass_num++; } else if (pass_name == "mkldnn_placement_pass") { pass->Set("mkldnn_enabled_op_types", @@ -202,6 +210,7 @@ void IRPassManager::CreatePasses(Argument *argument, new std::string(argument->xpu_autotune_file())); pass->Set("precision", new std::string(argument->xpu_precision())); pass->Set("adaptive_seqlen", new bool(argument->xpu_adaptive_seqlen())); + pass->Set("xpu_device_id", new int(argument->xpu_device_id())); // NNAdapter Related pass->Set("use_nnadapter", new bool(argument->use_nnadapter())); pass->Set("nnadapter_model_cache_dir", @@ -237,6 +246,8 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("use_fc_padding", new bool(use_fc_padding)); } + pass->Set("disable_logs", new bool(disable_logs_)); + pre_pass = pass_name; passes_.emplace_back(std::move(pass)); diff --git a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc index c04342f837e3f9..6c38809b432153 100644 --- a/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/lite_subgraph_pass.cc @@ -243,6 +243,7 @@ void LiteSubgraphPass::SetUpEngine( bool use_gpu = Get("use_gpu"); bool enable_int8 = Get("enable_int8"); bool use_xpu = Get("use_xpu"); + int xpu_device_id = Get("xpu_device_id"); int xpu_l3_workspace_size = Get("xpu_l3_workspace_size"); int cpu_math_library_num_threads = Get("cpu_math_library_num_threads"); bool locked = Get("locked"); @@ -305,6 +306,7 @@ void LiteSubgraphPass::SetUpEngine( }; config.cpu_math_library_num_threads = cpu_math_library_num_threads; config.xpu_l3_workspace_size = xpu_l3_workspace_size; + config.device_id = xpu_device_id; config.locked = locked; config.autotune = autotune; config.autotune_file = autotune_file; diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index bbec3eab1cadff..53b92c13363020 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -26,7 +26,7 @@ if(WITH_MKLDNN) set(mkldnn_quantizer_cfg ${mkldnn_quantizer_cfg} PARENT_SCOPE) endif() -cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer) +cc_library(analysis_config SRCS analysis_config.cc DEPS ${mkldnn_quantizer_cfg} lod_tensor paddle_pass_builder table_printer utf8proc) cc_library(paddle_infer_contrib SRCS paddle_infer_contrib.cc DEPS zero_copy_tensor) cc_library(paddle_pass_builder SRCS paddle_pass_builder.cc) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 5d056e054f51c5..0440801cfc538b 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -12,7 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
+#include #include +#include #include "paddle/fluid/inference/api/paddle_analysis_config.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/utils/table_printer.h" @@ -20,6 +22,10 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/gpu_info.h" +#ifdef PADDLE_WITH_TENSORRT +#include "paddle/fluid/inference/tensorrt/helper.h" +#endif + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) DECLARE_uint64(initial_gpu_memory_in_mb); #endif @@ -758,17 +764,6 @@ std::string AnalysisConfig::Summary() { {"mkldnn_cache_capacity", std::to_string(mkldnn_cache_capacity_)}); os.InsetDivider(); - auto Precision2String = - [](paddle::AnalysisConfig::Precision prec) -> std::string { - if (prec == Precision::kFloat32) - return "fp32"; - else if (prec == Precision::kHalf) - return "fp16"; - else if (prec == Precision::kInt8) - return "int8"; - else - return "None"; - }; // gpu info os.InsertRow({"use_gpu", use_gpu_ ? "true" : "false"}); if (use_gpu_) { @@ -780,6 +775,33 @@ std::string AnalysisConfig::Summary() { os.InsertRow({"use_tensorrt", use_tensorrt_ ? "true" : "false"}); if (use_tensorrt_) { +#ifdef PADDLE_WITH_TENSORRT + auto Precision2String = + [](paddle::AnalysisConfig::Precision prec) -> std::string { + if (prec == Precision::kFloat32) + return "fp32"; + else if (prec == Precision::kHalf) + return "fp16"; + else if (prec == Precision::kInt8) + return "int8"; + else + return "None"; + }; + auto version2string = + [](const std::tuple &ver) -> std::string { + std::ostringstream os; + int major = std::get<0>(ver); + int minor = std::get<1>(ver); + int patch = std::get<2>(ver); + os << major << "." << minor << "." << patch; + return os.str(); + }; + os.InsertRow( + {"trt_compile_version", + version2string(inference::tensorrt::GetTrtCompileVersion())}); + os.InsertRow( + {"trt_runtime_version", + version2string(inference::tensorrt::GetTrtRuntimeVersion())}); os.InsertRow({"tensorrt_precision_mode", Precision2String(tensorrt_precision_mode_)}); os.InsertRow({"tensorrt_workspace_size", @@ -805,6 +827,7 @@ std::string AnalysisConfig::Summary() { if (trt_use_dla_) { os.InsertRow({"tensorrt_dla_core", std::to_string(trt_dla_core_)}); } +#endif } } os.InsetDivider(); diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 804f035a2e2cac..dda4be8f81c63f 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -36,6 +36,7 @@ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" #include "paddle/fluid/inference/api/helper.h" +#include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" #include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/fluid/inference/utils/singleton.h" @@ -56,6 +57,7 @@ #if PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #endif @@ -617,6 +619,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetXpuAutotuneFile(config_.xpu_autotune_file_); argument_.SetXpuPrecision(config_.xpu_precision_); argument_.SetXpuAdaptiveSeqlen(config_.xpu_adaptive_seqlen_); + argument_.SetXpuDeviceId(config_.xpu_device_id_); // NNAdapter related argument_.SetUseNNAdapter(config_.NNAdapter().use_nnadapter); 
argument_.SetNNAdapterDeviceNames( @@ -1403,6 +1406,7 @@ USE_TRT_CONVERTER(roi_align); USE_TRT_CONVERTER(affine_channel); USE_TRT_CONVERTER(multiclass_nms); USE_TRT_CONVERTER(nearest_interp); +USE_TRT_CONVERTER(nearest_interp_v2); USE_TRT_CONVERTER(reshape); USE_TRT_CONVERTER(reduce_sum); USE_TRT_CONVERTER(gather_nd); @@ -1410,6 +1414,8 @@ USE_TRT_CONVERTER(reduce_mean); USE_TRT_CONVERTER(tile); USE_TRT_CONVERTER(conv3d); USE_TRT_CONVERTER(conv3d_transpose); +USE_TRT_CONVERTER(mish); +USE_TRT_CONVERTER(pool3d) #endif namespace paddle_infer { @@ -1469,6 +1475,22 @@ int GetNumBytesOfDataType(DataType dtype) { std::string GetVersion() { return paddle::get_version(); } +std::tuple GetTrtCompileVersion() { +#ifdef PADDLE_WITH_TENSORRT + return paddle::inference::tensorrt::GetTrtCompileVersion(); +#else + return std::tuple{0, 0, 0}; +#endif +} + +std::tuple GetTrtRuntimeVersion() { +#ifdef PADDLE_WITH_TENSORRT + return paddle::inference::tensorrt::GetTrtRuntimeVersion(); +#else + return std::tuple{0, 0, 0}; +#endif +} + std::string UpdateDllFlag(const char *name, const char *value) { return paddle::UpdateDllFlag(name, value); } diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 86fbde00075f09..a15a1cd84b1409 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -359,6 +359,15 @@ TEST(AnalysisPredictor, set_xpu_device_id) { namespace paddle_infer { TEST(Predictor, Run) { + auto trt_compile_ver = GetTrtCompileVersion(); + auto trt_runtime_ver = GetTrtRuntimeVersion(); + LOG(INFO) << "trt compile version: " << std::get<0>(trt_compile_ver) << "." + << std::get<1>(trt_compile_ver) << "." + << std::get<2>(trt_compile_ver); + LOG(INFO) << "trt runtime version: " << std::get<0>(trt_runtime_ver) << "." + << std::get<1>(trt_runtime_ver) << "." 
+ << std::get<2>(trt_runtime_ver); + Config config; config.SetModel(FLAGS_dirname); diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index 47abe3298aa7c4..1fdc5cd730e53a 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -34,12 +34,14 @@ include_directories("${PADDLE_LIB}/") set(PADDLE_LIB_THIRD_PARTY_PATH "${PADDLE_LIB}/third_party/install/") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/include") +include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/include") include_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/include") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}protobuf/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}glog/lib") +link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}utf8proc/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}gflags/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}xxhash/lib") link_directories("${PADDLE_LIB_THIRD_PARTY_PATH}cryptopp/lib") @@ -151,12 +153,13 @@ if (NOT WIN32) set(EXTERNAL_LIB "-lrt -ldl -lpthread") set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags protobuf xxhash cryptopp + glog gflags protobuf xxhash cryptopp utf8proc ${EXTERNAL_LIB}) else() set(DEPS ${DEPS} ${MATH_LIB} ${MKLDNN_LIB} - glog gflags_static libprotobuf xxhash cryptopp-static ${EXTERNAL_LIB}) + glog gflags_static libprotobuf xxhash cryptopp-static utf8proc_static + ${EXTERNAL_LIB}) set(DEPS ${DEPS} shlwapi.lib) endif(NOT WIN32) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index a9c6ef13177c20..bb537f0c652857 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -43,15 +43,33 @@ void Tensor::Reshape(const std::vector &shape) { tensor->Resize(paddle::framework::make_ddim(shape)); } -#define EAGER_GET_TENSOR \ - if (!tensor_) { \ - tensor_ = FindTensor(); \ - } \ - auto *tensor = static_cast(tensor_); +void Tensor::ReshapeStrings(const size_t &shape) { + PADDLE_ENFORCE_EQ( + name_.empty(), false, + paddle::platform::errors::PreconditionNotMet( + "Need to SetName first, so that the corresponding tensor can " + "be retrieved.")); + PADDLE_ENFORCE_EQ(input_or_output_, true, + paddle::platform::errors::PermissionDenied( + "Can't reshape the output tensor, it is readonly")); + auto *scope = static_cast(scope_); + auto *var = scope->FindVar(name_); + PADDLE_ENFORCE_NOT_NULL( + var, paddle::platform::errors::PreconditionNotMet( + "No tensor called [%s] in the runtime scope", name_)); + paddle_infer::Strings *tensor = var->GetMutable(); + tensor->resize(shape); +} + +#define EAGER_GET_TENSOR(tensor_type) \ + if (!tensor_) { \ + tensor_ = FindTensor(); \ + } \ + auto *tensor = static_cast(tensor_); template T *Tensor::mutable_data(PlaceType place) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GT( tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( @@ -83,7 +101,7 @@ T *Tensor::mutable_data(PlaceType place) { template T *Tensor::data(PlaceType *place, int *size) const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto *res = tensor->data(); if 
(paddle::platform::is_cpu_place(tensor->place())) { @@ -103,7 +121,7 @@ T *Tensor::data(PlaceType *place, int *size) const { } DataType Tensor::type() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto type = tensor->type(); if (type == paddle::framework::proto::VarType::FP32) { return DataType::FLOAT32; @@ -125,7 +143,7 @@ PlaceType Tensor::place() const { return place_; } template void Tensor::CopyFromCpu(const T *data) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_GE(tensor->numel(), 0, paddle::platform::errors::PreconditionNotMet( "You should call Tensor::Reshape(const " @@ -186,10 +204,20 @@ void Tensor::CopyFromCpu(const T *data) { } } +void Tensor::CopyStringsFromCpu(const paddle_infer::Strings *data) { + EAGER_GET_TENSOR(paddle_infer::Strings); + PADDLE_ENFORCE_GE(tensor->size(), 0, + paddle::platform::errors::PreconditionNotMet( + "You should call Tensor::Reshape(const " + "std::size_t &shape)function before copying" + "the string data from cpu.")); + *tensor = *data; +} + template void Tensor::CopyToCpuImpl(T *data, void *exec_stream, CallbackFunc cb, void *cb_params) const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); auto ele_num = tensor->numel(); auto *t_data = tensor->data(); auto t_place = tensor->place(); @@ -371,6 +399,7 @@ Tensor::Tensor(void *scope) : scope_{scope} { "set to the pointer of scope.")); } +template void *Tensor::FindTensor() const { PADDLE_ENFORCE_EQ( name_.empty(), false, @@ -382,12 +411,12 @@ void *Tensor::FindTensor() const { PADDLE_ENFORCE_NOT_NULL( var, paddle::platform::errors::PreconditionNotMet( "No tensor called [%s] in the runtime scope", name_)); - auto *tensor = var->GetMutable(); + auto *tensor = var->GetMutable(); return tensor; } std::vector Tensor::shape() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); PADDLE_ENFORCE_NOT_NULL( tensor_, paddle::platform::errors::PreconditionNotMet( "Not found tensor called %s in the scope", name_)); @@ -395,7 +424,7 @@ std::vector Tensor::shape() const { } void Tensor::SetLoD(const std::vector> &x) { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); paddle::framework::LoD lod; for (auto &level : x) { lod.emplace_back(level); @@ -404,7 +433,7 @@ void Tensor::SetLoD(const std::vector> &x) { } std::vector> Tensor::lod() const { - EAGER_GET_TENSOR; + EAGER_GET_TENSOR(paddle::framework::LoDTensor); std::vector> res; for (auto &level : tensor->lod()) { res.emplace_back(level); diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc index 1f1be136103791..eb134874c3aa8a 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_dummy.cc @@ -36,7 +36,10 @@ template PD_INFER_DECL int64_t *Tensor::data(PlaceType *place, template float *Tensor::mutable_data(PlaceType place); template int64_t *Tensor::mutable_data(PlaceType place); -void *Tensor::FindTensor() const { return nullptr; } +template +void *Tensor::FindTensor() const { + return nullptr; +} std::vector Tensor::shape() const { return {}; } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 0c092a8684d1ad..4b6f90f3f0652e 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -88,7 +88,8 @@ 
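// A hypothetical end-to-end usage sketch of the string-tensor input path
// implemented above; the model directory and the input name "text" are
// placeholders, not values taken from this patch.

#include <string>

#include "paddle/fluid/inference/api/paddle_inference_api.h"

void FeedStrings(const std::string &model_dir) {
  paddle_infer::Config config;
  config.SetModel(model_dir);
  auto predictor = paddle_infer::CreatePredictor(config);

  paddle_infer::Strings text = {"hello world", "paddle inference"};
  auto input = predictor->GetInputHandle("text");  // assumes a string input named "text"
  input->ReshapeStrings(text.size());              // resize to the number of strings
  input->CopyStringsFromCpu(&text);                // copy the host strings into the variable
  predictor->Run();
}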
bool SetPlaceAndCheck(PlaceType place, size_t length) { const std::vector> lod{{0, length}}; scope.Var(name); auto tensor = CreateTensor(place, &scope, name); - tensor->Reshape({static_cast(length)}); + std::vector shape{static_cast(length)}; + tensor->Reshape(shape); tensor->mutable_data(place); tensor->SetLoD(lod); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index de6b28de27557c..b137b7ba6f97e2 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -174,6 +174,14 @@ class PD_INFER_DECL ZeroCopyTensor : public paddle_infer::Tensor { void copy_from_cpu(const T* data) { return CopyFromCpu(data); } + + /// \brief Experimental interface. + /// It's usually used to set the input tensor data with Strings data type. + /// \param data The pointer of the data, from which the tensor will copy. + void copy_strings_from_cpu(const paddle_infer::Strings* data) { + return CopyStringsFromCpu(data); + } + /// \brief Copy the tensor data to the host memory. /// It's usually used to get the output tensor data. /// \param[out] data The tensor will copy the data to the address. diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index a516abb1432ca8..35b90bfa54f73c 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -169,6 +169,8 @@ PD_INFER_DECL std::shared_ptr CreatePredictor( PD_INFER_DECL int GetNumBytesOfDataType(DataType dtype); PD_INFER_DECL std::string GetVersion(); +PD_INFER_DECL std::tuple GetTrtCompileVersion(); +PD_INFER_DECL std::tuple GetTrtRuntimeVersion(); PD_INFER_DECL std::string UpdateDllFlag(const char* name, const char* value); namespace services { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 704fbb2b95c892..5b49a0d591edd9 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -93,11 +93,14 @@ const std::vector kTRTSubgraphPasses({ "squeeze2_matmul_fuse_pass", // "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // + "map_matmul_v2_to_mul_pass", // + "map_matmul_v2_to_matmul_pass", // "map_matmul_to_mul_pass", // "fc_fuse_pass", // "conv_elementwise_add_fuse_pass", // - "tensorrt_subgraph_pass", // - "conv_bn_fuse_pass", // + "add_support_int8_pass", + "tensorrt_subgraph_pass", // + "conv_bn_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -140,6 +143,8 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "squeeze2_matmul_fuse_pass", // "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // + "map_matmul_v2_to_mul_pass", // + "map_matmul_v2_to_matmul_pass", // "map_matmul_to_mul_pass", // "fc_fuse_pass", // "fc_elementwise_layernorm_fuse_pass", // @@ -200,6 +205,8 @@ CpuPassStrategy::CpuPassStrategy() : PassStrategy({}) { "squeeze2_matmul_fuse_pass", // "reshape2_matmul_fuse_pass", // "flatten2_matmul_fuse_pass", // + "map_matmul_v2_to_mul_pass", // + "map_matmul_v2_to_matmul_pass", // "map_matmul_to_mul_pass", // "fc_fuse_pass", // "repeated_fc_relu_fuse_pass", // @@ -245,6 +252,7 @@ void CpuPassStrategy::EnableMKLDNN() { "scale_matmul_fuse_pass", // "reshape_transpose_matmul_mkldnn_fuse_pass", // "matmul_transpose_reshape_fuse_pass", // + 
"matmul_v2_transpose_reshape_fuse_pass", // // Disabled due to topology-dependent speed-up // "fc_mkldnn_pass", // "fc_act_mkldnn_fuse_pass", diff --git a/paddle/fluid/inference/api/paddle_tensor.h b/paddle/fluid/inference/api/paddle_tensor.h index f6dce74c30ded1..24a72a0b9dadbd 100644 --- a/paddle/fluid/inference/api/paddle_tensor.h +++ b/paddle/fluid/inference/api/paddle_tensor.h @@ -14,10 +14,16 @@ #pragma once +#include + #include "paddle_infer_declare.h" // NOLINT namespace paddle_infer { +/// \brief Experimental. +/// Strings for text data. +using Strings = std::vector; + typedef void (*CallbackFunc)(void*); #if defined(PADDLE_WITH_TESTING) && defined(PADDLE_WITH_INFERENCE_API_TEST) @@ -57,6 +63,14 @@ class PD_INFER_DECL Tensor { /// \param shape The shape to set. void Reshape(const std::vector& shape); + /// \brief Experimental interface. + /// Reset the shape of the Strings tensor. + /// Generally it's only used for the input tensor. + /// Reshape must be called before calling + /// ZeroCopyStringTensorCreate() or PaddleInferTensorCreate() + /// \param shape The shape to set. + void ReshapeStrings(const std::size_t& shape); + /// \brief Get the memory pointer in CPU or GPU with specific data type. /// Please Reshape the tensor first before call this. /// It's usually used to get input data pointer. @@ -78,6 +92,11 @@ class PD_INFER_DECL Tensor { template void CopyFromCpu(const T* data); + /// \brief Experimental interface. + /// It's usually used to set the input tensor data with Strings data type. + /// \param data The pointer of the data, from which the tensor will copy. + void CopyStringsFromCpu(const paddle_infer::Strings* data); + /// \brief Copy the tensor data to the host memory. /// It's usually used to get the output tensor data. /// \param[out] data The tensor will copy the data to the address. @@ -122,7 +141,10 @@ class PD_INFER_DECL Tensor { protected: explicit Tensor(void* scope); + + template void* FindTensor() const; + void SetPlace(PlaceType place, int device = -1); void SetName(const std::string& name); diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index d2bc95e7c3eb3d..f976e217bab1a0 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -17,11 +17,13 @@ limitations under the License. 
*/ #include #include #include + #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/feed_fetch_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/version.h" #include "paddle/fluid/platform/cpu_helper.h" +#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/pybind/pybind.h" DEFINE_string(devices, "", "The devices to be used which is joined by comma."); @@ -85,10 +87,12 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); new_var->SetDataType(var->GetDataType()); - new_var->SetType(var->GetType()); + auto var_type = var->GetType(); + new_var->SetType(var_type); - if (var->GetType() != - framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) { + if ((var_type != + framework::proto::VarType::Type::VarType_Type_SELECTED_ROWS) && + (var_type != framework::proto::VarType::VOCAB)) { new_var->SetLoDLevel(var->GetLoDLevel()); } diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 47b9d681b4754f..cd78cfecd86357 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -67,6 +67,7 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( lite_cxx_config.set_xpu_conv_autotune(cfg.autotune, cfg.autotune_file); lite_cxx_config.set_xpu_multi_encoder_method(cfg.precision, cfg.adaptive_seqlen); + lite_cxx_config.set_xpu_dev_per_thread(cfg.device_id); #endif #ifdef LITE_SUBGRAPH_WITH_NPU diff --git a/paddle/fluid/inference/lite/engine.h b/paddle/fluid/inference/lite/engine.h index 48072656cb9966..adeaca7c1c3b7c 100644 --- a/paddle/fluid/inference/lite/engine.h +++ b/paddle/fluid/inference/lite/engine.h @@ -39,6 +39,9 @@ struct EngineConfig { std::vector neglected_passes; lite_api::LiteModelType model_type{lite_api::LiteModelType::kProtobuf}; bool model_from_memory{true}; + // TODO(wilber): now only works for xpu, lite gpu can support device_id or + // not? + int device_id = 0; // for xpu size_t xpu_l3_workspace_size; diff --git a/paddle/fluid/inference/lite/test_engine_lite.cc b/paddle/fluid/inference/lite/test_engine_lite.cc index 080622899eb2e7..b2750fd070d3eb 100644 --- a/paddle/fluid/inference/lite/test_engine_lite.cc +++ b/paddle/fluid/inference/lite/test_engine_lite.cc @@ -110,23 +110,24 @@ TEST(EngineManager, engine) { }; LOG(INFO) << "Create EngineManager"; - inference::Singleton::Global().Create( - unique_key, config); - LOG(INFO) << "Create EngineManager done"; - ASSERT_EQ( - inference::Singleton::Global().Empty(), - false); - ASSERT_EQ(inference::Singleton::Global().Has( - unique_key), - true); - paddle::lite_api::PaddlePredictor* engine_0 = - inference::Singleton::Global().Get( - unique_key); - CHECK_NOTNULL(engine_0); - inference::Singleton::Global().DeleteAll(); - CHECK(inference::Singleton::Global().Get( - unique_key) == nullptr) - << "the engine_0 should be nullptr"; + // TODO(wilber): The ut is out of date, we need to a new lite subgraph test. 
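// The new EngineConfig::device_id above is what the engine forwards to Lite
// via set_xpu_dev_per_thread(). A small illustrative configuration (the field
// values are placeholders, not taken from this patch):
//
//   paddle::inference::lite::EngineConfig cfg;
//   cfg.device_id = 0;                            // which XPU the Lite engine binds to
//   cfg.xpu_l3_workspace_size = 4 * 1024 * 1024;  // existing XPU workspace knob
//   cfg.autotune = true;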
+ // inference::Singleton::Global().Create( + // unique_key, config); + // LOG(INFO) << "Create EngineManager done"; + // ASSERT_EQ( + // inference::Singleton::Global().Empty(), + // false); + // ASSERT_EQ(inference::Singleton::Global().Has( + // unique_key), + // true); + // paddle::lite_api::PaddlePredictor* engine_0 = + // inference::Singleton::Global().Get( + // unique_key); + // CHECK_NOTNULL(engine_0); + // inference::Singleton::Global().DeleteAll(); + // CHECK(inference::Singleton::Global().Get( + // unique_key) == nullptr) + // << "the engine_0 should be nullptr"; } } // namespace lite diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index c79915629b70d1..b6aa0a230cc2d5 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -17,6 +17,9 @@ nv_library(tensorrt_converter gather_nd_op.cc tile_op.cc conv3d_op.cc + mish_op.cc + nearest_interp_v2_op.cc + pool3d_op.cc DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto op_registry) nv_test(test_op_converter SRCS test_op_converter.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc index 7ea41839cb939f..71a2fa68f1749f 100644 --- a/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/batch_norm_op.cc @@ -147,9 +147,10 @@ class BatchNormOpConverter : public OpConverter { X = expand_layer->getOutput(0); } - layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *X, nvinfer1::ScaleMode::kCHANNEL, shift_weights.get(), - scale_weights.get(), power_weights.get()); + layer = TRT_ENGINE_ADD_LAYER(engine_, ScaleNd, *X, + nvinfer1::ScaleMode::kCHANNEL, + shift_weights.get(), scale_weights.get(), + power_weights.get(), dynamic_shape_offset); auto output_name = op_desc.Output("Y").front(); engine_->SetWeights(op_desc.Input("Bias").front(), diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index 2f802ea8d181ea..8569dd63478529 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -83,8 +83,8 @@ class ElementwiseWeightOpConverter : public OpConverter { } if (op_type_ == "add") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( - engine_, Scale, *X, scale_mode, shift_weights.get(), - scale_weights.get(), power_weights.get()); + engine_, ScaleNd, *X, scale_mode, shift_weights.get(), + scale_weights.get(), power_weights.get(), dynamic_shape_offset); layer = scale_layer; } else if (op_type_ == "mul") { nvinfer1::IScaleLayer* scale_layer = TRT_ENGINE_ADD_LAYER( diff --git a/paddle/fluid/inference/tensorrt/convert/mish_op.cc b/paddle/fluid/inference/tensorrt/convert/mish_op.cc new file mode 100644 index 00000000000000..6b646d9935b528 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/mish_op.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * Mish converter from fluid to tensorRT. + */ +class MishOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(4) << "convert fluid Mish op to tensorrt Mish plugin"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + int input_num = op_desc.Input("X").size(); + auto* input = engine_->GetITensor(op_desc.Input("X")[0]); + + const float threshold = + op_desc.HasAttr("threshold") + ? BOOST_GET_CONST(float, op_desc.GetAttr("threshold")) + : 20.0f; + + nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::MishPluginDynamic* plugin = + new plugin::MishPluginDynamic(threshold, with_fp16); + layer = engine_->AddDynamicPlugin(&input, input_num, plugin); + } else { + bool with_fp16 = + engine_->WithFp16() && !engine_->disable_trt_plugin_fp16(); + plugin::MishPlugin* plugin = new plugin::MishPlugin(threshold, with_fp16); + layer = engine_->AddPlugin(&input, input_num, plugin); + } + + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "mish", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(mish, MishOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc new file mode 100644 index 00000000000000..f2e0e0c09c5efb --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/nearest_interp_v2_op.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class NearestInterpolateV2OpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid nearest_interp_v2 op"; + + framework::OpDesc op_desc(op, nullptr); + + std::string input_name = op_desc.Input("X").front(); + std::string output_name = op_desc.Output("Out").front(); + + auto input = engine_->GetITensor(input_name); + + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, op_desc.GetAttr("data_layout"))); + auto interp_method = + BOOST_GET_CONST(std::string, op_desc.GetAttr("interp_method")); + bool align_corners = + BOOST_GET_CONST(bool, op_desc.GetAttr("align_corners")); + + auto input_names = op_desc.Input("X"); + auto scale = BOOST_GET_CONST(std::vector, op_desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, op_desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, op_desc.GetAttr("out_w")); + + auto layer = TRT_ENGINE_ADD_LAYER(engine_, Resize, *input); + layer->setAlignCorners(align_corners); + + auto in_dim = input->getDimensions(); + + float scale_h = 1.f; + float scale_w = 1.f; + + std::vector scales; + + if (out_h > 0 && out_w > 0) { + // axis are different in static/dynamic mode + bool with_dynamic = engine_->with_dynamic_shape(); + + int h_axis = (data_layout == framework::DataLayout::kNCHW) + with_dynamic; + int w_axis = + (data_layout == framework::DataLayout::kNCHW) + 1 + with_dynamic; + + scale_h = + static_cast(out_h) / static_cast(in_dim.d[h_axis]); + scale_w = + static_cast(out_w) / static_cast(in_dim.d[w_axis]); + } else { + scale_h = scale[0]; + scale_w = scale[1]; + } + + if (engine_->with_dynamic_shape()) { + scales.push_back(1.f); + } + + if (data_layout == framework::DataLayout::kNCHW) { + scales.push_back(1.f); + scales.push_back(scale_h); + scales.push_back(scale_w); + } else if (data_layout == framework::DataLayout::kNHWC) { + // NHWC + scales.push_back(scale_h); + scales.push_back(scale_w); + scales.push_back(1.f); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Data layout must be NCHW or NHWC.")); + } + layer->setScales(scales.data(), scales.size()); + + RreplenishLayerAndOutput(layer, "nearest_interp_v2", {output_name}, + test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(nearest_interp_v2, NearestInterpolateV2OpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 1898f28c73ad0b..35c9658108ab54 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -107,11 +107,26 @@ class Pool2dOpConverter : public OpConverter { plugin_pool_type = plugin::PoolPlugin::PoolType::avg; } + if (padding_algorithm == "VALID") { + std::fill(paddings.begin(), paddings.end(), 0); + } nvinfer1::DimsHW nv_ksize(ksize[0], ksize[1]); nvinfer1::DimsHW nv_strides(strides[0], strides[1]); nvinfer1::DimsHW nv_paddings(paddings[0], paddings[1]); nvinfer1::ILayer *layer = nullptr; + nvinfer1::DimsHW g_pre_pad(0, 0); + 
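// Worked example for the nearest_interp_v2 converter earlier in this diff
// (static shape, NCHW): with an input of 3x32x32 and out_h = out_w = 64,
//   scale_h = 64 / 32 = 2.f and scale_w = 64 / 32 = 2.f,
// so setScales() receives {1.f, 2.f, 2.f} (channel axis left at 1). Under
// dynamic shape an extra leading 1.f is prepended for the batch dimension,
// giving {1.f, 1.f, 2.f, 2.f}.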
nvinfer1::DimsHW g_post_pad(0, 0); + // paddle Non ceil_mode : Output size = (input size - filter size + 2 * + // padding) / stride (stride size) + 1 + // tensorrt EXPLICIT_ROUND_DOWN: O = floor((M - DK) / S) + 1 + // so if M - DK < 0 we need extra padding + if (input_shape.d[input_dims - 2] - ksize[0] + 2 * paddings[0] < 0) { + g_post_pad.h() = strides[0] - 1; + } + if (input_shape.d[input_dims - 1] - ksize[1] + 2 * paddings[1] < 0) { + g_post_pad.w() = strides[1] - 1; + } if (op_desc.HasAttr("enable_int8")) { #if IS_TRT_VERSION_GE(5000) @@ -123,6 +138,44 @@ class Pool2dOpConverter : public OpConverter { if (engine_->with_dynamic_shape()) { if (!adaptive && !global_pooling && !ceil_mode) { + // input_shape.d < 0 means we can't get shape info here. + // we may suffer from issue if shape is not met finally. + if ((padding_algorithm != "SAME") && + ((g_post_pad.w() > 0 && input_shape.d[input_dims - 2] > 0) || + (g_post_pad.h() > 0 && input_shape.d[input_dims - 1] > 0))) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + g_pre_pad, g_post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } + + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); + pool_layer->setStride(nv_strides); + pool_layer->setPadding(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + if (padding_algorithm == "SAME") { + pool_layer->setPaddingMode(nvinfer1::PaddingMode::kSAME_UPPER); + } + layer = pool_layer; + } else if (!adaptive && !global_pooling && ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); + // If ceil mode is true, we will pad the appropriate size to the input. + DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, + input_dims); + auto *pad_layer = TRT_ENGINE_ADD_LAYER( + engine_, Padding, *const_cast(input1), pre_pad, + post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, nv_pool_type, nv_ksize); pool_layer->setStride(nv_strides); @@ -157,9 +210,8 @@ class Pool2dOpConverter : public OpConverter { if (global_pooling == true) { nv_ksize.d[0] = input_shape.d[input_dims - 2]; nv_ksize.d[1] = input_shape.d[input_dims - 1]; - auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *const_cast(input1), - nv_pool_type, nv_ksize); + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); @@ -181,28 +233,38 @@ class Pool2dOpConverter : public OpConverter { } if (!adaptive) { - // Under ceil mode, the pre_pad and post_pad are used to - // record the the padding size. In some ceil mode cases, - // we do not need padding, so we initialize the two vars to 0. - - nvinfer1::DimsHW pre_pad(0, 0); - nvinfer1::DimsHW post_pad(0, 0); if (ceil_mode) { + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); // If ceil mode is true, we will pad the appropriate size to the input. 
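// Worked example for the DealCeilMode call below: with input height 14,
// ksize 3, stride 2 and padding 0, the floor formula gives
// (14 - 3 + 0) / 2 + 1 = 6 outputs while ceil mode expects
// (14 - 3 + 0 + 2 - 1) / 2 + 1 = 7, so the input is post-padded by
// stride - 1 = 1 (height 15) and the subsequent EXPLICIT_ROUND_DOWN pooling,
// (15 - 3) / 2 + 1 = 7, matches the ceil-mode output size.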
DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, input_dims); - auto *pad_layer = TRT_ENGINE_ADD_LAYER( - engine_, Padding, *const_cast(input1), pre_pad, - post_pad); + auto *pad_layer = + TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, pre_pad, post_pad); + PADDLE_ENFORCE_NOT_NULL( pad_layer, platform::errors::Fatal( "Pad layer in poolOp converter could not be " "created. The pointer to pad layer is `NULL`.")); input1 = pad_layer->getOutput(0); } - auto *pool_layer = TRT_ENGINE_ADD_LAYER( - engine_, Pooling, *const_cast(input1), - nv_pool_type, nv_ksize); +#if IS_TRT_VERSION_GE(8000) + // Exclude padding pixels from the average mean is not supported well by + // TRT + // so enable padding for trt8.0 above. + if ((g_post_pad.w() > 0 || g_post_pad.h() > 0) && + (padding_algorithm != "SAME") && !ceil_mode) { + auto *pad_layer = TRT_ENGINE_ADD_LAYER(engine_, Padding, *input1, + g_pre_pad, g_post_pad); + PADDLE_ENFORCE_NOT_NULL( + pad_layer, platform::errors::Fatal( + "Pad layer in poolOp converter could not be " + "created. The pointer to pad layer is `NULL`.")); + input1 = pad_layer->getOutput(0); + } +#endif + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, Pooling, *input1, + nv_pool_type, nv_ksize); PADDLE_ENFORCE_NOT_NULL( pool_layer, platform::errors::Fatal( "trt pool layer in converter could not be created.")); diff --git a/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc new file mode 100644 index 00000000000000..9baed499f14a78 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/pool3d_op.cc @@ -0,0 +1,228 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +inline void DealCeilMode(const nvinfer1::Dims &input_shape, + std::vector ksize, std::vector strides, + std::vector paddings, nvinfer1::DimsCHW *pre_pad, + nvinfer1::DimsCHW *post_pad, int input_dims) { + int input_depth = input_shape.d[input_dims - 3]; + int input_height = input_shape.d[input_dims - 2]; + int input_width = input_shape.d[input_dims - 1]; + + int floor_d_output_size = + (input_depth - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + int ceil_d_output_size = + (input_depth - ksize[0] + 2 * paddings[0] + strides[0] - 1) / strides[0] + + 1; + + int floor_h_output_size = + (input_height - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + int ceil_h_output_size = + (input_height - ksize[1] + 2 * paddings[1] + strides[1] - 1) / + strides[1] + + 1; + + int floor_w_output_size = + (input_width - ksize[2] + 2 * paddings[2]) / strides[2] + 1; + int ceil_w_output_size = + (input_width - ksize[2] + 2 * paddings[2] + strides[2] - 1) / strides[2] + + 1; + + if (floor_d_output_size != ceil_d_output_size) { + post_pad->c() = strides[0] - 1; + } + + if (floor_h_output_size != ceil_h_output_size) { + post_pad->h() = strides[1] - 1; + } + + if (floor_w_output_size != ceil_w_output_size) { + post_pad->w() = strides[2] - 1; + } +} + +class Pool3dOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, bool test_mode) override { + VLOG(4) + << "convert a fluid pool3d op to tensorrt pool3d layer without bias"; + framework::OpDesc op_desc(op, nullptr); + auto *input1 = engine_->GetITensor(op_desc.Input("X")[0]); + nvinfer1::Dims input_shape = input1->getDimensions(); + int input_dims = input_shape.nbDims; + + bool global_pooling = + BOOST_GET_CONST(bool, op_desc.GetAttr("global_pooling")); + std::string pool_type = + BOOST_GET_CONST(std::string, op_desc.GetAttr("pooling_type")); + std::vector ksize = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("ksize")); + std::vector strides = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("strides")); + std::vector paddings = + BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); + bool exclusive = op_desc.HasAttr("exclusive") + ? 
BOOST_GET_CONST(bool, op_desc.GetAttr("exclusive")) + : true; + bool ceil_mode = BOOST_GET_CONST(bool, op_desc.GetAttr("ceil_mode")); + bool adaptive = false; + if (op_desc.HasAttr("adaptive")) + adaptive = BOOST_GET_CONST(bool, op_desc.GetAttr("adaptive")); + std::string padding_algorithm = "EXPLICIT"; + if (op_desc.HasAttr("padding_algorithm")) + padding_algorithm = + BOOST_GET_CONST(std::string, op_desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "VALID" || padding_algorithm == "SAME") { + std::fill(paddings.begin(), paddings.end(), 0); + } + + nvinfer1::PoolingType nv_pool_type = nvinfer1::PoolingType::kMAX; + nvinfer1::ReduceOperation reduce_operation = + nvinfer1::ReduceOperation::kMAX; + plugin::Pool3DPlugin::Pool3DType plugin_pool_type = + plugin::Pool3DPlugin::Pool3DType::max; + if (pool_type == "max") { + nv_pool_type = nvinfer1::PoolingType::kMAX; + reduce_operation = nvinfer1::ReduceOperation::kMAX; + plugin_pool_type = plugin::Pool3DPlugin::Pool3DType::max; + } else if (pool_type == "avg") { + nv_pool_type = nvinfer1::PoolingType::kAVERAGE; + reduce_operation = nvinfer1::ReduceOperation::kAVG; + plugin_pool_type = plugin::Pool3DPlugin::Pool3DType::avg; + } + nvinfer1::DimsCHW nv_ksize(ksize[0], ksize[1], ksize[2]); + nvinfer1::DimsCHW nv_strides(strides[0], strides[1], strides[2]); + nvinfer1::DimsCHW nv_paddings(paddings[0], paddings[1], paddings[2]); + nvinfer1::ILayer *layer = nullptr; + if (op_desc.HasAttr("enable_int8")) { + CHECK(op_desc.HasAttr("X_scale")); + float input_scale = BOOST_GET_CONST(float, op_desc.GetAttr("X_scale")); + engine_->SetTensorDynamicRange(input1, input_scale); + } + + if (engine_->with_dynamic_shape()) { + if (!adaptive && !global_pooling && !ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, PoolingNd, *input1, + nv_pool_type, nv_ksize); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = pool_layer; + } else if (global_pooling) { + auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, + reduce_operation, 28, true); + layer = reduce_layer; + } else { + plugin::Pool3DPluginDynamic *plugin = new plugin::Pool3DPluginDynamic( + ceil_mode, pool_type, adaptive, ksize, strides, paddings, + global_pooling); + layer = engine_->AddDynamicPlugin(&input1, 1, plugin); + } + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool3d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + return; + } + + if (global_pooling == true) { + auto *reduce_layer = TRT_ENGINE_ADD_LAYER(engine_, Reduce, *input1, + reduce_operation, 14, true); + layer = reduce_layer; + auto output_name = op_desc.Output("Out")[0]; + layer->setName(("pool3d (Output: " + output_name + ")").c_str()); + layer->getOutput(0)->setName(output_name.c_str()); + engine_->SetITensor(output_name, layer->getOutput(0)); + if (test_mode) { + engine_->DeclareOutput(output_name); + } + return; + } + + if (!adaptive) { + if (!ceil_mode) { + auto *pool_layer = TRT_ENGINE_ADD_LAYER(engine_, PoolingNd, *input1, + nv_pool_type, nv_ksize); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool layer in converter could not be created.")); + pool_layer->setStrideNd(nv_strides); + pool_layer->setPaddingNd(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); + layer = 
pool_layer; + } else { + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::Pool3DPlugin *plugin = + new plugin::Pool3DPlugin(ceil_mode, plugin_pool_type, adaptive, + ksize, strides, paddings, input_shape_v); + auto *pool_layer = engine_->AddPluginV2Ext(&input1, 1, plugin); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool3d plugin layer in converter could not be created.")); + layer = pool_layer; + } + } else { + // Average pooling needs to exclude the padding pixels from the average + // mean. + // It is not supported well by TRT, we use a plugin here. + std::vector input_shape_v; + for (int i = 0; i < input_dims; i++) { + input_shape_v.push_back(input_shape.d[i]); + } + plugin::Pool3DPlugin *plugin = + new plugin::Pool3DPlugin(ceil_mode, plugin_pool_type, adaptive, ksize, + strides, paddings, input_shape_v); + auto *pool_layer = engine_->AddPluginV2Ext(&input1, 1, plugin); + PADDLE_ENFORCE_NOT_NULL( + pool_layer, + platform::errors::Fatal( + "trt pool3d plugin layer in converter could not be created.")); + layer = pool_layer; + } + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "pool3d", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(pool3d); +REGISTER_TRT_OP_CONVERTER(pool3d, Pool3dOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc new file mode 100644 index 00000000000000..c84c30255fa962 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_mish_op.cc @@ -0,0 +1,47 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(mish_op, test_mish) { + std::unordered_set parameters; + framework::Scope scope; + TRTConvertValidation validator(10, parameters, scope, 1000); + validator.DeclInputVar("mish-X", nvinfer1::Dims3(3, 2, 2)); + validator.DeclOutputVar("mish-Out", nvinfer1::Dims3(3, 2, 2)); + + // Prepare Op description + framework::OpDesc desc; + desc.SetType("mish"); + desc.SetInput("X", {"mish-X"}); + desc.SetOutput("Out", {"mish-Out"}); + + desc.SetAttr("threshold", 20.0f); + + validator.SetOp(*desc.Proto()); + + validator.Execute(1); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_OP(mish); diff --git a/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc new file mode 100644 index 00000000000000..f5ab6a99249314 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_nearest_interp_v2_op.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/fluid/inference/tensorrt/convert/op_converter.h"
+#include "paddle/fluid/inference/tensorrt/convert/ut_helper.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+
+TEST(nearest_interp_v2_op, test_nearest_interp_v2) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  TRTConvertValidation validator(10, parameters, scope, 1000);
+  validator.DeclInputVar("interp-X", nvinfer1::Dims3(3, 32, 32));
+  validator.DeclOutputVar("interp-Out", nvinfer1::Dims3(3, 64, 64));
+
+  // Prepare Op description
+  framework::OpDesc desc;
+  desc.SetType("nearest_interp_v2");
+  desc.SetInput("X", {"interp-X"});
+  desc.SetOutput("Out", {"interp-Out"});
+
+  std::vector<float> scale({2.f, 2.f});
+
+  desc.SetAttr("data_layout", "NCHW");
+  desc.SetAttr("interp_method", "nearest");
+  desc.SetAttr("align_corners", false);
+  desc.SetAttr("scale", scale);
+  desc.SetAttr("out_h", 0);
+  desc.SetAttr("out_w", 0);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
+}  // namespace tensorrt
+}  // namespace inference
+}  // namespace paddle
+
+USE_OP(nearest_interp_v2);
diff --git a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
index 2d12eaf736b754..17d217dff43fdb 100644
--- a/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
+++ b/paddle/fluid/inference/tensorrt/convert/yolo_box_op.cc
@@ -48,13 +48,20 @@ class YoloBoxOpConverter : public OpConverter {
     float conf_thresh = BOOST_GET_CONST(float, op_desc.GetAttr("conf_thresh"));
     bool clip_bbox = BOOST_GET_CONST(bool, op_desc.GetAttr("clip_bbox"));
     float scale_x_y = BOOST_GET_CONST(float, op_desc.GetAttr("scale_x_y"));
+    bool iou_aware = op_desc.HasAttr("iou_aware")
+                         ? BOOST_GET_CONST(bool, op_desc.GetAttr("iou_aware"))
+                         : false;
+    float iou_aware_factor =
+        op_desc.HasAttr("iou_aware_factor")
+            ? BOOST_GET_CONST(float, op_desc.GetAttr("iou_aware_factor"))
+            : 0.5;
     int type_id = static_cast<int>(engine_->WithFp16());
     auto input_dim = X_tensor->getDimensions();
     auto* yolo_box_plugin = new plugin::YoloBoxPlugin(
         type_id ? 
nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, - input_dim.d[1], input_dim.d[2]); + iou_aware, iou_aware_factor, input_dim.d[1], input_dim.d[2]); std::vector yolo_box_inputs; yolo_box_inputs.push_back(X_tensor); diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index 517af24f4d8a96..26182a79321993 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -135,12 +135,6 @@ void TensorRTEngine::FreezeNetwork() { } for (int j = 0; j < layer->getNbOutputs(); j++) { auto *temp_out = layer->getOutput(j); - if (temp_out->isNetworkOutput()) { - VLOG(1) << "Layer(Name: " << layer->getName() - << ") is set to float32 because its output(" - << temp_out->getName() << ") is the output of the network."; - return false; - } if (!temp_out->dynamicRangeIsSet()) { VLOG(1) << "Layer(Name: " << layer->getName() << ") is set to float32 because its output(" @@ -196,6 +190,19 @@ void TensorRTEngine::FreezeNetwork() { #if IS_TRT_VERSION_GE(6000) LOG(INFO) << "Run Paddle-TRT Dynamic Shape mode."; for (auto &input : min_input_shape_) { +#if IS_TRT_VERSION_LT(7000) + // trt6 will check all_of input > 0 + if (!(std::all_of(input.second.begin(), input.second.end(), + [](int x) { return x > 0; }) && + std::all_of(max_input_shape_[input.first].begin(), + max_input_shape_[input.first].end(), + [](int x) { return x > 0; }) && + std::all_of(optim_input_shape_[input.first].begin(), + optim_input_shape_[input.first].end(), + [](int x) { return x > 0; }))) { + continue; + } +#endif VLOG(4) << "TRT dynamic_shape set " << input.first << " min: " << Vec2Str(input.second) << ", max: " << Vec2Str(max_input_shape_[input.first]) @@ -225,6 +232,7 @@ void TensorRTEngine::FreezeNetwork() { infer_engine_.reset(infer_builder_->buildEngineWithConfig( *network(), *infer_builder_config_)); #else + infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSPARSE_WEIGHTS); infer_ptr plan(infer_builder_->buildSerializedNetwork( *network(), *infer_builder_config_)); infer_ptr runtime(createInferRuntime(&logger_)); @@ -356,6 +364,13 @@ nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2Ext( return network()->addPluginV2(inputs, num_inputs, *plugin); } +nvinfer1::IPluginV2Layer *TensorRTEngine::AddPluginV2IOExt( + nvinfer1::ITensor *const *inputs, int num_inputs, + nvinfer1::IPluginV2IOExt *plugin) { + owned_plugin_v2ioext_.emplace_back(plugin); + return network()->addPluginV2(inputs, num_inputs, *plugin); +} + void TensorRTEngine::freshDeviceId() { int count; cudaGetDeviceCount(&count); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index e22c2488d3b8b6..0e1b9fe3366cac 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -116,6 +116,17 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, input, ShapeStr(shape))); } return nvinfer1::Dims2(shape[1], shape[2]); + } else if (shape.size() == 2UL) { + if (shape[1] == -1) { + PADDLE_THROW(platform::errors::InvalidArgument( + "The input [%s] shape of trt subgraph is %s, please enable " + "trt dynamic_shape mode by SetTRTDynamicShapeInfo.", + input, ShapeStr(shape))); + } + nvinfer1::Dims dims; + dims.nbDims = 1; + dims.d[0] = shape[1]; + return dims; } return nvinfer1::Dims3(shape[1], 1, 1); } else { @@ -323,6 +334,10 @@ class TensorRTEngine { int num_inputs, plugin::PluginTensorRTV2Ext* plugin); + 
nvinfer1::IPluginV2Layer* AddPluginV2IOExt(nvinfer1::ITensor* const* inputs, + int num_inputs, + nvinfer1::IPluginV2IOExt* plugin); + void SetTensorDynamicRange(nvinfer1::ITensor* tensor, float range) { quant_dynamic_range_[tensor] = range; } @@ -429,6 +444,7 @@ class TensorRTEngine { bool with_ernie() { return with_ernie_; } bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool with_dynamic_shape() { return with_dynamic_shape_; } + AnalysisConfig::Precision precision() { return precision_; } #if IS_TRT_VERSION_GE(6000) nvinfer1::IPluginV2Layer* AddDynamicPlugin( @@ -550,6 +566,7 @@ class TensorRTEngine { std::vector> owned_plugin_; std::vector> owned_plugin_v2ext_; + std::vector> owned_plugin_v2ioext_; // TensorRT related internal members template diff --git a/paddle/fluid/inference/tensorrt/helper.h b/paddle/fluid/inference/tensorrt/helper.h index 16595b8a032988..b8051d8610442f 100644 --- a/paddle/fluid/inference/tensorrt/helper.h +++ b/paddle/fluid/inference/tensorrt/helper.h @@ -73,8 +73,24 @@ static nvinfer1::IPluginRegistry* GetPluginRegistry() { static int GetInferLibVersion() { return static_cast(dy::getInferLibVersion()); } +#else +static int GetInferLibVersion() { return 0; } #endif +static std::tuple GetTrtRuntimeVersion() { + int ver = GetInferLibVersion(); + int major = ver / 1000; + ver -= major * 1000; + int minor = ver / 100; + int patch = ver - minor * 100; + return std::tuple{major, minor, patch}; +} + +static std::tuple GetTrtCompileVersion() { + return std::tuple{NV_TENSORRT_MAJOR, NV_TENSORRT_MINOR, + NV_TENSORRT_PATCH}; +} + // A logger for create TensorRT infer builder. class NaiveLogger : public nvinfer1::ILogger { public: diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 5bfd2f12777952..13504f444109b7 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -48,15 +48,19 @@ struct SimpleOpTypeSetTeller : public Teller { int8_teller_set.insert("skip_layernorm"); int8_teller_set.insert("slice"); #endif -#if IS_TRT_VERSION_GE(7130) - teller_set.insert("group_norm"); -#endif +// TODO(baoachun) The group_norm trt plugin will check input's dim +// not -1 failed when dynamic shape mode. 
+// #if IS_TRT_VERSION_GE(7130) +// teller_set.insert("group_norm"); +// #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); #endif #if CUDA_VERSION >= 10020 teller_set.insert("reshape"); teller_set.insert("reshape2"); + int8_teller_set.insert("reshape"); + int8_teller_set.insert("reshape2"); #endif } @@ -89,7 +93,9 @@ struct SimpleOpTypeSetTeller : public Teller { "scale", "elementwise_mul", "conv2d_transpose", - "hard_swish"}; + "hard_swish", + "transpose", + "transpose2"}; std::unordered_set teller_set{"mul", "matmul", "conv2d", @@ -134,7 +140,10 @@ struct SimpleOpTypeSetTeller : public Teller { "reduce_sum", "reduce_mean", "conv3d", - "conv3d_transpose"}; + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "pool3d"}; }; bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, @@ -166,27 +175,19 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << " op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "activation op does not support input's dim is 2 in " + "tensorrt static shape, the output shape has diff."; + return false; + } } if (op_type == "pool2d") { std::vector paddings = BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - if (paddings.size() > 2) return false; - if (desc.HasAttr("exclusive")) { - if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { - std::vector ksize = - BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); - for (size_t i = 0; i < ksize.size(); i++) { - if (ksize[i] <= paddings[i]) { - VLOG(3) << "the padding size should be less than the filter size " - "for exclusive-counting pooling."; - return false; - } - } - } - } - if (desc.HasAttr("ceil_mode")) { - if (BOOST_GET_CONST(bool, desc.GetAttr("ceil_mode"))) return false; + if (paddings.size() > 2) { + return false; } if (desc.Input("X").size() != 1) { VLOG(3) << "TRT Pool2d expect 1 input, but got " @@ -208,15 +209,32 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << pool_type << " pool type."; return false; } + if (pool_type == "avg") { + if (desc.HasAttr("global_pooling")) { + if (!BOOST_GET_CONST(bool, desc.GetAttr("global_pooling"))) { + if (desc.HasAttr("exclusive")) { + if (BOOST_GET_CONST(bool, desc.GetAttr("exclusive"))) { + std::vector ksize = + BOOST_GET_CONST(std::vector, desc.GetAttr("ksize")); + for (size_t i = 0; i < ksize.size(); i++) { + if (ksize[i] <= paddings[i]) { + VLOG(3) << "the padding size should be less than the " + "filter size " + "for exclusive-counting pooling."; + return false; + } + } + } + } + } + } + } } } if (op_type == "conv2d" || op_type == "conv2d_transpose" || op_type == "conv2d_fusion" || op_type == "depthwise_conv2d" || op_type == "depthwise_conv2d_transpose") { - std::vector paddings = - BOOST_GET_CONST(std::vector, desc.GetAttr("paddings")); - if (desc.Input("Input").size() != 1) { VLOG(3) << "TRT Conv2d expect 1 input, but got " << desc.Input("Input").size() << " input."; @@ -232,9 +250,31 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (desc.HasAttr("padding_algorithm")) { auto padding_algorithm = BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); - if (padding_algorithm == "SAME" || padding_algorithm == "VALID") { + if (padding_algorithm == "VALID") { return false; } + if (padding_algorithm == "SAME") { + if (desc.HasAttr("dilations")) { + const std::vector dilations = + BOOST_GET_CONST(std::vector, 
desc.GetAttr("dilations")); + if (dilations[0] != 1 || dilations[1] != 1) { + VLOG(3) << "In Same mode, Dilations must be (1, 1) for " + "tensorRT, but given (" + << dilations[0] << ", " << dilations[1] << ")"; + return false; + } + } + } + } + + if (use_no_calib_int8) { + if (desc.HasAttr("padding_algorithm")) { + auto padding_algorithm = + BOOST_GET_CONST(std::string, desc.GetAttr("padding_algorithm")); + if (padding_algorithm == "SAME") { + return false; + } + } } if (desc.HasAttr("enable_int8")) { @@ -300,6 +340,26 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, "the pass."; return false; } + + // not support broadcast + auto* x_var_desc = block->FindVar(desc.Input("X")[0]); + auto* y_var_desc = block->FindVar(desc.Input("Y")[0]); + const auto x_shape = x_var_desc->GetShape(); + const auto y_shape = y_var_desc->GetShape(); + if (x_shape.size() != y_shape.size()) { + VLOG(3) + << "matmul op not support broadcast, please check inputs'shape. "; + return false; + } + uint64_t dims = 2; + for (size_t i = 0; i < x_shape.size() - dims; ++i) { + if (x_shape[i] != y_shape[i] && (x_shape[i] == 1 || y_shape[i] == 1)) { + VLOG(3) << "matmul op not support broadcast, please check " + "inputs'shape[i]. "; + return false; + } + } + for (auto& param_name : desc.Inputs()) { for (auto& var_name : param_name.second) { auto* var_desc = block->FindVar(var_name); @@ -313,6 +373,24 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } } + if (op_type == "softmax") { + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "softmax op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } + } if (op_type == "group_norm") { if (!with_dynamic_shape) return false; bool has_attrs = (desc.HasAttr("epsilon") && desc.HasAttr("groups")); @@ -324,20 +402,35 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, if (op_type == "concat") { if (!desc.HasAttr("axis")) { return false; + } + int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); + if (with_dynamic_shape) { + if (axis < 0) return false; } else { - int axis = BOOST_GET_CONST(int, desc.GetAttr("axis")); - if (with_dynamic_shape) { - if (axis < 0) return false; - } else { - if (axis <= 0) return false; - } - auto concat_inputs = desc.Inputs(); - if (concat_inputs.find("AxisTensor") != concat_inputs.end()) { - if (desc.Input("AxisTensor").size() >= 1) { - return false; - } + if (axis <= 0) return false; + } + auto concat_inputs = desc.Inputs(); + if (concat_inputs.find("AxisTensor") != concat_inputs.end()) { + if (desc.Input("AxisTensor").size() >= 1) { + return false; } } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "concat op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "transpose2" || op_type == "transpose") { if (!desc.HasAttr("axis")) { @@ -567,6 +660,33 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } } + if (op_type == "nearest_interp_v2") { + std::vector attrs{"data_layout", "interp_method", + "align_corners", "scale", + "out_h", "out_w"}; + for (auto const attr : attrs) { + if (!desc.HasAttr(attr)) return false; + } + auto data_layout = framework::StringToDataLayout( + BOOST_GET_CONST(std::string, desc.GetAttr("data_layout"))); + if (data_layout != framework::DataLayout::kNCHW && + data_layout != framework::DataLayout::kNHWC) + return false; + auto interp_method = + BOOST_GET_CONST(std::string, desc.GetAttr("interp_method")); + if (interp_method != "nearest") return false; + auto scale = BOOST_GET_CONST(std::vector, desc.GetAttr("scale")); + auto out_h = BOOST_GET_CONST(int, desc.GetAttr("out_h")); + auto out_w = BOOST_GET_CONST(int, desc.GetAttr("out_w")); + if (!(out_h > 0 && out_w > 0)) { + if (scale[0] <= 0.f || scale[1] <= 0.f) { + VLOG(3) << "scale factor must be greater than 0 if out_h or out_w is " + "not set."; + return false; + } + } + } + if (op_type == "roi_align") { if (!with_dynamic_shape) return false; @@ -627,6 +747,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Y").size() << "."; return false; } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "batch_norm op does not support input's dim is 2 in " + "tensorrt static shape, the output shape has diff."; + return false; + } } if (op_type == "split") { @@ -714,6 +850,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "The output_length should be equal to the output size."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "split op does not support input's dim is 2 in tensorrt " + "static shape. 
The output shape has diff."; + return false; + } } if (op_type == "scale") { auto scale_inputs = desc.Inputs(); @@ -866,6 +1008,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "gelu op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "gelu op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "layer_norm") { @@ -916,6 +1064,22 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, << desc.Output("Y").size(); return false; } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() != 4) { + VLOG(3) << "The instance_norm op only support 4-dimensional input in " + "tensorrt."; + return false; + } } if (op_type == "leaky_relu") { @@ -981,7 +1145,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, auto* x_var_desc = block->FindVar(x_var_name); const auto x_shape = x_var_desc->GetShape(); if (x_shape.size() == 1) { - VLOG(3) << "dropout op does not support input's dim is 1 in tensorrt."; + VLOG(3) << "scale op does not support input's dim is 1 in tensorrt."; + return false; + } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "scale op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; return false; } } @@ -1001,6 +1171,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "swish op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "swish op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "prelu") { @@ -1044,6 +1220,52 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } } + +#if IS_TRT_VERSION_LT(7000) + if (!with_dynamic_shape) { + // TODO(inference): fix trt6 static plugin error. + VLOG(3) << "prelu static plugin in trt6 has bug."; + return false; + } +#endif + } + + if (op_type == "mish") { + if (desc.Input("X").size() != 1) { + VLOG(3) << "Invalid input X's size of mish TRT converter. " + "Expected 1, received " + << desc.Input("X").size() << "."; + return false; + } + if (desc.Output("Out").size() != 1) { + VLOG(3) << "Invalid output Out's size of mish TRT converter. " + "Expected 1, received " + << desc.Output("Out").size() << "."; + return false; + } + + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. 
" + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + + auto x_var_name = desc.Input("X")[0]; + auto* x_var_desc = block->FindVar(x_var_name); + const auto x_shape = x_var_desc->GetShape(); + if (x_shape.size() == 1) { + VLOG(3) << "mish op does not support input's dim is 1 in tensorrt."; + return false; + } + + if (!with_dynamic_shape) { + if (x_shape.size() == 2) { + VLOG(3) << "mish op does not support input's dim is 2 in tensorrt."; + return false; + } + } } if (op_type == "roi_align") { @@ -1144,6 +1366,47 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, } if (op_type == "fc") { + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + + // y'shapes == 2 + auto fc_inputs = desc.Inputs(); + std::string fc_y = ""; + if (fc_inputs.find("Y") != fc_inputs.end()) { + fc_y = "Y"; + } else if (fc_inputs.find("W") != fc_inputs.end()) { + fc_y = "W"; + } else { + VLOG(3) << " input_y(fc_op) must be Y or W "; + return false; + } + + // There is currently no input: Y(weight) more than two dimensions + /* + auto* y_var_desc = block->FindVar(desc.Input(fc_y)[0]); + const auto y_shape = y_var_desc->GetShape(); + if (y_shape.size() != 2) { + VLOG(3) + << " input_y(fc_op)'shapes must be 2, but input_y(fc_op)'shapes = " + << y_shape.size(); + return false; + } + // y_num_col_dims ==1 + if (desc.HasAttr("y_num_col_dims")) { + int y_num_col_dims = + BOOST_GET_CONST(int, desc.GetAttr("y_num_col_dims")); + if (y_num_col_dims != 1) { + VLOG(3) << " fc_op'y_num_col_dims must be 1, but y_num_col_dims = " + << y_num_col_dims; + return false; + } + } + */ int x_num_col_dims = desc.HasAttr("x_num_col_dims") ? BOOST_GET_CONST(int, desc.GetAttr("x_num_col_dims")) @@ -1151,8 +1414,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, ? 
BOOST_GET_CONST(int, desc.GetAttr("in_num_col_dims")) : 1); if (x_num_col_dims < 1) { - VLOG(3) << "converter expects x_num_col_dims >= 1, " - "but x_num_col_dims = %d."; + VLOG(3) << "fc_op expects x_num_col_dims >= 1, " + "but x_num_col_dims = " + << x_num_col_dims; return false; } } @@ -1208,6 +1472,12 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, VLOG(3) << "clip op does not support input's dim is 1 in tensorrt."; return false; } + // TODO(inference): fix + if (x_shape.size() == 2 && !with_dynamic_shape) { + VLOG(3) << "clip op does not support input's dim is 2 in tensorrt " + "static shape, the output shape has diff."; + return false; + } } if (op_type == "reduce_sum" || op_type == "reduce_mean") { diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 311c2312a9f45b..9e93894e623c00 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -9,6 +9,8 @@ nv_library(tensorrt_plugin yolo_box_op_plugin.cu roi_align_op_plugin.cu gather_nd_op_plugin.cu + mish_op_plugin.cu + pool3d_op_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) nv_test(test_split_plugin SRCS test_split_plugin.cc DEPS diff --git a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu index 69e0075729b0dc..d6a1cdb9e68a65 100644 --- a/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/elementwise_op_plugin.cu @@ -65,12 +65,6 @@ nvinfer1::Dims ElementWisePlugin::getOutputDimensions( } int ElementWisePlugin::initialize() TRT_NOEXCEPT { - PADDLE_ENFORCE_GT(dims_y_.nbDims, 0, - platform::errors::InvalidArgument( - "The dimension of input Y of TRT elementwise op plugin " - "should be greater than 0, but got %d.", - dims_y_.nbDims)); - axis_ = (axis_ == -1) ? 
dims_x_.nbDims - dims_y_.nbDims : axis_; int trimed_nb_dims = dims_y_.nbDims; for (; trimed_nb_dims > 0; --trimed_nb_dims) { diff --git a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h index c0ee608c39dabc..475c908c13bbf2 100644 --- a/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/hard_swish_op_plugin.h @@ -161,7 +161,7 @@ class HardSwishPluginDynamicCreator : public nvinfer1::IPluginCreator { public: HardSwishPluginDynamicCreator() {} const char* getPluginName() const TRT_NOEXCEPT override { - return "hardswish_plugin_dynamic"; + return "hard_swish_plugin_dynamic"; } const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } diff --git a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu index b7c4fb7c99acfd..a9a50543e7bb70 100644 --- a/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/instance_norm_op_plugin.cu @@ -65,11 +65,6 @@ int InstanceNormPlugin::enqueue(int batch_size, const void *const *inputs, #endif cudaStream_t stream) TRT_NOEXCEPT { const auto &input_dims = this->getInputDims(0); - - PADDLE_ENFORCE_EQ(input_dims.nbDims, 3, - platform::errors::InvalidArgument( - "Input Dims should be 3 (except the batch), got %d", - input_dims.nbDims)); int n = batch_size; int c = input_dims.d[0]; int h = input_dims.d[1]; diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu new file mode 100644 index 00000000000000..6e268e7b0b330d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.cu @@ -0,0 +1,235 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
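For orientation, the CUDA kernels added below implement the element-wise mish activation, mish(x) = x * tanh(softplus(x)), where softplus is short-circuited to x once the input exceeds a configurable threshold so that exp() cannot overflow. A minimal host-side sketch of the same math (illustrative only; mish_reference is not part of this patch, and the default threshold mirrors the 20.0f used by the unit test above):

#include <cmath>

// Mirrors the thresholded softplus used by the device kernels below;
// `threshold` plays the same role as MishPlugin::threshold_.
static float mish_reference(float x, float threshold = 20.0f) {
  const float softplus = x > threshold ? x : std::log(std::exp(x) + 1.0f);
  return x * std::tanh(softplus);
}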
+
+#include
+#include "glog/logging.h"
+#include "paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h"
+
+namespace paddle {
+namespace inference {
+namespace tensorrt {
+namespace plugin {
+
+int MishPlugin::initialize() TRT_NOEXCEPT { return 0; }
+
+bool MishPlugin::supportsFormat(
+    nvinfer1::DataType type, nvinfer1::PluginFormat format) const TRT_NOEXCEPT {
+  if (with_fp16_) {
+    return ((type == nvinfer1::DataType::kFLOAT ||
+             type == nvinfer1::DataType::kHALF) &&
+            (format == nvinfer1::PluginFormat::kLINEAR));
+  } else {
+    return ((type == nvinfer1::DataType::kFLOAT) &&
+            (format == nvinfer1::PluginFormat::kLINEAR));
+  }
+}
+
+nvinfer1::Dims MishPlugin::getOutputDimensions(int index,
+                                               const nvinfer1::Dims* in_dims,
+                                               int nb_inputs) TRT_NOEXCEPT {
+  PADDLE_ENFORCE_EQ(nb_inputs, 1, platform::errors::InvalidArgument(
+                                      "We expect [number of inputs] == 1"
+                                      "in TRT Mish op plugin, but got "
+                                      "[number of inputs] = %d.",
+                                      nb_inputs));
+  PADDLE_ENFORCE_LT(index, this->getNbOutputs(),
+                    platform::errors::InvalidArgument(
+                        "We expect [index] < [number of outputs]"
+                        "in TRT Mish op plugin, but got "
+                        "[index] = %d, [number of outputs] = %d.",
+                        index, this->getNbOutputs()));
+  nvinfer1::Dims const& input_dims = in_dims[0];
+  nvinfer1::Dims output_dims = input_dims;
+  return output_dims;
+}
+
+template <typename T>
+__device__ T kTanh(T x) {
+  return tanh(x);
+}
+
+template <>
+__device__ half kTanh(half x) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+  const float tmp = tanhf(__half2float(x));
+  return __float2half(tmp);
+#endif
+}
+
+template <typename T>
+__device__ T kSoftplus(T x, T threshold) {
+  return x > threshold ? x : log(exp(x) + static_cast<T>(1.0f));
+}
+
+template <>
+__device__ half kSoftplus(half x, half threshold) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+  return x > threshold ? x : hlog(hexp(x) + static_cast<half>(1.0f));
+#endif
+}
+
+template <typename T>
+__global__ void mish_kernel(float threshold, int n, const T* input,
+                            T* output) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    const T in = input[idx];
+    output[idx] = in * kTanh(kSoftplus(in, static_cast<T>(threshold)));
+  }
+}
+
+template <>
+__global__ void mish_kernel(float threshold, int n, const half* input,
+                            half* output) {
+#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__)
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    const half in = input[idx];
+    output[idx] =
+        in * kTanh(kSoftplus(in, static_cast<half>(threshold)));
+  }
+#endif
+}
+
+#if IS_TRT_VERSION_LT(8000)
+int MishPlugin::enqueue(int batchSize, const void* const* inputs,
+                        void** outputs,
+#else
+int MishPlugin::enqueue(int batchSize, const void* const* inputs,
+                        void* const* outputs,
+#endif
+                        void* workspace, cudaStream_t stream) TRT_NOEXCEPT {
+  const auto& input_dims = this->getInputDims(0);
+  int num = batchSize;
+  for (int i = 0; i < input_dims.nbDims; i++) {
+    num *= input_dims.d[i];
+  }
+
+  const int block_size = 256;
+  const int grid_size = (num + block_size - 1) / block_size;
+
+  auto type = getDataType();
+  if (type == nvinfer1::DataType::kFLOAT) {
+    VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32";
+    const float* input = static_cast<const float*>(inputs[0]);
+    float* output = static_cast<float*>(outputs[0]);
+    mish_kernel<<<grid_size, block_size, 0, stream>>>(threshold_, num,
+                                                      input, output);
+  } else if (type == nvinfer1::DataType::kHALF) {
+    VLOG(1) << "TRT Plugin DataType selected. 
Mish-->fp16"; + const half* input = static_cast(inputs[0]); + half* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Mish TRT Plugin's input type should be float or half.")); + } + + return cudaGetLastError() != cudaSuccess; +} + +// Dynamic Plugin below. +int MishPluginDynamic::initialize() TRT_NOEXCEPT { + getPluginNamespace(); + return 0; +} + +size_t MishPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { + return SerializedSize(threshold_) + SerializedSize(with_fp16_); +} + +void MishPluginDynamic::serialize(void* buffer) const TRT_NOEXCEPT { + SerializeValue(&buffer, threshold_); + SerializeValue(&buffer, with_fp16_); +} + +nvinfer1::DimsExprs MishPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { + return inputs[0]; +} + +bool MishPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* in_out, int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of mish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + + const nvinfer1::PluginTensorDesc& in = in_out[pos]; + if (pos == 0) { + if (with_fp16_) { + return (in.type == nvinfer1::DataType::kFLOAT || + in.type == nvinfer1::DataType::kHALF) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in.type == nvinfer1::DataType::kFLOAT) && + (in.format == nvinfer1::TensorFormat::kLINEAR); + } + } + const nvinfer1::PluginTensorDesc& prev = in_out[pos - 1]; + // output + return in.type == prev.type && in.format == prev.format; +} + +nvinfer1::DataType MishPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The Mish Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + return input_types[0]; +} + +int MishPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + auto input_dims = input_desc[0].dims; + size_t num = ProductDim(input_dims); + const int block_size = 256; + const int grid_size = (num + block_size - 1) / block_size; + + auto input_type = input_desc[0].type; + if (input_type == nvinfer1::DataType::kFLOAT) { + VLOG(1) << "TRT Plugin DataType selected. Mish-->fp32"; + const float* input = static_cast(inputs[0]); + float* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else if (input_type == nvinfer1::DataType::kHALF) { + VLOG(1) << "TRT Plugin DataType selected. 
Mish-->fp16"; + const half* input = static_cast(inputs[0]); + half* output = static_cast(outputs[0]); + mish_kernel<<>>(threshold_, num, + input, output); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "The Mish TRT Plugin's input type should be float or half.")); + } + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h new file mode 100644 index 00000000000000..75390666ea097f --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h @@ -0,0 +1,175 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class MishPlugin : public PluginTensorRT { + private: + float threshold_; + + protected: + size_t getSerializationSize() const TRT_NOEXCEPT override { + return getBaseSerializationSize() + SerializedSize(threshold_); + } + + // TRT will call this func to serialize the configuration of TRT + // It should not be called by users. + void serialize(void* buffer) const TRT_NOEXCEPT override { + serializeBase(buffer); + SerializeValue(&buffer, threshold_); + } + + public: + explicit MishPlugin(const float threshold, const bool with_fp16) + : threshold_(threshold) { + with_fp16_ = with_fp16; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. 
+ MishPlugin(void const* serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &threshold_); + } + + ~MishPlugin() {} + MishPlugin* clone() const TRT_NOEXCEPT override { + return new MishPlugin(threshold_, with_fp16_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "mish_plugin"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + bool supportsFormat(nvinfer1::DataType type, nvinfer1::PluginFormat format) + const TRT_NOEXCEPT override; + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT override; +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; +}; + +class MishPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "mish_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new MishPlugin(serial_data, serial_length); + } +}; + +REGISTER_TRT_PLUGIN_V2(MishPluginCreator); + +class MishPluginDynamic : public DynamicPluginTensorRT { + public: + explicit MishPluginDynamic(const float threshold, const bool with_fp16) + : threshold_(threshold) { + with_fp16_ = with_fp16; + } + MishPluginDynamic(void const* serialData, size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &threshold_); + DeserializeValue(&serialData, &serialLength, &with_fp16_); + } + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override { + return new MishPluginDynamic(threshold_, with_fp16_); + } + + const char* getPluginType() const TRT_NOEXCEPT override { + return "mish_plugin_dynamic"; + } + int getNbOutputs() const TRT_NOEXCEPT override { return 1; } + int initialize() TRT_NOEXCEPT override; + + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override { + return 0; + } + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + float threshold_; +}; + +class MishPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* 
getPluginName() const TRT_NOEXCEPT override { + return "mish_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + auto plugin = new MishPluginDynamic(serial_data, serial_length); + return plugin; + } +}; + +REGISTER_TRT_PLUGIN_V2(MishPluginDynamicCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu new file mode 100644 index 00000000000000..861a9aa9d000bf --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.cu @@ -0,0 +1,375 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, softwarepool +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h" +#include "paddle/fluid/operators/math/pooling.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +size_t Pool3DPlugin::getSerializationSize() const TRT_NOEXCEPT { + return getBaseSerializationSize() + SerializedSize(ceil_mode_) + + SerializedSize(pool3d_type_) + SerializedSize(adaptive_) + + SerializedSize(ksize_) + SerializedSize(strides_) + + SerializedSize(paddings_) + SerializedSize(input_shape_) + + SerializedSize(output_shape_); +} + +// TRT will call this func when we need to serialize the configuration of +// tensorrt. 
+void Pool3DPlugin::serialize(void *buffer) const TRT_NOEXCEPT { + serializeBase(buffer); + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, pool3d_type_); + SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, input_shape_); + SerializeValue(&buffer, output_shape_); +} + +Pool3DPlugin *Pool3DPlugin::clone() const TRT_NOEXCEPT { + return new Pool3DPlugin(ceil_mode_, pool3d_type_, adaptive_, ksize_, strides_, + paddings_, input_shape_); +} + +const char *Pool3DPlugin::getPluginType() const TRT_NOEXCEPT { + return "pool3d_plugin"; +} + +int Pool3DPlugin::getNbOutputs() const TRT_NOEXCEPT { return 1; } + +int Pool3DPlugin::initialize() TRT_NOEXCEPT { return 0; } + +nvinfer1::DataType Pool3DPlugin::getOutputDataType( + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { + return input_types[0]; +} + +void Pool3DPlugin::destroy() TRT_NOEXCEPT { delete this; } + +nvinfer1::Dims Pool3DPlugin::getOutputDimensions( + int index, const nvinfer1::Dims *inputDims, int nbInputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nbInputs, 1, + platform::errors::InvalidArgument( + "The Pool3D Plugin only has one input, so the nbInputs " + "value should be 1, but get %d.", + nbInputs)); + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The Pool3D Plugin only has one input, so " + "the index value should be 0, but get %d.", + index)); + PADDLE_ENFORCE_EQ(inputDims[0].nbDims, 4, + platform::errors::InvalidArgument( + "The Pool3D Plugin only has four Dimensions, so the " + "nbDims value should be 4, but get %d.", + inputDims[0].nbDims)); + + nvinfer1::Dims const &input_dims = inputDims[0]; + + nvinfer1::Dims output_dims = input_dims; + + output_dims.d[1] = output_shape_[1]; + output_dims.d[2] = output_shape_[2]; + output_dims.d[3] = output_shape_[3]; + return output_dims; +} + +int Pool3DPlugin::enqueue(int batchSize, const void *const *inputs, +#if IS_TRT_VERSION_LT(8000) + void **outputs, void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { +#else + void *const *outputs, void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { +#endif + int input_size = 0; + float const *idata = reinterpret_cast(inputs[0]); + float *const *odatas = reinterpret_cast(outputs); + + std::vector input_shape = input_shape_; + std::vector output_shape = output_shape_; + input_shape.insert(input_shape.begin(), batchSize); + output_shape.insert(output_shape.begin(), batchSize); + + if (pool3d_type_ == Pool3DType::max) { + paddle::operators::math::MaxPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::MaxPool, float> + pool3d_forward; + pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, + paddings_, true, adaptive_, odatas[0], stream, pool_process); + } else if (pool3d_type_ == Pool3DType::avg) { + paddle::operators::math::AvgPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::AvgPool, float> + pool3d_forward; + pool3d_forward(idata, input_shape, output_shape, ksize_, strides_, + paddings_, true, adaptive_, odatas[0], stream, pool_process); + } + + return cudaGetLastError() != cudaSuccess; +} + +// Dynamic Plugin below. 
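The dynamic-shape code below rebuilds the standard pooling output size out of IDimensionExpr arithmetic (kSUM / kFLOOR_DIV). In plain integer form, the per-axis size that getOutputDimensions() encodes is the following (a readability sketch only; pooled_size is not a function in this patch):

// Floor division gives the ceil_mode_ == false size; adding stride - 1 to the
// numerator first gives the ceil_mode_ == true size, exactly as the
// expression trees in getOutputDimensions() below are built.
static int pooled_size(int in, int ksize, int pad, int stride, bool ceil_mode) {
  const int numerator = in - ksize + 2 * pad + (ceil_mode ? stride - 1 : 0);
  return numerator / stride + 1;
}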
+ +Pool3DPluginDynamic::Pool3DPluginDynamic(void const *serialData, + size_t serialLength) { + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + const char *pool3d_type; + DeserializeValue(&serialData, &serialLength, &pool3d_type); + pool3d_type_ = std::string(pool3d_type); + DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &is_global_); +} + +nvinfer1::IPluginV2DynamicExt *Pool3DPluginDynamic::clone() const TRT_NOEXCEPT { + return new Pool3DPluginDynamic(ceil_mode_, pool3d_type_, adaptive_, ksize_, + strides_, paddings_, is_global_); +} + +const char *Pool3DPluginDynamic::getPluginType() const TRT_NOEXCEPT { + return "pool3d_plugin_dynamic"; +} +int Pool3DPluginDynamic::getNbOutputs() const TRT_NOEXCEPT { return 1; } + +int Pool3DPluginDynamic::initialize() TRT_NOEXCEPT { return 0; } + +void Pool3DPluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc *in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc *out, int nbOutputs) TRT_NOEXCEPT {} + +size_t Pool3DPluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc *inputs, int nbInputs, + const nvinfer1::PluginTensorDesc *outputs, + int nbOutputs) const TRT_NOEXCEPT { + return 0; +} + +size_t Pool3DPluginDynamic::getSerializationSize() const TRT_NOEXCEPT { + return SerializedSize(ceil_mode_) + SerializedSize(pool3d_type_.c_str()) + + SerializedSize(adaptive_) + SerializedSize(ksize_) + + SerializedSize(strides_) + SerializedSize(paddings_) + + SerializedSize(is_global_); +} + +void Pool3DPluginDynamic::serialize(void *buffer) const TRT_NOEXCEPT { + SerializeValue(&buffer, ceil_mode_); + SerializeValue(&buffer, pool3d_type_.c_str()); + SerializeValue(&buffer, adaptive_); + SerializeValue(&buffer, ksize_); + SerializeValue(&buffer, strides_); + SerializeValue(&buffer, paddings_); + SerializeValue(&buffer, is_global_); +} + +nvinfer1::DimsExprs Pool3DPluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs *inputs, int nb_inputs, + nvinfer1::IExprBuilder &expr_builder) TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(nb_inputs, 1, + platform::errors::InvalidArgument( + "The Split plugin should be only one input.")); + + PADDLE_ENFORCE_EQ( + inputs[0].d[1]->isConstant(), true, + platform::errors::InvalidArgument("The channel dimension should be " + "static, but we found it's dynamic.")); + nvinfer1::DimsExprs output(inputs[0]); + if (is_global_) { + output.d[2] = expr_builder.constant(1); + output.d[3] = expr_builder.constant(1); + output.d[4] = expr_builder.constant(1); + return output; + } + if (adaptive_) { + output.d[2] = expr_builder.constant(ksize_[0]); + output.d[3] = expr_builder.constant(ksize_[1]); + output.d[4] = expr_builder.constant(ksize_[2]); + return output; + } + + auto stri_0 = expr_builder.constant(strides_[0]); + auto stri_1 = expr_builder.constant(strides_[1]); + auto stri_2 = expr_builder.constant(strides_[2]); + auto one_value = expr_builder.constant(1); + + auto v0_tmp = expr_builder.constant(-ksize_[0] + 2 * paddings_[0]); + auto v1_tmp = expr_builder.constant(-ksize_[1] + 2 * paddings_[1]); + auto v2_tmp = expr_builder.constant(-ksize_[2] + 2 * paddings_[2]); + + auto ceil_tmp = + expr_builder.constant(-ksize_[0] + 2 * paddings_[0] + strides_[0] - 1); + auto ceil1_tmp = + expr_builder.constant(-ksize_[1] + 2 * paddings_[1] + 
strides_[1] - 1); + auto ceil2_tmp = + expr_builder.constant(-ksize_[2] + 2 * paddings_[2] + strides_[2] - 1); + + if (!ceil_mode_) { + output.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[2], *v0_tmp), + *stri_0), + *one_value); + output.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[3], *v1_tmp), + *stri_1), + *one_value); + output.d[4] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[4], *v2_tmp), + *stri_2), + *one_value); + + } else { + output.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[2], *ceil_tmp), + *stri_0), + *one_value); + output.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[3], *ceil1_tmp), + *stri_1), + *one_value); + output.d[4] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *inputs[0].d[4], *ceil2_tmp), + *stri_2), + *one_value); + } + + return output; +} + +bool Pool3DPluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + PADDLE_ENFORCE_NOT_NULL( + in_out, platform::errors::InvalidArgument( + "The input of swish plugin shoule not be nullptr.")); + + PADDLE_ENFORCE_LT( + pos, nb_inputs + nb_outputs, + platform::errors::InvalidArgument("The pos(%d) should be less than the " + "num(%d) of the input and the output.", + pos, nb_inputs + nb_outputs)); + (in_out && pos < (nb_inputs + nb_outputs)); + + return ((in_out[pos].type == nvinfer1::DataType::kFLOAT) && + in_out[pos].format == nvinfer1::PluginFormat::kLINEAR); +} + +nvinfer1::DataType Pool3DPluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType *input_types, + int nb_inputs) const TRT_NOEXCEPT { + PADDLE_ENFORCE_EQ(index, 0, + platform::errors::InvalidArgument( + "The Pool3D Plugin only has one input, so the " + "index value should be 0, but get %d.", + index)); + PADDLE_ENFORCE_EQ((input_types[0] == nvinfer1::DataType::kFLOAT), true, + platform::errors::InvalidArgument( + "The input type should be half or float")); + return input_types[0]; +} + +int Pool3DPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, + const nvinfer1::PluginTensorDesc *output_desc, + const void *const *inputs, + void *const *outputs, void *workspace, + cudaStream_t stream) TRT_NOEXCEPT { + auto input_dims = input_desc[0].dims; + int n = input_dims.d[0]; + int c = input_dims.d[1]; + int d = input_dims.d[2]; + int h = input_dims.d[3]; + int w = input_dims.d[4]; + + const float *input = static_cast(inputs[0]); + float *output = static_cast(outputs[0]); + + std::vector input_shape, output_shape; + for (int i = 0; i < input_dims.nbDims; i++) + 
input_shape.push_back(input_dims.d[i]); + output_shape = input_shape; + + std::vector ksize = ksize_; + std::vector paddings = paddings_; + if (is_global_) { + ksize[0] = d; + ksize[1] = h; + ksize[2] = w; + paddings[0] = 0; + paddings[1] = 0; + paddings[2] = 0; + output_shape[2] = 1; + output_shape[3] = 1; + output_shape[4] = 1; + } else { + auto data_dim = CalcOutputSize({d, h, w}, ceil_mode_, adaptive_, ksize_, + strides_, paddings_); + output_shape[2] = data_dim[0]; + output_shape[3] = data_dim[1]; + output_shape[4] = data_dim[2]; + } + + if (pool3d_type_ == "max") { + paddle::operators::math::MaxPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::MaxPool, float> + pool3d_forward; + pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, + true, adaptive_, output, stream, pool_process); + } else if (pool3d_type_ == "avg") { + paddle::operators::math::AvgPool pool_process; + paddle::operators::math::Pool3dDirectCUDAFunctor< + paddle::operators::math::AvgPool, float> + pool3d_forward; + pool3d_forward(input, input_shape, output_shape, ksize, strides_, paddings, + true, adaptive_, output, stream, pool_process); + } + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h new file mode 100644 index 00000000000000..7c9a8625d70f3b --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/pool3d_op_plugin.h @@ -0,0 +1,244 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
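A quick usage sketch for the CalcOutputSize() helper defined just below (the numbers are chosen for illustration and are not taken from the patch): a 3x3x3 window with stride 2 and padding 1 over a 16x16x16 volume, with ceil_mode and adaptive both false, gives (16 - 3 + 2 * 1) / 2 + 1 = 8 along every axis.

// Expected result: {8, 8, 8}
std::vector<int> out = CalcOutputSize({16, 16, 16}, /*ceil_mode=*/false,
                                      /*adaptive=*/false, /*ksize=*/{3, 3, 3},
                                      /*strides=*/{2, 2, 2},
                                      /*paddings=*/{1, 1, 1});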
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +static std::vector CalcOutputSize(const std::vector& input_shape, + const bool& ceil_mode, + const bool& adaptive, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings) { + std::vector output_shape = input_shape; + if (adaptive) { + output_shape[0] = ksize[0]; + output_shape[1] = ksize[1]; + output_shape[2] = ksize[2]; + } else { + int output_d = + (input_shape[0] - ksize[0] + 2 * paddings[0]) / strides[0] + 1; + int output_h = + (input_shape[1] - ksize[1] + 2 * paddings[1]) / strides[1] + 1; + int output_w = + (input_shape[2] - ksize[2] + 2 * paddings[2]) / strides[2] + 1; + if (ceil_mode) { + output_d = + (input_shape[0] - ksize[0] + 2 * paddings[0] + strides[0] - 1) / + strides[0] + + 1; + output_h = + (input_shape[1] - ksize[1] + 2 * paddings[1] + strides[1] - 1) / + strides[1] + + 1; + output_w = + (input_shape[2] - ksize[2] + 2 * paddings[2] + strides[2] - 1) / + strides[2] + + 1; + } + output_shape[0] = output_d; + output_shape[1] = output_h; + output_shape[2] = output_w; + } + return output_shape; +} + +class Pool3DPlugin : public PluginTensorRTV2Ext { + public: + size_t getSerializationSize() const TRT_NOEXCEPT override; + // TRT will call this func when we need to serialize the configuration of + // tensorrt. + void serialize(void* buffer) const TRT_NOEXCEPT override; + + enum class Pool3DType { + max = 0, + avg, + }; + Pool3DPlugin() {} + Pool3DPlugin(bool ceil_mode, Pool3DType pool3d_type, bool adaptive, + std::vector ksize, std::vector strides, + std::vector paddings, std::vector input_shape) + : ceil_mode_(ceil_mode), + pool3d_type_(pool3d_type), + adaptive_(adaptive), + ksize_(ksize), + strides_(strides), + paddings_(paddings), + input_shape_(input_shape) { + output_shape_ = input_shape_; + std::vector output_shape = + CalcOutputSize({input_shape_[1], input_shape_[2], input_shape_[3]}, + ceil_mode_, adaptive_, ksize_, strides_, paddings_); + output_shape_[1] = output_shape[0]; + output_shape_[2] = output_shape[1]; + output_shape_[3] = output_shape[2]; + } + + // It was used for tensorrt deserialization. + // It should not be called by users. 
+ Pool3DPlugin(void const* serialData, size_t serialLength) { + deserializeBase(serialData, serialLength); + DeserializeValue(&serialData, &serialLength, &ceil_mode_); + DeserializeValue(&serialData, &serialLength, &pool3d_type_); + DeserializeValue(&serialData, &serialLength, &adaptive_); + DeserializeValue(&serialData, &serialLength, &ksize_); + DeserializeValue(&serialData, &serialLength, &strides_); + DeserializeValue(&serialData, &serialLength, &paddings_); + DeserializeValue(&serialData, &serialLength, &input_shape_); + DeserializeValue(&serialData, &serialLength, &output_shape_); + } + + Pool3DPlugin* clone() const TRT_NOEXCEPT override; + + const char* getPluginType() const TRT_NOEXCEPT override; + + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT override; + + int getNbOutputs() const TRT_NOEXCEPT override; + + nvinfer1::Dims getOutputDimensions(int index, const nvinfer1::Dims* inputs, + int nbInputDims) TRT_NOEXCEPT override; + + int initialize() TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override; + +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batchSize, const void* const* inputs, void** outputs, +#else + int enqueue(int batchSize, const void* const* inputs, void* const* outputs, +#endif + void* workspace, cudaStream_t stream) TRT_NOEXCEPT override; + + private: + bool ceil_mode_; + Pool3DType pool3d_type_; + bool adaptive_; + std::vector ksize_; + std::vector strides_; + std::vector paddings_; + std::vector input_shape_; + std::vector output_shape_; +}; + +class Pool3DPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "pool3d_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new Pool3DPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(Pool3DPluginCreator); + +class Pool3DPluginDynamic : public DynamicPluginTensorRT { + public: + Pool3DPluginDynamic() {} + Pool3DPluginDynamic(const bool& ceil_mode, const std::string& pool3d_type, + const bool& adaptive, const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, const bool& is_global) + : ceil_mode_(ceil_mode), + pool3d_type_(pool3d_type), + adaptive_(adaptive), + ksize_(ksize), + strides_(strides), + paddings_(paddings), + is_global_(is_global) {} + + Pool3DPluginDynamic(void const* serialData, size_t serialLength); + ~Pool3DPluginDynamic() {} + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT override; + const char* getPluginType() const TRT_NOEXCEPT override; + int getNbOutputs() const TRT_NOEXCEPT override; + int initialize() TRT_NOEXCEPT override; + size_t getSerializationSize() const TRT_NOEXCEPT override; + void serialize(void* buffer) const TRT_NOEXCEPT override; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT override; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, + int nbOutputs) TRT_NOEXCEPT override; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) TRT_NOEXCEPT override; + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + 
int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const TRT_NOEXCEPT override; + + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) TRT_NOEXCEPT override; + nvinfer1::DataType getOutputDataType( + int index, const nvinfer1::DataType* inputTypes, + int nbInputs) const TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT override { delete this; } + + private: + bool ceil_mode_; + std::string pool3d_type_; + bool adaptive_; + std::vector ksize_; + std::vector strides_; + std::vector paddings_; + bool is_global_; +}; + +class Pool3DPluginDynamicCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "pool3d_plugin_dynamic"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, const void* serial_data, + size_t serial_length) TRT_NOEXCEPT override { + return new Pool3DPluginDynamic(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(Pool3DPluginDynamicCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu index cbd6e3a2e4ffe5..2b6541c5515cec 100644 --- a/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.cu @@ -65,6 +65,7 @@ SlicePlugin::SlicePlugin(void const *serial_data, size_t serial_length) { DeserializeValue(&serial_data, &serial_length, &starts_); DeserializeValue(&serial_data, &serial_length, &ends_); DeserializeValue(&serial_data, &serial_length, &axes_); + DeserializeValue(&serial_data, &serial_length, &with_fp16_); cudaEventCreate(©_event_); cudaStreamCreate(©_stream_); } @@ -187,17 +188,17 @@ int SlicePlugin::enqueue(int batch_size, const void *const *inputs, } size_t SlicePlugin::getSerializationSize() const TRT_NOEXCEPT { - return getBaseSerializationSize() + SerializedSize(getPluginType()) + - SerializedSize(starts_) + SerializedSize(ends_) + - SerializedSize(axes_); + return getBaseSerializationSize() + SerializedSize(starts_) + + SerializedSize(ends_) + SerializedSize(axes_) + + SerializedSize(with_fp16_); } void SlicePlugin::serialize(void *buffer) const TRT_NOEXCEPT { - SerializeValue(&buffer, getPluginType()); serializeBase(buffer); SerializeValue(&buffer, starts_); SerializeValue(&buffer, ends_); SerializeValue(&buffer, axes_); + SerializeValue(&buffer, with_fp16_); } // Dynamic Plugin below. diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu index ee1709f57e2598..57177cfa8b421e 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.cu @@ -12,8 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
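The slice-plugin change just above closes a serialization gap: with_fp16_ is now counted in getSerializationSize(), written by serialize() and read back by the deserialization constructor in the same order, and the redundant plugin-type string is dropped from all three. The yolo_box changes that follow add iou-aware scoring, where the objectness confidence is fused with a predicted IoU as conf' = conf^(1 - iou_aware_factor) * iou^iou_aware_factor. The snippet below is only a numeric illustration of that formula with made-up, already-sigmoided sample values; it is not taken from the patch.

// Illustration only: iou-aware confidence fusion.
#include <cmath>
#include <cstdio>

int main() {
  float conf = 0.9f, iou = 0.6f, factor = 0.5f;  // sample values
  float fused = std::pow(conf, 1.0f - factor) * std::pow(iou, factor);
  std::printf("%.4f\n", fused);  // ~0.7348: a confidently classified but poorly localized box is down-weighted
  return 0;
}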
-#include -#include #include #include @@ -29,7 +27,8 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, const std::vector& anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, - const float scale_x_y, const int input_h, + const float scale_x_y, const bool iou_aware, + const float iou_aware_factor, const int input_h, const int input_w) : data_type_(data_type), class_num_(class_num), @@ -37,6 +36,8 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, downsample_ratio_(downsample_ratio), clip_bbox_(clip_bbox), scale_x_y_(scale_x_y), + iou_aware_(iou_aware), + iou_aware_factor_(iou_aware_factor), input_h_(input_h), input_w_(input_w) { anchors_.insert(anchors_.end(), anchors.cbegin(), anchors.cend()); @@ -45,6 +46,7 @@ YoloBoxPlugin::YoloBoxPlugin(const nvinfer1::DataType data_type, assert(class_num_ > 0); assert(input_h_ > 0); assert(input_w_ > 0); + assert((iou_aware_factor_ > 0 && iou_aware_factor_ < 1)); cudaMalloc(&anchors_device_, anchors.size() * sizeof(int)); cudaMemcpy(anchors_device_, anchors.data(), anchors.size() * sizeof(int), @@ -59,6 +61,8 @@ YoloBoxPlugin::YoloBoxPlugin(const void* data, size_t length) { DeserializeValue(&data, &length, &downsample_ratio_); DeserializeValue(&data, &length, &clip_bbox_); DeserializeValue(&data, &length, &scale_x_y_); + DeserializeValue(&data, &length, &iou_aware_); + DeserializeValue(&data, &length, &iou_aware_factor_); DeserializeValue(&data, &length, &input_h_); DeserializeValue(&data, &length, &input_w_); } @@ -119,10 +123,10 @@ __device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, int img_height, int img_width, float scale, float bias) { box[0] = static_cast( - (i + sigmoid(static_cast(x[index]) * scale + bias)) * img_width / + (i + sigmoid(static_cast(x[index])) * scale + bias) * img_width / grid_size_w); box[1] = static_cast( - (j + sigmoid(static_cast(x[index + stride]) * scale + bias)) * + (j + sigmoid(static_cast(x[index + stride])) * scale + bias) * img_height / grid_size_h); box[2] = static_cast(expf(static_cast(x[index + 2 * stride])) * anchors[2 * an_idx] * img_width / input_size_w); @@ -133,8 +137,19 @@ __device__ inline void GetYoloBox(float* box, const T* x, const int* anchors, __device__ inline int GetEntryIndex(int batch, int an_idx, int hw_idx, int an_num, int an_stride, int stride, - int entry) { - return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + int entry, bool iou_aware) { + if (iou_aware) { + return (batch * an_num + an_idx) * an_stride + + (batch * an_num + an_num + entry) * stride + hw_idx; + } else { + return (batch * an_num + an_idx) * an_stride + entry * stride + hw_idx; + } +} + +__device__ inline int GetIoUIndex(int batch, int an_idx, int hw_idx, int an_num, + int an_stride, int stride) { + return batch * an_num * an_stride + (batch * an_num + an_idx) * stride + + hw_idx; } template @@ -178,7 +193,8 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, const int w, const int an_num, const int class_num, const int box_num, int input_size_h, int input_size_w, bool clip_bbox, const float scale, - const float bias) { + const float bias, bool iou_aware, + const float iou_aware_factor) { int tid = blockIdx.x * blockDim.x + threadIdx.x; int stride = blockDim.x * gridDim.x; float box[4]; @@ -193,11 +209,16 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, int img_height = imgsize[2 * i]; int img_width = imgsize[2 * i + 1]; - int obj_idx = - 
GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4); + int obj_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 4, + iou_aware); float conf = sigmoid(static_cast(input[obj_idx])); - int box_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0); + if (iou_aware) { + int iou_idx = GetIoUIndex(i, j, k * w + l, an_num, an_stride, grid_num); + float iou = sigmoid(input[iou_idx]); + conf = powf(conf, 1. - iou_aware_factor) * powf(iou, iou_aware_factor); + } + int box_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 0, + iou_aware); if (conf < conf_thresh) { for (int i = 0; i < 4; ++i) { @@ -212,8 +233,8 @@ __global__ void KeYoloBoxFw(const T* const input, const int* const imgsize, box_idx = (i * box_num + j * grid_num + k * w + l) * 4; CalcDetectionBox(boxes, box, box_idx, img_height, img_width, clip_bbox); - int label_idx = - GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, 5); + int label_idx = GetEntryIndex(i, j, k * w + l, an_num, an_stride, grid_num, + 5, iou_aware); int score_idx = (i * box_num + j * grid_num + k * w + l) * class_num; CalcLabelScore(scores, input, label_idx, score_idx, class_num, conf, grid_num); @@ -240,7 +261,8 @@ int YoloBoxPlugin::enqueue_impl(int batch_size, const void* const* inputs, reinterpret_cast(inputs[1]), reinterpret_cast(outputs[0]), reinterpret_cast(outputs[1]), conf_thresh_, anchors_device_, n, h, w, an_num, class_num_, box_num, - input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias); + input_size_h, input_size_w, clip_bbox_, scale_x_y_, bias, iou_aware_, + iou_aware_factor_); return cudaGetLastError() != cudaSuccess; } @@ -274,6 +296,8 @@ size_t YoloBoxPlugin::getSerializationSize() const TRT_NOEXCEPT { serialize_size += SerializedSize(scale_x_y_); serialize_size += SerializedSize(input_h_); serialize_size += SerializedSize(input_w_); + serialize_size += SerializedSize(iou_aware_); + serialize_size += SerializedSize(iou_aware_factor_); return serialize_size; } @@ -285,6 +309,8 @@ void YoloBoxPlugin::serialize(void* buffer) const TRT_NOEXCEPT { SerializeValue(&buffer, downsample_ratio_); SerializeValue(&buffer, clip_bbox_); SerializeValue(&buffer, scale_x_y_); + SerializeValue(&buffer, iou_aware_); + SerializeValue(&buffer, iou_aware_factor_); SerializeValue(&buffer, input_h_); SerializeValue(&buffer, input_w_); } @@ -326,8 +352,8 @@ void YoloBoxPlugin::configurePlugin( nvinfer1::IPluginV2Ext* YoloBoxPlugin::clone() const TRT_NOEXCEPT { return new YoloBoxPlugin(data_type_, anchors_, class_num_, conf_thresh_, - downsample_ratio_, clip_bbox_, scale_x_y_, input_h_, - input_w_); + downsample_ratio_, clip_bbox_, scale_x_y_, + iou_aware_, iou_aware_factor_, input_h_, input_w_); } YoloBoxPluginCreator::YoloBoxPluginCreator() {} @@ -367,6 +393,8 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( float scale_x_y = 1.; int h = -1; int w = -1; + bool iou_aware = false; + float iou_aware_factor = 0.5; for (int i = 0; i < fc->nbFields; ++i) { const std::string field_name(fc->fields[i].name); @@ -386,6 +414,10 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( clip_bbox = *static_cast(fc->fields[i].data); } else if (field_name.compare("scale_x_y")) { scale_x_y = *static_cast(fc->fields[i].data); + } else if (field_name.compare("iou_aware")) { + iou_aware = *static_cast(fc->fields[i].data); + } else if (field_name.compare("iou_aware_factor")) { + iou_aware_factor = *static_cast(fc->fields[i].data); } else if (field_name.compare("h")) { h = *static_cast(fc->fields[i].data); 
} else if (field_name.compare("w")) { @@ -397,7 +429,8 @@ nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::createPlugin( return new YoloBoxPlugin( type_id ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT, anchors, - class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, h, w); + class_num, conf_thresh, downsample_ratio, clip_bbox, scale_x_y, iou_aware, + iou_aware_factor, h, w); } nvinfer1::IPluginV2Ext* YoloBoxPluginCreator::deserializePlugin( diff --git a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h index c9e9f9a0567aee..ae9a6739cedd34 100644 --- a/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/yolo_box_op_plugin.h @@ -31,6 +31,7 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { const std::vector& anchors, const int class_num, const float conf_thresh, const int downsample_ratio, const bool clip_bbox, const float scale_x_y, + const bool iou_aware, const float iou_aware_factor, const int input_h, const int input_w); YoloBoxPlugin(const void* data, size_t length); ~YoloBoxPlugin() override; @@ -89,6 +90,8 @@ class YoloBoxPlugin : public nvinfer1::IPluginV2Ext { float scale_x_y_; int input_h_; int input_w_; + bool iou_aware_; + float iou_aware_factor_; std::string namespace_; }; diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 11187a1c79fca3..6fd3944a6c5280 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -555,10 +555,6 @@ if(WITH_GPU AND TENSORRT_FOUND) if (NOT EXISTS ${TEST_SPLIT_CONVERTER_MODEL}/split_converter.tgz) inference_download_and_uncompress_without_verify(${TEST_SPLIT_CONVERTER_MODEL} ${INFERENCE_URL}/tensorrt_test "split_converter.tgz") endif() - set(TEST_INSTANCE_NORM_MODEL "${TRT_MODEL_INSTALL_DIR}/trt_instance_norm_test") - if (NOT EXISTS ${TEST_INSTANCE_NORM_MODEL}/instance_norm.tgz) - inference_download_and_uncompress_without_verify(${TEST_INSTANCE_NORM_MODEL} ${INFERENCE_URL}/tensorrt_test "instance_norm.tgz") - endif() inference_analysis_test(trt_mobilenet_test SRCS trt_mobilenet_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) @@ -577,9 +573,6 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_analysis_test(trt_split_converter_test SRCS trt_split_converter_test.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_SPLIT_CONVERTER_MODEL}/) - inference_analysis_test(trt_instance_norm_test SRCS trt_instance_norm_converter_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_INSTANCE_NORM_MODEL}/) inference_analysis_test(test_analyzer_capi_exp_gpu SRCS analyzer_capi_exp_gpu_tester.cc EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} paddle_inference_c ARGS --infer_model=${TRT_MODEL_INSTALL_DIR}/trt_inference_test_models) diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc index b8ccb8cee507b9..d33b11c389a095 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_pool1_fuse_statis_tester.cc @@ -36,10 +36,10 @@ TEST(Analyzer_seq_pool1_fuse_statis, fuse_statis) { ASSERT_TRUE(fuse_statis.count("repeated_fc_relu_fuse")); ASSERT_EQ(fuse_statis.at("fc_fuse"), 10); EXPECT_EQ(fuse_statis.at("seqpool_concat_fuse"), 2); - 
EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 2); + EXPECT_EQ(fuse_statis.at("squared_mat_sub_fuse"), 0); EXPECT_EQ(fuse_statis.at("repeated_fc_relu_fuse"), 2); LOG(INFO) << "num_ops: " << num_ops; - EXPECT_EQ(num_ops, 171); + EXPECT_EQ(num_ops, 185); } } // namespace seq_pool1_tester diff --git a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc index a7ff5af1bdc242..b74d1189b804be 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_LeViT.cc @@ -77,7 +77,7 @@ TEST(tensorrt_tester_LeViT, trt_fp32_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 6, paddle_infer::PrecisionType::kFloat32, false, false); + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, @@ -103,7 +103,7 @@ TEST(tensorrt_tester_LeViT, serial_diff_batch_trt_fp32) { config.SetModel(FLAGS_modeldir + "/inference.pdmodel", FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); - config.EnableTensorRtEngine(1 << 20, max_batch_size, 6, + config.EnableTensorRtEngine(1 << 20, max_batch_size, 50, paddle_infer::PrecisionType::kFloat32, false, false); paddle_infer::services::PredictorPool pred_pool(config, 1); @@ -145,7 +145,7 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 6, paddle_infer::PrecisionType::kFloat32, false, false); + 1 << 20, 2, 50, paddle_infer::PrecisionType::kFloat32, false, false); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); SingleThreadPrediction(pred_pool_no_ir.Retrive(0), &my_input_data_map, @@ -174,6 +174,6 @@ TEST(tensorrt_tester_LeViT, multi_thread4_trt_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc index 67c2eeb0be5f94..eb31acbdf7ca1d 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_det_mv3_db.cc @@ -35,44 +35,11 @@ paddle::test::Record PrepareInput(int batch_size, int image_shape = 640) { void PrepareDynamicShape(paddle_infer::Config* config, int max_batch_size = 4) { // set dynamic shape range std::map> min_input_shape = { - {"x", {1, 3, 50, 50}}, - {"conv2d_92.tmp_0", {1, 120, 20, 20}}, - {"conv2d_91.tmp_0", {1, 24, 10, 10}}, - {"conv2d_59.tmp_0", {1, 96, 20, 20}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 10, 10}}, - {"nearest_interp_v2_2.tmp_0", {1, 256, 20, 20}}, - {"conv2d_124.tmp_0", {1, 256, 20, 20}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 20, 20}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 20, 20}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 20, 20}}, - {"elementwise_add_7", {1, 56, 2, 2}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 2, 2}}}; + {"x", {1, 3, 50, 50}}}; std::map> max_input_shape = { - {"x", {max_batch_size, 3, 2000, 2000}}, - {"conv2d_92.tmp_0", {max_batch_size, 120, 400, 400}}, - {"conv2d_91.tmp_0", {max_batch_size, 24, 200, 200}}, - 
{"conv2d_59.tmp_0", {max_batch_size, 96, 400, 400}}, - {"nearest_interp_v2_1.tmp_0", {max_batch_size, 256, 200, 200}}, - {"nearest_interp_v2_2.tmp_0", {max_batch_size, 256, 400, 400}}, - {"conv2d_124.tmp_0", {max_batch_size, 256, 400, 400}}, - {"nearest_interp_v2_3.tmp_0", {max_batch_size, 64, 400, 400}}, - {"nearest_interp_v2_4.tmp_0", {max_batch_size, 64, 400, 400}}, - {"nearest_interp_v2_5.tmp_0", {max_batch_size, 64, 400, 400}}, - {"elementwise_add_7", {max_batch_size, 56, 400, 400}}, - {"nearest_interp_v2_0.tmp_0", {max_batch_size, 256, 400, 400}}}; + {"x", {max_batch_size, 3, 1600, 1600}}}; std::map> opt_input_shape = { - {"x", {1, 3, 640, 640}}, - {"conv2d_92.tmp_0", {1, 120, 160, 160}}, - {"conv2d_91.tmp_0", {1, 24, 80, 80}}, - {"conv2d_59.tmp_0", {1, 96, 160, 160}}, - {"nearest_interp_v2_1.tmp_0", {1, 256, 80, 80}}, - {"nearest_interp_v2_2.tmp_0", {1, 256, 160, 160}}, - {"conv2d_124.tmp_0", {1, 256, 160, 160}}, - {"nearest_interp_v2_3.tmp_0", {1, 64, 160, 160}}, - {"nearest_interp_v2_4.tmp_0", {1, 64, 160, 160}}, - {"nearest_interp_v2_5.tmp_0", {1, 64, 160, 160}}, - {"elementwise_add_7", {1, 56, 40, 40}}, - {"nearest_interp_v2_0.tmp_0", {1, 256, 40, 40}}}; + {"x", {1, 3, 640, 640}}}; config->SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, opt_input_shape); } @@ -123,7 +90,7 @@ TEST(tensorrt_tester_det_mv3_db, multi_thread2_trt_fp32_dynamic_shape_bz2) { FLAGS_modeldir + "/inference.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, true, false); + 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); PrepareDynamicShape(&config, 4); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); @@ -197,6 +164,6 @@ TEST(mkldnn_tester_det_mv3_db, multi_thread2_mkl_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc index 6ef894cc3d1d64..3fa41b201c680f 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_text_cls.cc @@ -132,6 +132,6 @@ TEST(mkldnn_tester_ernie_text_cls, multi_thread4_mkl_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc index 9e835511265528..4e924e31979659 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ernie_xnli_int8.cc @@ -186,7 +186,8 @@ TEST(tensorrt_tester_ernie_xnli, oss_varlen_truth_data_int8) { int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); + #if IS_TRT_VERSION_GE(7200) return RUN_ALL_TESTS(); #endif diff --git a/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc b/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc index 21991d0da06a17..eaa7bac89efcd0 100644 --- 
a/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_mobilnetv1.cc @@ -81,6 +81,6 @@ TEST(tensorrt_tester_mobilenetv1, tuned_dynamic_trt_fp32_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc index 2d69c933c2f81e..ff1647432a12d5 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc @@ -151,6 +151,6 @@ TEST(DISABLED_mkldnn_tester_ppyolo_mbv3, multi_thread4_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc index d74a333232473d..9689ec20956a17 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolov2_r50vd.cc @@ -150,6 +150,6 @@ TEST(mkldnn_tester_ppyolov2_r50vd, multi_thread2_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc index 6157fdbdb108a3..01bec2916e94ab 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50.cc @@ -236,6 +236,6 @@ TEST(DISABLED_tensorrt_tester_resnet50, profile_multi_thread_trt_fp32) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc index ed7ab7b5eee7bd..380954f9e527d9 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_resnet50_quant.cc @@ -165,6 +165,6 @@ TEST(DISABLED_tensorrt_tester_resnet50_quant, multi_thread_multi_instance) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc b/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc index 845bcbc5c5b5f8..69a9e8d6a900a3 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_yolov3.cc @@ -150,6 +150,6 @@ TEST(test_yolov3, multi_thread4_mkl_bz2) { int main(int argc, char** argv) { ::testing::InitGoogleTest(&argc, argv); - ::google::ParseCommandLineFlags(&argc, &argv, true); + ::GFLAGS_NAMESPACE::ParseCommandLineFlags(&argc, &argv, true); return RUN_ALL_TESTS(); } diff --git a/paddle/fluid/inference/utils/io_utils.cc b/paddle/fluid/inference/utils/io_utils.cc index 
3691285ba3a51c..87331e1978f95e 100644 --- a/paddle/fluid/inference/utils/io_utils.cc +++ b/paddle/fluid/inference/utils/io_utils.cc @@ -197,6 +197,9 @@ void SerializeShapeRangeInfo( void DeserializeShapeRangeInfo( const std::string &path, paddle::inference::proto::ShapeRangeInfos *info) { int fd = open(path.c_str(), O_RDONLY); + if (fd == -1) { + PADDLE_THROW(platform::errors::NotFound("File [%s] is not found.", path)); + } google::protobuf::io::FileInputStream *is = new google::protobuf::io::FileInputStream(fd); google::protobuf::TextFormat::Parse(is, info); diff --git a/paddle/fluid/inference/utils/io_utils_tester.cc b/paddle/fluid/inference/utils/io_utils_tester.cc index 766afed4e50144..ffd97232652fd9 100644 --- a/paddle/fluid/inference/utils/io_utils_tester.cc +++ b/paddle/fluid/inference/utils/io_utils_tester.cc @@ -118,4 +118,8 @@ TEST(shape_info_io, read_and_write) { std::vector names{"test1"}; paddle::inference::UpdateShapeRangeInfo(path, min_shape, max_shape, opt_shape, names); + + ASSERT_THROW(paddle::inference::DeserializeShapeRangeInfo( + "no_exists_file", &min_shape, &max_shape, &opt_shape); + , paddle::platform::EnforceNotMet); } diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 6b4afae9f8c752..4aa1900f53f5e3 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -82,7 +82,11 @@ endif() cc_library(aligned_allocator SRCS aligned_allocator.cc DEPS allocator) cc_test(test_aligned_allocator SRCS test_aligned_allocator.cc DEPS aligned_allocator) cc_library(allocator_strategy SRCS allocator_strategy.cc DEPS gflags ${AllocatorFacadeDeps}) -cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy ) +cc_library(allocator_facade SRCS allocator_facade.cc DEPS allocator_strategy) + +if (WITH_GPU) + target_link_libraries(allocator_facade cuda_graph) +endif() cc_test(retry_allocator_test SRCS retry_allocator_test.cc DEPS retry_allocator locked_allocator cpu_allocator) if (WITH_TESTING) diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index 1d89918bfebf6a..f0b7f1a4b0d9e7 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -20,6 +20,7 @@ namespace paddle { namespace memory { namespace allocation { +// For memory address alignment class AlignedAllocation : public Allocation { public: AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 78bce53b6f4ffb..281902f3a2b12a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -23,6 +23,7 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#include "paddle/fluid/memory/allocation/aligned_allocator.h" #include "paddle/fluid/memory/allocation/retry_allocator.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" @@ -32,6 +33,9 @@ #include "paddle/fluid/memory/allocation/thread_local_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_graph.h" +#endif #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu/xpu_info.h" #endif @@ -47,17 +51,64 @@ PADDLE_DEFINE_EXPORTED_bool( "Whether to use system allocator 
to allocate CPU and GPU memory. " "Only used for unittests."); +DECLARE_string(allocator_strategy); + namespace paddle { namespace memory { namespace allocation { +#ifdef PADDLE_WITH_CUDA +class CUDAGraphAllocator + : public Allocator, + public std::enable_shared_from_this { + private: + class PrivateAllocation : public Allocation { + public: + PrivateAllocation(CUDAGraphAllocator* allocator, + AllocationPtr underlying_allocation) + : Allocation(underlying_allocation->ptr(), + underlying_allocation->size(), + underlying_allocation->place()), + allocator_(allocator->shared_from_this()), + underlying_allocation_(std::move(underlying_allocation)) {} + + private: + std::shared_ptr allocator_; + AllocationPtr underlying_allocation_; + }; + + explicit CUDAGraphAllocator(const std::shared_ptr& allocator) + : underlying_allocator_(allocator) {} + + public: + static std::shared_ptr Create( + const std::shared_ptr& allocator) { + return std::shared_ptr(new CUDAGraphAllocator(allocator)); + } + + protected: + Allocation* AllocateImpl(size_t size) { + VLOG(10) << "Allocate " << size << " for CUDA Graph"; + return new PrivateAllocation(this, underlying_allocator_->Allocate(size)); + } + + void FreeImpl(Allocation* allocation) { + VLOG(10) << "delete for CUDA Graph"; + delete allocation; + } + + private: + std::shared_ptr underlying_allocator_; +}; +#endif + class AllocatorFacadePrivate { public: using AllocatorMap = std::map>; - AllocatorFacadePrivate() { - auto strategy = GetAllocatorStrategy(); - switch (strategy) { + explicit AllocatorFacadePrivate(bool allow_free_idle_chunk = true) { + strategy_ = GetAllocatorStrategy(); + switch (strategy_) { case AllocatorStrategy::kNaiveBestFit: { InitNaiveBestFitCPUAllocator(); #ifdef PADDLE_WITH_XPU @@ -91,7 +142,8 @@ class AllocatorFacadePrivate { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); ++dev_id) { - InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id)); + InitAutoGrowthCUDAAllocator(platform::CUDAPlace(dev_id), + allow_free_idle_chunk); } InitNaiveBestFitCUDAPinnedAllocator(); #endif @@ -117,7 +169,7 @@ class AllocatorFacadePrivate { default: { PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported allocator strategy: %d", static_cast(strategy))); + "Unsupported allocator strategy: %d", static_cast(strategy_))); } } InitZeroSizeAllocators(); @@ -130,11 +182,31 @@ class AllocatorFacadePrivate { CheckAllocThreadSafe(); } + inline const AllocatorMap& GetAllocatorMap() { +#ifdef PADDLE_WITH_CUDA + if (UNLIKELY(platform::CUDAGraph::IsCapturing())) { + auto id = platform::CUDAGraph::CapturingID(); + auto iter = cuda_graph_allocator_map_.find(id); + PADDLE_ENFORCE_NE( + iter, cuda_graph_allocator_map_.end(), + platform::errors::PermissionDenied( + "No memory pool is prepared for CUDA Graph capturing.")); + return iter->second->allocators_; + } else { + return allocators_; + } +#else + return allocators_; +#endif + } + inline const std::shared_ptr& GetAllocator( const platform::Place& place, size_t size) { + VLOG(4) << "GetAllocator" + << " " << place << " " << size; const auto& allocators = (size > 0 ? (UNLIKELY(FLAGS_use_system_allocator) ? 
system_allocators_ - : allocators_) + : GetAllocatorMap()) : zero_size_allocators_); auto iter = allocators.find(place); PADDLE_ENFORCE_NE(iter, allocators.end(), @@ -145,6 +217,7 @@ class AllocatorFacadePrivate { private: void InitSystemAllocators() { + if (!system_allocators_.empty()) return; system_allocators_[platform::CPUPlace()] = std::make_shared(); #ifdef PADDLE_WITH_XPU int device_count = platform::GetXPUDeviceCount(); @@ -183,10 +256,42 @@ class AllocatorFacadePrivate { allocators_[p] = std::make_shared(p); } - void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p) { + void InitAutoGrowthCUDAAllocator(platform::CUDAPlace p, + bool allow_free_idle_chunk) { auto cuda_allocator = std::make_shared(p); + auto alignment = platform::GpuMinChunkSize(); + bool need_addr_align = true; + // NOTE: sometimes, since cuda runtime can not be forked, calling any cuda + // API in that case may got cuda error(3), i.e., + // cudaErrorInitializationError. And, the CUDAAllocator is only initialized + // but not really used. + // Here, the try-catch block is added to handle the case that + // GetDeviceProperties() may failed in the multiple process(for example, in + // dataloader with num_worker > 0) + try { + const auto& prop = platform::GetDeviceProperties(p.GetDeviceId()); + need_addr_align = prop.textureAlignment < alignment; + VLOG(4) << "GetDeviceProperties ok, textureAlignment: " + << prop.textureAlignment + << ", set need_addr_align=" << need_addr_align; + } catch (...) { + need_addr_align = true; + VLOG(4) << "GetDeviceProperties failed, set need_addr_align=true"; + } + // The address returned is aligned already, + // ref: + // https://stackoverflow.com/questions/14082964/cuda-alignment-256bytes-seriously/14083295#14083295 + std::shared_ptr underlying_allocator{nullptr}; + if (need_addr_align) { + VLOG(10) << "use AlignedAllocator with alignment: " << alignment; + underlying_allocator = + std::make_shared(underlying_allocator, alignment); + } else { + VLOG(10) << "not use AlignedAllocator with alignment: " << alignment; + underlying_allocator = cuda_allocator; + } allocators_[p] = std::make_shared( - cuda_allocator, platform::GpuMinChunkSize()); + underlying_allocator, alignment, 0, allow_free_idle_chunk); } #endif @@ -226,6 +331,7 @@ class AllocatorFacadePrivate { }; void InitZeroSizeAllocators() { + if (!zero_size_allocators_.empty()) return; std::vector places; places.emplace_back(platform::CPUPlace()); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) @@ -279,12 +385,57 @@ class AllocatorFacadePrivate { } } +#ifdef PADDLE_WITH_CUDA + + public: + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + PADDLE_ENFORCE_EQ(strategy_, AllocatorStrategy::kAutoGrowth, + platform::errors::InvalidArgument( + "CUDA Graph is only supported when the " + "FLAGS_allocator_strategy=\"auto_growth\", but got " + "FLAGS_allocator_strategy=\"%s\"", + FLAGS_allocator_strategy)); + auto& allocator = cuda_graph_allocator_map_[id]; + PADDLE_ENFORCE_EQ( + allocator.get(), nullptr, + platform::errors::InvalidArgument( + "The memory pool of the CUDA Graph with ID %d have been prepared.", + id)); + allocator.reset( + new AllocatorFacadePrivate(/*allow_free_idle_chunk=*/false)); + for (auto& item : allocator->allocators_) { + auto& old_allocator = item.second; + old_allocator = CUDAGraphAllocator::Create(old_allocator); + } + VLOG(10) << "Prepare memory pool for CUDA Graph with ID " << id; + } + + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + auto iter = cuda_graph_allocator_map_.find(id); + 
PADDLE_ENFORCE_NE(iter, cuda_graph_allocator_map_.end(), + platform::errors::InvalidArgument( + "Cannot find CUDA Graph with ID = %d", id)); + cuda_graph_allocator_map_.erase(iter); + VLOG(10) << "Remove memory pool of CUDA Graph with ID " << id; + } +#endif + private: AllocatorMap allocators_; - AllocatorMap zero_size_allocators_; - AllocatorMap system_allocators_; +#ifdef PADDLE_WITH_CUDA + std::unordered_map> + cuda_graph_allocator_map_; +#endif + AllocatorStrategy strategy_; + + static AllocatorMap zero_size_allocators_; + static AllocatorMap system_allocators_; }; +AllocatorFacadePrivate::AllocatorMap + AllocatorFacadePrivate::zero_size_allocators_; +AllocatorFacadePrivate::AllocatorMap AllocatorFacadePrivate::system_allocators_; + // Pimpl. Make interface clean. AllocatorFacade::AllocatorFacade() : m_(new AllocatorFacadePrivate()) {} // delete m_ may cause core dump when the destructor of python in conflict with @@ -316,6 +467,16 @@ const std::shared_ptr& AllocatorFacade::GetAllocator( return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1); } +#ifdef PADDLE_WITH_CUDA +void AllocatorFacade::PrepareMemoryPoolForCUDAGraph(CUDAGraphID id) { + return m_->PrepareMemoryPoolForCUDAGraph(id); +} + +void AllocatorFacade::RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id) { + return m_->RemoveMemoryPoolOfCUDAGraph(id); +} +#endif + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 7f6ad561aa931b..8d889ec38eed7e 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -18,6 +18,9 @@ #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/memory/allocation/npu_pinned_allocator.h" #endif +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/gpu_info.h" +#endif #include "paddle/fluid/platform/place.h" namespace paddle { @@ -54,6 +57,11 @@ class AllocatorFacade { uint64_t Release(const platform::Place& place); const std::shared_ptr& GetAllocator(const platform::Place& place); +#ifdef PADDLE_WITH_CUDA + void PrepareMemoryPoolForCUDAGraph(CUDAGraphID id); + void RemoveMemoryPoolOfCUDAGraph(CUDAGraphID id); +#endif + // TODO(yy): Allocate a Copy-On-Write allocation? 
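The new PrepareMemoryPoolForCUDAGraph/RemoveMemoryPoolOfCUDAGraph pair gives each capture ID its own auto_growth pool with idle-chunk freeing disabled, and GetAllocatorMap() routes allocations to that pool whenever platform::CUDAGraph::IsCapturing() reports an active capture; the enforce above also means this only works with FLAGS_allocator_strategy="auto_growth". Below is a rough usage sketch, not part of this patch: it assumes the usual AllocatorFacade::Instance() singleton accessor (not shown in this diff) and leaves the capture begin/end abstract, since platform::CUDAGraph's interface is likewise outside this hunk.

#include "paddle/fluid/memory/allocation/allocator_facade.h"

void CaptureWithPrivatePool(CUDAGraphID id) {  // CUDAGraphID from the gpu_info.h include added above
  auto& facade = paddle::memory::allocation::AllocatorFacade::Instance();
  // Dedicated auto_growth pool for this capture; keeps captured addresses valid
  // by never freeing idle chunks while the graph can still be replayed.
  facade.PrepareMemoryPoolForCUDAGraph(id);
  // ... begin capture through platform::CUDAGraph so that IsCapturing() and
  //     CapturingID() report `id`; allocations made meanwhile come from the pool ...
  // ... end capture, instantiate and replay the resulting graph as needed ...
  // Drop the pool only once the captured graph will no longer be replayed:
  facade.RemoveMemoryPoolOfCUDAGraph(id);
}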
private: AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc index a35d8a73f7edae..9f34f5198a1796 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc @@ -39,14 +39,15 @@ namespace allocation { AutoGrowthBestFitAllocator::AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, - size_t chunk_size) - : underlying_allocator_( - std::make_shared(underlying_allocator, alignment)), + size_t chunk_size, bool allow_free_idle_chunk) + : underlying_allocator_(underlying_allocator), alignment_(alignment), - chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)) {} + chunk_size_(std::max(AlignedSize(chunk_size, alignment), alignment)), + allow_free_idle_chunk_(allow_free_idle_chunk) {} -Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { - size = AlignedSize(size, alignment_); +Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t unaligned_size) { + size_t size = AlignedSize(unaligned_size, alignment_); + VLOG(10) << "Allocate " << unaligned_size << " bytes, aligned to " << size; std::lock_guard guard(spinlock_); auto iter = free_blocks_.lower_bound(std::make_pair(size, nullptr)); @@ -56,6 +57,8 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { free_blocks_.erase(iter); auto *chunk = block_it->chunk_; size_t remaining_size = block_it->size_ - size; + VLOG(10) << "Allocate " << size << " bytes from chunk size " + << block_it->size_ << ", remaining " << remaining_size; if (remaining_size == 0) { block_it->is_free_ = false; } else { @@ -94,13 +97,14 @@ Allocation *AutoGrowthBestFitAllocator::AllocateImpl(size_t size) { } blocks.emplace_back(p + remaining_size, size, false, chunk); block_it = --(blocks.end()); - VLOG(2) << "Not found and reallocate " << realloc_size << ", and remaining " - << remaining_size; + VLOG(2) << "Not found and reallocate " << realloc_size << "(" + << static_cast(p) << "), and remaining " << remaining_size; } return new BlockAllocation(block_it); } void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { + VLOG(10) << "Free " << allocation->size() << " bytes"; std::lock_guard guard(spinlock_); auto block_it = static_cast(allocation)->block_it_; auto &blocks = block_it->chunk_->blocks_; @@ -139,6 +143,9 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) { } uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() { + if (!allow_free_idle_chunk_) { + return 0; + } uint64_t bytes = 0; for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) { auto &blocks = chunk_it->blocks_; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index 5ed6eb94f158fe..d1fa6cce0164f6 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -31,7 +31,7 @@ class AutoGrowthBestFitAllocator : public Allocator { public: AutoGrowthBestFitAllocator( const std::shared_ptr &underlying_allocator, size_t alignment, - size_t chunk_size = 0); + size_t chunk_size = 0, bool allow_free_idle_chunk = true); bool IsAllocThreadSafe() const override { return true; } @@ -86,6 +86,7 @@ class AutoGrowthBestFitAllocator : public Allocator { std::list chunks_; size_t alignment_; size_t chunk_size_; + bool 
allow_free_idle_chunk_; SpinLock spinlock_; }; diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 6f2591c8b15c8e..926af8292d2e86 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -12,10 +12,11 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" - #include +#include "paddle/fluid/memory/allocation/aligned_allocator.h" +#include "paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h" + #include "gtest/gtest.h" DECLARE_bool(free_idle_chunk); @@ -50,10 +51,13 @@ static void TestFreeIdleChunk(bool free_idle_chunk, FLAGS_free_idle_chunk = free_idle_chunk; FLAGS_free_when_no_cache_hit = free_when_no_cache_hit; auto recorded_allocator = std::make_shared(); + size_t alignment = 4096; size_t memory_size = 8192; + auto underlying_allocator = + std::make_shared(recorded_allocator, alignment); auto ag_allocator = std::make_shared( - recorded_allocator, alignment); + underlying_allocator, alignment); for (size_t i = 0; i < 10; ++i) { auto allocation = ag_allocator->Allocate(memory_size); @@ -131,8 +135,10 @@ static void TestFreeWhenNoCacheHit(bool free_when_no_cache_hit) { auto underlying_allocator = std::make_shared(memory_capacity); + auto aligned_allocator = + std::make_shared(underlying_allocator, alignment); auto ag_allocator = std::make_shared( - underlying_allocator, alignment); + aligned_allocator, alignment); ag_allocator->Allocate(allocate_size[0]); ASSERT_EQ(underlying_allocator->AllocatedSize(), diff --git a/paddle/fluid/memory/allocation/spin_lock.h b/paddle/fluid/memory/allocation/spin_lock.h index 42462fd74b4cd7..2bbe340e7c6912 100644 --- a/paddle/fluid/memory/allocation/spin_lock.h +++ b/paddle/fluid/memory/allocation/spin_lock.h @@ -15,37 +15,48 @@ #pragma once #include -#if !defined(_WIN32) -#include -#else -#include -#endif // !_WIN32 +#if defined(_M_X64) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(__i386__) +#define __PADDLE_x86__ +#include +#endif +#include #include "paddle/fluid/platform/macros.h" namespace paddle { namespace memory { +static inline void CpuRelax() { +#if defined(__PADDLE_x86__) + _mm_pause(); +#endif +} class SpinLock { public: SpinLock() : mlock_(false) {} void lock() { - bool expect = false; - uint64_t spin_cnt = 0; - while (!mlock_.compare_exchange_weak(expect, true)) { - expect = false; - if ((++spin_cnt & 0xFF) == 0) { -#if defined(_WIN32) - SleepEx(50, FALSE); -#else - sched_yield(); -#endif + for (;;) { + if (!mlock_.exchange(true, std::memory_order_acquire)) { + break; + } + constexpr int kMaxLoop = 32; + for (int loop = 1; mlock_.load(std::memory_order_relaxed);) { + if (loop <= kMaxLoop) { + for (int i = 1; i <= loop; ++i) { + CpuRelax(); + } + loop *= 2; + } else { + std::this_thread::yield(); + } } } } - void unlock() { mlock_.store(false); } + void unlock() { mlock_.store(false, std::memory_order_release); } + DISABLE_COPY_AND_ASSIGN(SpinLock); private: diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 0d7d0a5e13bf3d..dcf492dc6da371 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -17,6 +17,7 @@ add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) 
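A quick note on the SpinLock rewrite shown a little earlier, before the operator build changes continue: lock() now does an exchange with acquire ordering, spins on relaxed re-reads with a _mm_pause()-based exponential backoff on x86, and falls back to std::this_thread::yield() once the loop budget is exhausted, while unlock() publishes with release ordering. Because lock()/unlock() make the class BasicLockable, it composes with std::lock_guard. The snippet below is only an illustrative sketch of that usage; the counter and thread counts are made up.

#include <thread>
#include <vector>
#include "paddle/fluid/memory/allocation/spin_lock.h"

static paddle::memory::SpinLock spin;
static int counter = 0;

void Bump(int n) {
  for (int i = 0; i < n; ++i) {
    std::lock_guard<paddle::memory::SpinLock> guard(spin);  // acquire on lock, release on unlock
    ++counter;
  }
}

int main() {
  std::vector<std::thread> threads;
  for (int i = 0; i < 4; ++i) threads.emplace_back(Bump, 10000);
  for (auto& t : threads) t.join();
  return counter == 40000 ? 0 : 1;  // exits 0 when the increments did not race
}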
add_subdirectory(sequence_ops) +add_subdirectory(string) add_subdirectory(jit) if(WITH_MKLDNN) add_subdirectory(mkldnn) @@ -78,10 +79,12 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() -register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op lstm_op run_program_op eye_op recurrent_op - sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) +register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op + recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op spectral_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) op_library(run_program_op SRCS run_program_op.cc run_program_op.cu.cc DEPS executor_cache ${OP_HEADER_DEPS}) +op_library(save_combine_op DEPS string_array) +op_library(load_combine_op DEPS string_array) if (WITH_GPU OR WITH_ROCM) if(WITH_ROCM) @@ -94,14 +97,33 @@ if (WITH_GPU OR WITH_ROCM) endif() op_library(sync_batch_norm_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sync_batch_norm);\n") + if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.2) ) + op_library(sparse_attention_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(sparse_attention);\n") + endif() else() op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) endif() -if (WITH_GPU AND (NOT WITH_ROCM)) - op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS}) +if (WITH_GPU OR WITH_ROCM) + if (MKL_FOUND AND WITH_ONEMKL) + op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda dynload_mklrt ${OP_HEADER_DEPS}) + target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE}) + else() + op_library(spectral_op SRCS spectral_op.cc spectral_op.cu DEPS dynload_cuda ${OP_HEADER_DEPS}) + endif() else() - op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS}) + if (MKL_FOUND AND WITH_ONEMKL) + op_library(spectral_op SRCS spectral_op.cc DEPS dynload_mklrt ${OP_HEADER_DEPS}) + target_include_directories(spectral_op PRIVATE ${MKL_INCLUDE}) + else() + op_library(spectral_op SRCS spectral_op.cc DEPS ${OP_HEADER_DEPS}) + endif() +endif() + +if (WITH_ASCEND_CL) + op_library(sync_batch_norm_op) + file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(sync_batch_norm);\n") endif() op_library(lstm_op DEPS ${OP_HEADER_DEPS} lstm_compute) diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 5a498e617a4ff4..5e5cd0ea1c504d 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -77,12 +77,12 @@ class ActivationGradOpMaker : public framework::SingleGradOpMaker { FLAGS_use_mkldnn || (op->HasAttr("use_mkldnn") && BOOST_GET_CONST(bool, op->GetAttr("use_mkldnn")))) { - op->SetInput("X", this->Input("X")); + op->SetInput("X", this->Input("X")); // x } if (static_cast(kDepValue) & static_cast(ActBwdOpFwdDeps::kDepOut)) { - op->SetInput("Out", this->Output("Out")); + op->SetInput("Out", this->Output("Out")); // out } } }; @@ -560,6 +560,28 @@ Applies the following element-wise computation on the input according to } }; +class CELUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input is a multi-dimensional Tensor. 
The data type is " + "float32 or float64."); + AddOutput("Out", + "The output is a multi-dimensional Tensor which has same " + "dimension and data type as the ``x``."); + AddAttr("alpha", "The alpha value of CELU").SetDefault(1.0f); + AddComment(R"DOC( +CELU Activation Operator. + +Applies the following element-wise computation on the input according to +https://arxiv.org/abs/1704.07483. + +$$out = \max(0, x) + \min(0, \alpha * (e^(x/\alpha) - 1))$$ + +)DOC"); + } +}; + class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -767,6 +789,10 @@ class ActivationOpDoubleGrad : public framework::OperatorWithKernel { ctx->ShareDim("Out", "DDOut"); ctx->ShareLoD("Out", "DDOut"); } + if (ctx->HasOutput("DOutNew")) { + ctx->ShareDim("Out", "DOutNew"); + ctx->ShareLoD("Out", "DOutNew"); + } } } @@ -804,6 +830,45 @@ class ActivationOpDoubleGrad2 : public framework::OperatorWithKernel { } }; +template +class ActivationOpTripleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + if (static_cast(kDepValue) & static_cast(kDepX)) { + if (ctx->HasOutput("DX")) { + ctx->ShareDim("X", "DX"); + ctx->ShareLoD("X", "DX"); + } + if (ctx->HasOutput("DDOut")) { + ctx->ShareDim("X", "DDOut"); + ctx->ShareLoD("X", "DDOut"); + } + } + if (static_cast(kDepValue) & static_cast(kDepOut)) { + if (ctx->HasOutput("D_DOut")) { + ctx->ShareDim("Out", "D_DOut"); + ctx->ShareLoD("Out", "D_DOut"); + } + if (ctx->HasOutput("D_OutNew")) { + ctx->ShareDim("Out", "D_OutNew"); + ctx->ShareLoD("Out", "D_OutNew"); + } + if (ctx->HasOutput("D_DDx")) { + ctx->ShareDim("DDX", "D_DDx"); + ctx->ShareLoD("DDX", "D_DDx"); + } + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return GetKernelType(ctx, *this, "DDX"); + } +}; + template class SigmoidDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { @@ -825,6 +890,36 @@ class SigmoidDoubleGradMaker } }; +template +class SigmoidTripleGradMaker + : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("sigmoid_triple_grad"); + // Out, DDX, DOut, D_DDOut, D_DOut_New // input + // D_OutNew, D_DOut, D_DDx // output + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->Input("DDX")); + // input3: dout + op->SetInput("DOut", this->Input("DOut")); + // input4: d_ddout + op->SetInput("D_DDOut", this->OutputGrad("DDOut")); + // input5: d_dout_new + op->SetInput("D_DOut_New", this->OutputGrad("DOutNew")); + op->SetAttrMap(this->Attrs()); + + // output: d_dOut, d_OutNew, d_ddx + op->SetOutput("D_OutNew", this->InputGrad("Out")); + op->SetOutput("D_DOut", this->InputGrad("DOut")); + op->SetOutput("D_DDx", this->InputGrad("DDX")); + } +}; + template class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { public: @@ -845,6 +940,34 @@ class TanhDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { } }; +template +class TanhTripleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("tanh_triple_grad"); + // Out, DDX, DOut, D_DDOut, D_DOut_New // input + // 
D_OutNew, D_DOut, D_DDx // output + // input1: Out + op->SetInput("Out", this->Input("Out")); + // input2: ddx + op->SetInput("DDX", this->Input("DDX")); + // input3: dout + op->SetInput("DOut", this->Input("DOut")); + // input4: d_ddout + op->SetInput("D_DDOut", this->OutputGrad("DDOut")); + // input5: d_dout_new + op->SetInput("D_DOut_New", this->OutputGrad("DOutNew")); + op->SetAttrMap(this->Attrs()); + + // output: d_dOut, d_OutNew, d_ddx + op->SetOutput("D_OutNew", this->InputGrad("Out")); + op->SetOutput("D_DOut", this->InputGrad("DOut")); + op->SetOutput("D_DDx", this->InputGrad("DDX")); + } +}; // ReluGrad: dx = dy if y >= 0 else 0 // ReluGradGrad: ddy = ddx if y >= 0 else 0 template @@ -909,6 +1032,29 @@ class ELUDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { } }; +// celu grad: dx=dy if y>0 else dy*(x/alpha).exp() +// celu gradgrad: ddx=ddy if y>0 else ddy*(x/alpha).exp()/alpha +template +class CELUDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { + public: + using ::paddle::framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("celu_grad_grad"); + + op->SetInput("X", this->Input("X")); + op->SetInput("DOut", this->Input(framework::GradVarName("Out"))); + // X@GRAD@GRAD: ddx + op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); + op->SetAttrMap(this->Attrs()); + + // Out@GRAD@GRAD: ddy + op->SetOutput("DX", this->InputGrad("X")); + op->SetOutput("DDOut", this->InputGrad(framework::GradVarName("Out"))); + } +}; + // sqrt Grad: dx = 0.5 * dy / y // sqrt GradGrad: ddy = 0.5 * ddx / y, dy = -1 * dx * ddx template @@ -995,10 +1141,12 @@ class LogDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker { }; DECLARE_INPLACE_OP_INFERER(ActivationGradOpInplaceInferer, - {framework::GradVarName("Out"), - framework::GradVarName("X")}); + {framework::GradVarName("Out"), // dout + framework::GradVarName("X")}); // dx DECLARE_INPLACE_OP_INFERER(ActivationDoubleGradOpInplaceInferer, {"DDX", "DDOut"}); +DECLARE_INPLACE_OP_INFERER(ActivationTripleGradOpInplaceInferer, + {"DDX", "D_DOut"}); template class PowGradOpMaker : public framework::SingleGradOpMaker { @@ -1121,13 +1269,21 @@ REGISTER_OPERATOR( REGISTER_OPERATOR(sigmoid_grad, ops::ActivationOpGrad, ops::ActivationGradOpInplaceInferer, ops::SigmoidDoubleGradMaker, - ops::SigmoidDoubleGradMaker) + ops::SigmoidDoubleGradMaker); // 3. Register Sigmoid DoubleGrad Operator REGISTER_OPERATOR( sigmoid_grad_grad, - ops::ActivationOpDoubleGrad::FwdDeps()>, - ops::ActivationDoubleGradOpInplaceInferer); + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer, + ops::SigmoidTripleGradMaker, + ops::SigmoidTripleGradMaker); + +// 4. 
Register Sigmoid TripleGrad Operator +REGISTER_OPERATOR(sigmoid_triple_grad, + ops::ActivationOpTripleGrad< + ops::SigmoidTripleGradFunctor::FwdDeps()>, + ops::ActivationTripleGradOpInplaceInferer); // Register Sigmoid/GradSigmoid Kernels REGISTER_ACTIVATION_CPU_KERNEL(sigmoid, Sigmoid, SigmoidFunctor, @@ -1143,6 +1299,16 @@ REGISTER_OP_CPU_KERNEL( ops::SigmoidDoubleGradKernel>); +// Register TripleGrad Kernel +REGISTER_OP_CPU_KERNEL( + sigmoid_triple_grad, + ops::SigmoidTripleGradKernel>, + ops::SigmoidTripleGradKernel>, + ops::SigmoidTripleGradKernel>); + /* ========================================================================== */ /* ========================== tanh register ============================= */ @@ -1161,7 +1327,14 @@ REGISTER_OPERATOR(tanh_grad, ops::ActivationOpGrad, REGISTER_OPERATOR( tanh_grad_grad, ops::ActivationOpDoubleGrad::FwdDeps()>, - ops::ActivationDoubleGradOpInplaceInferer); + ops::ActivationDoubleGradOpInplaceInferer, + ops::TanhTripleGradMaker, + ops::TanhTripleGradMaker); + +REGISTER_OPERATOR( + tanh_triple_grad, + ops::ActivationOpTripleGrad::FwdDeps()>, + ops::ActivationTripleGradOpInplaceInferer); REGISTER_ACTIVATION_CPU_KERNEL(tanh, Tanh, TanhFunctor, TanhGradFunctor); REGISTER_OP_CPU_KERNEL( @@ -1171,6 +1344,15 @@ REGISTER_OP_CPU_KERNEL( ops::TanhGradGradFunctor>, ops::TanhDoubleGradKernel>); +// Register TripleGrad Kernel +REGISTER_OP_CPU_KERNEL( + tanh_triple_grad, + ops::TanhTripeGradKernel>, + ops::TanhTripeGradKernel>, + ops::TanhTripeGradKernel>); /* ========================================================================== */ /* ========================== relu register ============================= */ @@ -1260,6 +1442,35 @@ REGISTER_OP_CPU_KERNEL( /* ========================================================================== */ +/* ======================== celu register ============================ + */ +REGISTER_OPERATOR( + celu, ops::ActivationOp, ops::CELUOpMaker, ops::ActivationOpInferVarType, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::framework::OpDesc>, + ops::ActivationGradOpMaker::FwdDeps(), + paddle::imperative::OpBase>, + ops::ActFwdInplaceInferer); +REGISTER_OPERATOR(celu_grad, ops::ActivationOpGrad, + ops::ActivationGradOpInplaceInferer, + ops::CELUDoubleGradMaker, + ops::CELUDoubleGradMaker); +REGISTER_OPERATOR( + celu_grad_grad, + ops::ActivationOpDoubleGrad::FwdDeps()>, + ops::ActivationDoubleGradOpInplaceInferer); + +REGISTER_ACTIVATION_CPU_KERNEL(celu, CELU, CELUFunctor, CELUGradFunctor); +REGISTER_OP_CPU_KERNEL( + celu_grad_grad, ops::CELUDoubleGradKernel>, + ops::CELUDoubleGradKernel>, + ops::CELUDoubleGradKernel>); + +/* ========================================================================== */ + /* =========================== sqrt register ============================= */ REGISTER_OPERATOR( sqrt, ops::ActivationOp, ops::SqrtOpMaker, ops::ActivationOpInferVarType, diff --git a/paddle/fluid/operators/activation_op.cu b/paddle/fluid/operators/activation_op.cu index 72f10bf19e733a..cde8e9a4507441 100644 --- a/paddle/fluid/operators/activation_op.cu +++ b/paddle/fluid/operators/activation_op.cu @@ -1202,6 +1202,59 @@ struct CudaELUGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CudaCELUFunctor : public BaseActivationFunctor { + using CT = typename details::MPTypeTrait::Type; + CT zero = static_cast(0.0f); + CT one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + 
+ // celu(x) = max(0, x) + min(0, alpha * (exp(x/alpha) - 1)) + __device__ __forceinline__ T operator()(const T& arg_x) const { + CT x = static_cast(arg_x); + CT temp = static_cast(alpha) * (exp(x / static_cast(alpha)) - one); + CT res = (x > zero ? x : zero) + (temp > zero ? zero : temp); + return static_cast(res); + } +}; + +template +struct CudaCELUGradFunctor : public BaseActivationFunctor { + using MPType = typename details::MPTypeTrait::Type; + MPType zero = static_cast(0.0f); + MPType one = static_cast(1.0f); + float alpha; + + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 + // dx = dout , if alpha < 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 + __device__ __forceinline__ T operator()(const T& arg_dout, + const T& arg_x) const { + MPType dout = static_cast(arg_dout); + MPType x = static_cast(arg_x); + MPType a = static_cast(alpha); + MPType temp_a_pos = static_cast(alpha > 0.0f); + MPType temp_a_neg = static_cast(alpha <= 0.0f); + MPType temp_x_pos = static_cast(x > zero); + MPType temp_x_neg = static_cast(x <= zero); + return static_cast( + dout * + (temp_a_pos * temp_x_pos + temp_a_pos * temp_x_neg * exp(x / a) + + temp_a_neg * temp_x_pos + exp(x / a) * temp_a_neg * temp_x_neg)); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template class ActivationCudaKernel : public framework::OpKernel { @@ -1341,6 +1394,19 @@ REGISTER_OP_CUDA_KERNEL( ops::ELUGradGradFunctor>); /* ========================================================================== */ +/* ======================== celu register ============================ */ +REGISTER_ACTIVATION_CUDA_KERNEL(celu, CELU, CudaCELUFunctor, + CudaCELUGradFunctor); + +REGISTER_OP_CUDA_KERNEL( + celu_grad_grad, ops::CELUDoubleGradKernel>, + ops::CELUDoubleGradKernel>, + ops::CELUDoubleGradKernel>); +/* ========================================================================== */ + /* =========================== relu register ============================ */ #ifdef PADDLE_WITH_HIP REGISTER_ACTIVATION_CUDA_KERNEL(relu, Relu, CudaReluFunctor, @@ -1398,6 +1464,15 @@ REGISTER_OP_CUDA_KERNEL( ops::SigmoidGradGradFunctor>, ops::SigmoidDoubleGradKernel>); + +REGISTER_OP_CUDA_KERNEL( + sigmoid_triple_grad, + ops::SigmoidTripleGradKernel>, + ops::SigmoidTripleGradKernel>, + ops::SigmoidTripleGradKernel>); /* ========================================================================== */ /* =========================== tanh register ============================ */ @@ -1412,6 +1487,15 @@ REGISTER_OP_CUDA_KERNEL( ops::TanhGradGradFunctor>, ops::TanhDoubleGradKernel>); + +REGISTER_OP_CUDA_KERNEL( + tanh_triple_grad, + ops::TanhTripeGradKernel>, + ops::TanhTripeGradKernel>, + ops::TanhTripeGradKernel>); /* ========================================================================== */ /* =========================== sqrt register ============================= */ diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index 57ea97f746246b..627522e1da06d9 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -24,12 +24,13 @@ limitations under the License. 
*/ #define _USE_MATH_DEFINES #endif +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" - #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -282,19 +283,77 @@ struct SigmoidGradGradFunctor : public BaseActivationFunctor { auto dout = framework::EigenVector::Flatten( GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidGradGrad")); auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SigmoidGradGrad")); dout_new.device(*d) = (static_cast(1) - static_cast(2) * out) * dout * ddx; } if (ddOut) { auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SigmoidGradGrad")); ddout.device(*d) = (static_cast(1) - out) * out * ddx; } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +/* + Out + DOut D_Dout + DDx -> SigmoidTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (1-2*Out)*DDx*D_Dout_new + D_DDx = (1-Out)*Out*D_DDout + (1-2*Out)*DOut*D_Dout_new + D_OutNew = (DDx-2*Out*DDx)*D_DDout - 2*DOut*DDx*D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct SigmoidTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + const framework::Tensor* d_DDOut, + const framework::Tensor* d_dOut_New, + framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, + framework::Tensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "SigmoidTripleGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "SigmoidTripleGrad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "SigmoidTripleGrad")); + auto d_ddOut = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "SigmoidTripleGrad")); + auto d_dOutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( + d_dOut_New, "Input", "D_DOut_New", "SigmoidTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = framework::EigenVector::Flatten(GET_DATA_SAFELY( + d_Out_New, "Output", "D_OutNew", "SigmoidTripleGrad")); + d_OutNew.device(*d) = (ddx - static_cast(2) * out * ddx) * d_ddOut - + static_cast(2) * dout * ddx * d_dOutNew; + } + if (d_d_Out) { + auto d_dOut = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "SigmoidTripleGrad")); + d_dOut.device(*d) = + (static_cast(1) - static_cast(2) * out) * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "SigmoidTripleGrad")); + d_ddx.device(*d) = + (static_cast(1) - out) * out * d_ddOut + + (static_cast(1) - static_cast(2) * out) * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; + // silu(x) = x / (1 + exp(-x)) template struct SiluFunctor : public BaseActivationFunctor { @@ -465,18 +524,73 @@ struct TanhGradGradFunctor : public BaseActivationFunctor { auto dout = framework::EigenVector::Flatten( 
GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhGradGrad")); auto dout_new = framework::EigenVector::Flatten( - GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "SquareGradGrad")); + GET_DATA_SAFELY(dOutNew, "Output", "DOutNew", "TanhGradGrad")); dout_new.device(*d) = static_cast(-1) * dout * static_cast(2) * out * ddx; } if (ddOut) { auto ddout = framework::EigenVector::Flatten( - GET_DATA_SAFELY(ddOut, "Output", "DDOut", "SquareGradGrad")); + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "TanhGradGrad")); ddout.device(*d) = (static_cast(1) - out * out) * ddx; } } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } }; +/* + Out + DOut D_Dout + DDx -> TanhTripleGrad -> D_DDx + D_DDout d_OutNew + D_Dout_new + + D_Dout = (-2) * Out * DDx * D_Dout_new + D_DDx = (1-Out^2)*D_DDout + (-2) * Out * DOut * D_Dout_new + D_OutNew = (-2) * Out * DDx * D_DDout + (-2) * DOut * DDx * D_Dout_new + + Out, DDX, DOut, D_DDOut, D_DOut_New // input + D_OutNew, D_DOut, D_DDx // output +*/ +template +struct TanhTripleGradFunctor : public BaseActivationFunctor { + template + void operator()(const Device& dev, const framework::Tensor* Out, + const framework::Tensor* ddX, const framework::Tensor* dOut, + const framework::Tensor* d_DDOut, + const framework::Tensor* d_dOut_New, + framework::Tensor* d_d_Out, framework::Tensor* d_Out_New, + framework::Tensor* d_DDx) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "TanhTripleGrad")); + auto out = framework::EigenVector::Flatten( + GET_DATA_SAFELY(Out, "Input", "Out", "TanhTripleGrad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Input", "DOut", "TanhTripleGrad")); + auto d_ddOut = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_DDOut, "Input", "D_DDOut", "TanhTripleGrad")); + auto d_dOutNew = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_dOut_New, "Input", "D_DOut_New", "TanhTripleGrad")); + + if (d_Out_New) { + auto d_OutNew = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_Out_New, "Output", "D_OutNew", "TanhTripleGrad")); + d_OutNew.device(*d) = (static_cast(-2) * out * ddx * d_ddOut) - + (static_cast(2) * dout * ddx * d_dOutNew); + } + if (d_d_Out) { + auto d_dOut = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_d_Out, "Output", "D_DOut", "TanhTripleGrad")); + d_dOut.device(*d) = static_cast(-2) * out * ddx * d_dOutNew; + } + if (d_DDx) { + auto d_ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(d_DDx, "Output", "D_DDx", "TanhTripleGrad")); + d_ddx.device(*d) = (static_cast(1) - (out * out)) * d_ddOut - + static_cast(2) * out * dout * d_dOutNew; + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } +}; // tanhshrink(x) = x - tanh(x) // where tanh(x) = (exp(x) - exp(-x)) / (exp(x) + exp(-x)) @@ -1330,6 +1444,51 @@ struct ELUGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CELUFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + + template + void operator()(Device d, X x, Out out) const { + out.device(d) = + (x < static_cast(0)) + .select(static_cast(alpha) * + ((x / static_cast(alpha)).exp() - static_cast(1)), + x); + } +}; + +template +struct CELUGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(Device d, X x, Out 
out, dOut dout, dX dx) const { + auto temp_a_pos = static_cast(alpha > 0); + auto temp_a_neg = static_cast(alpha <= 0); + auto temp_x_pos = (x > static_cast(0)).template cast(); + auto temp_x_neg = (x <= static_cast(0)).template cast(); + + // dx = dout, if alpha > 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha > 0 and x <= 0 + // dx = dout , if alpha < 0 and x > 0 + // dx = dout * (x/alpha).exp(), if alpha < 0 and x <=0 + dx.device(d) = + dout * temp_a_pos * temp_x_pos + + dout * (x / static_cast(alpha)).exp() * temp_a_pos * temp_x_neg + + dout * temp_a_neg * temp_x_pos + + dout * (x / static_cast(alpha)).exp() * temp_a_neg * temp_x_neg; + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5198 template struct PowFunctor : public BaseActivationFunctor { @@ -1716,6 +1875,45 @@ struct ELUGradGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +template +struct CELUGradGradFunctor : public BaseActivationFunctor { + float alpha; + typename BaseActivationFunctor::AttrPair GetAttrs() { + return {{"alpha", &alpha}}; + } + template + void operator()(const Device& dev, const framework::Tensor* X, + const framework::Tensor* ddX, framework::Tensor* ddOut, + const framework::Tensor* dOut, framework::Tensor* dX) const { + auto* d = dev.eigen_device(); + auto ddx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddX, "Input", "DDX", "CELUGradGrad")); + auto x = framework::EigenVector::Flatten( + GET_DATA_SAFELY(X, "Input", "X", "CELUGradGrad")); + + if (dX) { + auto dx = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dX, "Output", "DX", "CELUGradGrad")); + auto dout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(dOut, "Output", "DOut", "CELUGradGrad")); + dx.device(*d) = ddx * dout / static_cast(alpha) * + (x / static_cast(alpha)).exp() * + (x <= static_cast(0)).template cast(); + } + + if (ddOut) { + auto ddout = framework::EigenVector::Flatten( + GET_DATA_SAFELY(ddOut, "Output", "DDOut", "CELUGradGrad")); + ddout.device(*d) = ddx * + ((x > static_cast(0)).template cast() + + (x / static_cast(alpha)).exp() * + (x <= static_cast(0)).template cast()) + .template cast(); + } + } + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + template struct SqrtGradGradFunctor : public BaseActivationFunctor { template @@ -1856,7 +2054,6 @@ class SigmoidDoubleGradKernel framework::Tensor *dOutNew, *ddOut; Out = ddX = dOut = nullptr; dOutNew = ddOut = nullptr; - // extract ddx(input) and out(input) ddX = ctx.Input("DDX"); Out = ctx.Input("Out"); @@ -1868,20 +2065,15 @@ class SigmoidDoubleGradKernel Out, platform::errors::NotFound( "Cannot get input Variable Out, variable name = %s", ctx.InputName("Out"))); - // set output ddout ddOut = ctx.Output("DDOut"); - // extract dOut(intput) dOut = ctx.Input("DOut"); PADDLE_ENFORCE_NOT_NULL( dOut, platform::errors::NotFound( "Cannot get input Variable dOut, variable name = %s", ctx.InputName("DOut"))); - - // set output dout_new dOutNew = ctx.Output("DOutNew"); - if (dOutNew) dOutNew->mutable_data(Out->dims(), ctx.GetPlace()); if (ddOut) ddOut->mutable_data(Out->dims(), ctx.GetPlace()); auto& place = ctx.template device_context(); @@ -1890,6 +2082,64 @@ class SigmoidDoubleGradKernel } }; +// Out, DDX, DOut, D_DDOut, D_DOut_New // input +// D_OutNew, D_DOut, D_DDx // output +template +class SigmoidTripleGradKernel + : public framework::OpKernel { + public: + using T = typename 
Functor::ELEMENT_TYPE;
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    const framework::Tensor *Out, *ddX, *dOut, *d_ddOut, *d_dOutNew;
+    framework::Tensor *d_OutNew, *d_dOut, *d_ddx;
+    Out = ddX = dOut = d_ddOut = d_dOutNew = nullptr;
+    d_OutNew = d_dOut = d_ddx = nullptr;
+
+    // extract ddx(input), out(input), dOut(input), d_ddOut(input),
+    // d_dOutNew(input)
+    ddX = ctx.Input<framework::Tensor>("DDX");
+    Out = ctx.Input<framework::Tensor>("Out");
+    dOut = ctx.Input<framework::Tensor>("DOut");
+    d_ddOut = ctx.Input<framework::Tensor>("D_DDOut");
+    d_dOutNew = ctx.Input<framework::Tensor>("D_DOut_New");
+
+    PADDLE_ENFORCE_NOT_NULL(
+        ddX, platform::errors::NotFound(
+                 "Cannot get input Variable ddX, variable name = %s",
+                 ctx.InputName("DDX")));
+    PADDLE_ENFORCE_NOT_NULL(
+        Out, platform::errors::NotFound(
+                 "Cannot get input Variable Out, variable name = %s",
+                 ctx.InputName("Out")));
+    PADDLE_ENFORCE_NOT_NULL(
+        dOut, platform::errors::NotFound(
+                  "Cannot get input Variable dOut, variable name = %s",
+                  ctx.InputName("DOut")));
+    PADDLE_ENFORCE_NOT_NULL(
+        d_ddOut, platform::errors::NotFound(
+                     "Cannot get input Variable d_ddOut, variable name = %s",
+                     ctx.InputName("D_DDOut")));
+    PADDLE_ENFORCE_NOT_NULL(
+        d_dOutNew,
+        platform::errors::NotFound(
+            "Cannot get input Variable d_dOutNew, variable name = %s",
+            ctx.InputName("D_DOut_New")));
+
+    // set output d_OutNew, d_dOut, d_ddx
+    d_dOut = ctx.Output<framework::Tensor>("D_DOut");
+    d_OutNew = ctx.Output<framework::Tensor>("D_OutNew");
+    d_ddx = ctx.Output<framework::Tensor>("D_DDx");
+
+    if (d_dOut)
d_dOut->mutable_data(Out->dims(), ctx.GetPlace()); + if (d_OutNew) d_OutNew->mutable_data(Out->dims(), ctx.GetPlace()); + if (d_ddx) d_ddx->mutable_data(ddX->dims(), ctx.GetPlace()); + auto& place = ctx.template device_context(); + Functor functor; + functor(place, Out, ddX, dOut, d_ddOut, d_dOutNew, // input + d_dOut, d_OutNew, d_ddx); // output + } +}; + template class SquareDoubleGradKernel : public framework::OpKernel { @@ -1996,6 +2303,33 @@ class ELUDoubleGradKernel } }; +template +class CELUDoubleGradKernel + : public framework::OpKernel { + public: + using T = typename Functor::ELEMENT_TYPE; + void Compute(const framework::ExecutionContext& ctx) const override { + const framework::Tensor *X, *ddX, *dOut; + X = ddX = dOut = nullptr; + framework::Tensor *dX, *ddOut; + dX = ddOut = nullptr; + + ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut); + + if (dX) dX->mutable_data(X->dims(), ctx.GetPlace()); + if (ddOut) ddOut->mutable_data(ctx.GetPlace()); + + auto& place = ctx.template device_context(); + + Functor functor; + auto attrs = functor.GetAttrs(); + for (auto& attr : attrs) { + *attr.second = ctx.Attr(attr.first); + } + functor(place, X, ddX, ddOut, dOut, dX); + } +}; + template class SqrtDoubleGradKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index eb218507103dd6..20c56d6a279334 100644 --- a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -459,6 +459,78 @@ class SigmoidGradNPUKernel : public framework::OpKernel { } }; +// Swish = x * sigmoid(beta * x) +template +class SwishNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + float beta = ctx.Attr("beta"); + + out->mutable_data(ctx.GetPlace()); + auto stream = + ctx.template device_context() + .stream(); + + const auto& muls_runner = + NpuOpRunner("Muls", {*x}, {*out}, {{"value", beta}}); + muls_runner.Run(stream); + + const auto& sigmoid_runner = NpuOpRunner("Sigmoid", {*out}, {*out}, {}); + sigmoid_runner.Run(stream); + + const auto& mul_runner = NpuOpRunner("Mul", {*x, *out}, {*out}); + mul_runner.Run(stream); + } +}; + +template +class SwishGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + float beta = ctx.Attr("beta"); + + dx->mutable_data(ctx.GetPlace()); + auto stream = + ctx.template device_context() + .stream(); + + Tensor beta_x, sigmoid_out, swish_out; + beta_x.mutable_data(x->dims(), ctx.GetPlace()); + sigmoid_out.mutable_data(x->dims(), ctx.GetPlace()); + swish_out.mutable_data(x->dims(), ctx.GetPlace()); + + const auto& muls_runner = + NpuOpRunner("Muls", {*x}, {beta_x}, {{"value", beta}}); + muls_runner.Run(stream); + + const auto& sigmoid_runner = + NpuOpRunner("Sigmoid", {beta_x}, {sigmoid_out}, {}); + sigmoid_runner.Run(stream); + + const auto& mul_runner = + NpuOpRunner("Mul", {sigmoid_out, *x}, {swish_out}, {}); + mul_runner.Run(stream); + + const auto& mul_runner1 = + NpuOpRunner("Mul", {sigmoid_out, swish_out}, {*dx}, {}); + mul_runner1.Run(stream); + + const auto& sub_runner = NpuOpRunner("Sub", {swish_out, *dx}, {*dx}, {}); + sub_runner.Run(stream); + + const auto& add_runner = NpuOpRunner("Add", 
{sigmoid_out, *dx}, {*dx}, {}); + add_runner.Run(stream); + + const auto& mul_runner2 = NpuOpRunner("Mul", {*dout, *dx}, {*dx}, {}); + mul_runner2.Run(stream); + } +}; + // HardSwish = min(max(0, x+offset), threshold) * x / scale template class HardSwishNPUKernel : public framework::OpKernel { @@ -936,6 +1008,12 @@ REGISTER_OP_NPU_KERNEL( ops::SigmoidGradNPUKernel); +REGISTER_OP_NPU_KERNEL(swish, ops::SwishNPUKernel, + ops::SwishNPUKernel); + +REGISTER_OP_NPU_KERNEL(swish_grad, ops::SwishGradNPUKernel, + ops::SwishGradNPUKernel); + REGISTER_OP_NPU_KERNEL(hard_swish, ops::HardSwishNPUKernel, ops::HardSwishNPUKernel); diff --git a/paddle/fluid/operators/arg_max_op_npu.cc b/paddle/fluid/operators/arg_max_op_npu.cc index 38f9813ad02b40..8b70332c651c8b 100644 --- a/paddle/fluid/operators/arg_max_op_npu.cc +++ b/paddle/fluid/operators/arg_max_op_npu.cc @@ -17,30 +17,49 @@ limitations under the Licnse. */ namespace paddle { namespace operators { + using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; -template -class ArgMaxNPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - int64_t axis = ctx.Attr("axis"); - auto dtype = ctx.Attr("dtype"); +template +struct VisitDataArgNPUMaxFunctor { + const framework::ExecutionContext& ctx; - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); + explicit VisitDataArgNPUMaxFunctor(const framework::ExecutionContext& ctx) + : ctx(ctx) {} + template + void apply() const { + auto& x = *(ctx.Input("X")); + auto& out = *(ctx.Output("Out")); + out.template mutable_data(ctx.GetPlace()); + auto axis = ctx.Attr("axis"); + auto dtype = ctx.Attr("dtype"); + auto stream = ctx.template device_context().stream(); NpuOpRunner runner; runner.SetType("ArgMaxV2") - .AddInput(*x) + .AddInput(x) .AddInput(std::vector{axis}) - .AddOutput(*out) - .AddAttr("dtype", dtype); + .AddOutput(out) + .AddAttrDataType("dtype", dtype) + .Run(stream); + } +}; - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); +template +class ArgMaxNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dtype = ctx.Attr("dtype"); + if (dtype < 0) { + framework::VisitDataTypeTiny(static_cast( + framework::proto::VarType::INT64), + VisitDataArgNPUMaxFunctor(ctx)); + return; + } + framework::VisitDataTypeTiny( + static_cast(dtype), + VisitDataArgNPUMaxFunctor(ctx)); } }; @@ -48,7 +67,5 @@ class ArgMaxNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - arg_max, ops::ArgMaxNPUKernel, - ops::ArgMaxNPUKernel); +REGISTER_OP_NPU_KERNEL(arg_max, ops::ArgMaxNPUKernel, + ops::ArgMaxNPUKernel); diff --git a/paddle/fluid/operators/arg_max_op_xpu.cc b/paddle/fluid/operators/arg_max_op_xpu.cc index 8060b5cf755c0e..71ec26ea5a7927 100644 --- a/paddle/fluid/operators/arg_max_op_xpu.cc +++ b/paddle/fluid/operators/arg_max_op_xpu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/
 #ifdef PADDLE_WITH_XPU
diff --git a/paddle/fluid/operators/arg_min_max_op_base.cu.h b/paddle/fluid/operators/arg_min_max_op_base.cu.h
index b19ba1e1590fe1..2c34d6f8300a74 100644
--- a/paddle/fluid/operators/arg_min_max_op_base.cu.h
+++ b/paddle/fluid/operators/arg_min_max_op_base.cu.h
@@ -89,22 +89,25 @@ void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
                     const int64_t n) {
   auto cu_stream = ctx.stream();
   auto ComputeBlockSize = [](int64_t col) {
+    auto block_size = 8;
     if (col > 512)
-      return 1024;
+      block_size = 1024;
     else if (col > 256)
-      return 512;
+      block_size = 512;
     else if (col > 128)
-      return 256;
+      block_size = 256;
     else if (col > 64)
-      return 128;
+      block_size = 128;
     else if (col > 32)
-      return 64;
+      block_size = 64;
     else if (col > 16)
-      return 32;
+      block_size = 32;
     else if (col > 8)
-      return 16;
-    else
-      return 8;
+      block_size = 16;
+#ifdef __HIPCC__
+    block_size = std::min(block_size, 256);
+#endif
+    return block_size;
   };
   int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
diff --git a/paddle/fluid/operators/arg_min_op_npu.cc b/paddle/fluid/operators/arg_min_op_npu.cc
index f776412c16239f..cc81e320080b74 100644
--- a/paddle/fluid/operators/arg_min_op_npu.cc
+++ b/paddle/fluid/operators/arg_min_op_npu.cc
@@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
-limitations under the Licnse. */
+limitations under the License. */
 #include "paddle/fluid/operators/arg_min_max_op_base.h"
 #include "paddle/fluid/operators/npu_op_runner.h"
diff --git a/paddle/fluid/operators/argsort_op_npu.cc b/paddle/fluid/operators/argsort_op_npu.cc
index e36dd322e0ea1d..f2a57b4b9bdfb1 100644
--- a/paddle/fluid/operators/argsort_op_npu.cc
+++ b/paddle/fluid/operators/argsort_op_npu.cc
@@ -1,8 +1,11 @@
 /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
+
 http://www.apache.org/licenses/LICENSE-2.0
+
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -15,156 +18,142 @@ limitations under the License.
*/ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void TranposeNPU(const framework::ExecutionContext& ctx, + const aclrtStream& stream, std::vector* perm, + const Tensor& in, Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(in) + .AddInput(std::move(*perm)) + .AddOutput(*out) + .Run(stream); +} + +static void CastToInt64(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& in, + Tensor* out) { + out->mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("Cast") + .AddInput(in) + .AddOutput(*out) + .AddAttr("dst_type", ACL_INT64) + .Run(stream); +} + +template class ArgsortNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); auto* output = ctx.Output("Out"); - output->mutable_data(ctx.GetPlace()); auto* indices = ctx.Output("Indices"); - indices->mutable_data(ctx.GetPlace()); + int axis = ctx.Attr("axis"); + bool descending = ctx.Attr("descending"); - int32_t axis = ctx.Attr("axis"); - auto in_dims = indices->dims(); + auto in_dims = input->dims(); axis = (axis < 0) ? (in_dims.size() + axis) : axis; - bool descending = ctx.Attr("descending"); - auto stream = - ctx.template device_context() - .stream(); - framework::NPUAttributeMap sort_attr_input = { - {"axis", static_cast(-1)}, {"descending", descending}}; + + auto stream = ctx.template device_context().stream(); + framework::NPUAttributeMap attr = {{"axis", -1}, + {"descending", descending}}; + + Tensor indices_tmp(framework::proto::VarType::INT32); + indices_tmp.Resize(indices->dims()); if (axis == -1 || axis + 1 == in_dims.size()) { - const auto& sort_runner = - NpuOpRunner("Sort", {*input}, {*output, *indices}, sort_attr_input); - sort_runner.Run(stream); + output->mutable_data(ctx.GetPlace()); + indices_tmp.mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("Sort", {*input}, {*output, indices_tmp}, attr); + runner.Run(stream); } else { - // transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); + std::vector perm; + for (int64_t i = 0; i < in_dims.size(); i++) { + perm.emplace_back(i); } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; + std::swap(perm[axis], perm[in_dims.size() - 1]); + + std::vector shape; + for (size_t i = 0; i < perm.size(); i++) { + shape.emplace_back(in_dims[perm[i]]); } - framework::NPUAttributeMap trans_attr_input = {{"perm", trans}}; - Tensor trans_input; - trans_input.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_input_runner = - NpuOpRunner("TransposeD", {*input}, {trans_input}, trans_attr_input); - trans_input_runner.Run(stream); - Tensor trans_indices; - trans_indices.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_indice_runner = NpuOpRunner( - "TransposeD", {*indices}, {trans_indices}, trans_attr_input); - trans_indice_runner.Run(stream); - Tensor trans_output; + auto trans_dims = framework::make_ddim(shape); + + Tensor trans_input(input->type()); + trans_input.Resize(trans_dims); + TranposeNPU(ctx, stream, &perm, *input, &trans_input); + + Tensor trans_output(input->type()); + Tensor 
trans_indices(framework::proto::VarType::INT32); trans_output.mutable_data(trans_dims, ctx.GetPlace()); - const auto& trans_output_runner = NpuOpRunner( - "TransposeD", {*output}, {trans_output}, trans_attr_input); - trans_output_runner.Run(stream); - const auto& sort_runner = - NpuOpRunner("Sort", {trans_input}, {trans_output, trans_indices}, - sort_attr_input); - sort_runner.Run(stream); - // transpose back - const auto& trans_indices_back_runner = NpuOpRunner( - "TransposeD", {trans_indices}, {*indices}, trans_attr_input); - trans_indices_back_runner.Run(stream); - const auto& trans_output_back_runner = NpuOpRunner( - "TransposeD", {trans_output}, {*output}, trans_attr_input); - trans_output_back_runner.Run(stream); + trans_indices.mutable_data(trans_dims, ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Sort", {trans_input}, + {trans_output, trans_indices}, attr); + runner.Run(stream); + + TranposeNPU(ctx, stream, &perm, trans_output, output); + TranposeNPU(ctx, stream, &perm, trans_indices, &indices_tmp); } + CastToInt64(ctx, stream, indices_tmp, indices); } }; -template -static void ReshapeNPU(const framework::Tensor* input, - const std::vector& input_shapes, - framework::Tensor* output) { - output->ShareDataWith(*input); - output->Resize(framework::make_ddim(std::move(input_shapes))); -} - template static void FullAssignNPU(const framework::ExecutionContext& ctx, - Type ind_lastdim, Type outer_dim, - const framework::DDim& trans_dims, - const framework::Tensor* input, - const framework::Tensor* indices, - framework::Tensor* t_out) { - // reshape input - Type input_shape = ind_lastdim * outer_dim; - std::vector input_shapes = {input_shape}; - Tensor input_reshape_tensor(input->type()); - ReshapeNPU(input, input_shapes, &input_reshape_tensor); - // reshape index - std::vector index_shapes = {outer_dim, ind_lastdim}; - framework::DDim ind_2d = framework::make_ddim({outer_dim, ind_lastdim}); - Tensor ind_2d_tensor(indices->type()); - ReshapeNPU(indices, index_shapes, &ind_2d_tensor); - // range_flatten_index - std::vector range_flatten_index; - for (Type i = 0; i < input_shape; i += ind_lastdim) { - range_flatten_index.push_back(static_cast(i)); + const aclrtStream& stream, + const framework::DDim in_dims, const Tensor& input, + const Tensor& indices, Tensor* t_out) { + const int64_t input_height = + framework::product(framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + Tensor input_tmp; + input_tmp.ShareDataWith(input); + input_tmp.Resize( + framework::make_ddim(std::vector{input_height * input_width})); + + Tensor indices_tmp; + indices_tmp.ShareDataWith(indices); + indices_tmp.Resize( + framework::make_ddim(std::vector{input_height, input_width})); + + std::vector indexs_value; + for (Type i = 0; i < input_height; i++) { + indexs_value.push_back(i * input_width); } - Tensor range_flatten_index_tensor(framework::proto::VarType::INT32); - range_flatten_index_tensor.Resize(framework::make_ddim({outer_dim})); - range_flatten_index_tensor.mutable_data( - {static_cast(range_flatten_index.size())}, ctx.GetPlace()); - TensorFromVector(range_flatten_index, ctx.device_context(), - &range_flatten_index_tensor); - Tensor range_flatten_index_expand_tensor(range_flatten_index_tensor.type()); - std::vector flatten_shape = {outer_dim, 1}; - ReshapeNPU(&range_flatten_index_tensor, flatten_shape, - &range_flatten_index_expand_tensor); - auto stream = - ctx.template device_context() - .stream(); - Tensor ind_2d_add_tensor; - 
ind_2d_add_tensor.mutable_data(ind_2d, ctx.GetPlace()); - const auto& runner_ind_2d_tensor = NpuOpRunner( - std::string("Add"), {ind_2d_tensor, range_flatten_index_expand_tensor}, - {ind_2d_add_tensor}, {}); - runner_ind_2d_tensor.Run(stream); - Tensor ind_reshape_tensor(ind_2d_add_tensor.type()); - ReshapeNPU(&ind_2d_add_tensor, input_shapes, &ind_reshape_tensor); - Tensor ind_reshape_expand_tensor(ind_reshape_tensor.type()); - std::vector ind_shape = {input_shape, 1}; - ReshapeNPU(&ind_reshape_tensor, ind_shape, &ind_reshape_expand_tensor); - // expand_index - Tensor input_scatter_tensor; - input_scatter_tensor.Resize({input_shape}); - input_scatter_tensor.mutable_data(ctx.GetPlace()); - Tensor input_scatter_tensor_ori; - input_scatter_tensor_ori.Resize({input_shape}); - input_scatter_tensor_ori.mutable_data(ctx.GetPlace()); - std::vector trans_shapes; - - for (int i = 0; i < trans_dims.size(); i++) { - trans_shapes.push_back(trans_dims[i]); - } - NpuOpRunner runner_scatter; - runner_scatter.SetType("TensorScatterUpdate") - .AddInput(input_scatter_tensor_ori) - .AddInput(ind_reshape_expand_tensor) - .AddInput(input_reshape_tensor) - .AddOutput(input_scatter_tensor); - runner_scatter.Run(stream); - framework::TensorCopy(input_scatter_tensor, ctx.GetPlace(), - ctx.template device_context(), - t_out); - t_out->Resize(framework::make_ddim(trans_shapes)); + Tensor indexs_tmp(indices.type()); + framework::TensorFromVector(indexs_value, ctx.device_context(), + &indexs_tmp); + indexs_tmp.Resize( + framework::make_ddim(std::vector{input_height, 1})); + + Tensor indices_index(indices.type()); + indices_index.mutable_data(indices_tmp.dims(), ctx.GetPlace()); + const auto& runner_add = + NpuOpRunner("Add", {indices_tmp, indexs_tmp}, {indices_index}, {}); + runner_add.Run(stream); + + indices_index.Resize( + framework::make_ddim(std::vector{input_height * input_width})); + + t_out->mutable_data(ctx.GetPlace()); + Tensor out_tmp(t_out->type()); + out_tmp.ShareDataWith(*t_out); + + const auto& runner = + NpuOpRunner("TensorScatterUpdate", {input_tmp, indices_index, input_tmp}, + {out_tmp}, {}); + runner.Run(stream); } -template +template class ArgsortGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -172,75 +161,42 @@ class ArgsortGradNPUKernel : public framework::OpKernel { auto* dX = ctx.Output(framework::GradVarName("X")); auto* dO = ctx.Input(framework::GradVarName("Out")); int axis = ctx.Attr("axis"); + auto in_dims = indices->dims(); axis = (axis < 0) ? 
(in_dims.size() + axis) : axis; - auto place = ctx.GetPlace(); - - auto stream = - ctx.template device_context() - .stream(); - dX->mutable_data(ctx.GetPlace()); - Tensor dxt; - dxt.mutable_data(dX->dims(), place); - const auto& runner_flatten = - NpuOpRunner(std::string("Flatten"), {*dX}, {dxt}, {}); - runner_flatten.Run(stream); - FillNpuTensorWithConstant(&dxt, static_cast(0)); if (dO->numel() == 0) return; - // Do full assig n - if (axis == -1 || axis + 1 == in_dims.size()) { - const int64_t outer_dim = framework::product( - framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); - const int64_t ind_lastdim = in_dims[in_dims.size() - 1]; - FullAssignNPU(ctx, ind_lastdim, outer_dim, in_dims, dO, - indices, dX); + auto stream = ctx.template device_context().stream(); + + if (axis == -1 || axis + 1 == in_dims.size()) { + FullAssignNPU(ctx, stream, in_dims, *dO, *indices, dX); } else { - // If not full assign do transpose - std::vector trans; - for (int i = 0; i < axis; i++) { - trans.push_back(i); - } - trans.push_back(in_dims.size() - 1); - for (int i = axis + 1; i < in_dims.size() - 1; i++) { - trans.push_back(i); + std::vector perm; + for (int64_t i = 0; i < in_dims.size(); i++) { + perm.emplace_back(i); } - trans.push_back(axis); - framework::DDim trans_dims(in_dims); - for (size_t i = 0; i < trans.size(); i++) { - trans_dims[i] = in_dims[trans[i]]; - } - std::vector axis; - for (size_t i = 0; i < trans.size(); i++) { - axis.push_back(in_dims[trans[i]]); + std::swap(perm[axis], perm[in_dims.size() - 1]); + + std::vector shape; + for (size_t i = 0; i < perm.size(); i++) { + shape.emplace_back(in_dims[perm[i]]); } - framework::NPUAttributeMap attr_input = {{"perm", trans}}; - Tensor trans_dO; - trans_dO.mutable_data(trans_dims, ctx.GetPlace()); - Tensor trans_ind; - trans_ind.mutable_data(trans_dims, ctx.GetPlace()); - // Do transpose - const auto& runner_transpose_dx = NpuOpRunner( - std::string("TransposeD"), {*dO}, {trans_dO}, {attr_input}); - runner_transpose_dx.Run(stream); - const auto& runner_transpose_ind = NpuOpRunner( - std::string("TransposeD"), {*indices}, {trans_ind}, {attr_input}); - runner_transpose_ind.Run(stream); - - const int64_t outer_dim = framework::product( - framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); - const int64_t ind_lastdim = trans_dims[trans_dims.size() - 1]; - - Tensor tmp_out; - tmp_out.mutable_data(trans_dims, ctx.GetPlace()); - - FullAssignNPU(ctx, ind_lastdim, outer_dim, trans_dims, - &trans_dO, &trans_ind, &tmp_out); - - // transpose back - const auto& runner_transpose_out = NpuOpRunner( - std::string("TransposeD"), {tmp_out}, {*dX}, {attr_input}); - runner_transpose_out.Run(stream); + auto trans_dims = framework::make_ddim(shape); + + Tensor trans_dout(dO->type()); + Tensor trans_ids(indices->type()); + trans_dout.Resize(trans_dims); + trans_ids.Resize(trans_dims); + + TranposeNPU(ctx, stream, &perm, *dO, &trans_dout); + TranposeNPU(ctx, stream, &perm, *indices, &trans_ids); + + Tensor trans_dx(dO->type()); + trans_dx.Resize(trans_dims); + FullAssignNPU(ctx, stream, trans_dims, trans_dout, trans_ids, + &trans_dx); + + TranposeNPU(ctx, stream, &perm, trans_dx, dX); } } }; @@ -251,11 +207,8 @@ class ArgsortGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - argsort, ops::ArgsortNPUKernel, - ops::ArgsortNPUKernel); +REGISTER_OP_NPU_KERNEL(argsort, ops::ArgsortNPUKernel, + ops::ArgsortNPUKernel); -REGISTER_OP_NPU_KERNEL(argsort_grad, - 
ops::ArgsortGradNPUKernel, - ops::ArgsortGradNPUKernel); +REGISTER_OP_NPU_KERNEL(argsort_grad, ops::ArgsortGradNPUKernel, + ops::ArgsortGradNPUKernel); diff --git a/paddle/fluid/operators/batch_norm_op_npu.cc b/paddle/fluid/operators/batch_norm_op_npu.cc index dfb620a4e96bdb..3bcd0ac37b3750 100644 --- a/paddle/fluid/operators/batch_norm_op_npu.cc +++ b/paddle/fluid/operators/batch_norm_op_npu.cc @@ -38,11 +38,13 @@ class NPUBatchNormOpKernel : public framework::OpKernel { const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ(x_dims.size(), 4, - platform::errors::InvalidArgument( - "The input tensor X's dimension must equal to 4. But " - "received X's shape = [%s], X's dimension = [%d].", - x_dims, x_dims.size())); + PADDLE_ENFORCE_EQ( + (x_dims.size() == 4UL || x_dims.size() == 3UL), true, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 3 or 4. " + " But got X's shape = [%s], X's dimension = [%d].", + x_dims.to_str(), x_dims.size())); + const auto *running_mean = ctx.Input("Mean"); const auto *running_var = ctx.Input("Variance"); const auto *scale = ctx.Input("Scale"); @@ -51,8 +53,11 @@ class NPUBatchNormOpKernel : public framework::OpKernel { auto *y = ctx.Output("Y"); y->mutable_data(ctx.GetPlace()); - Tensor x_tensor(x->type()); - Tensor y_tesnor(y->type()); + auto &dev_ctx = ctx.template device_context(); + auto x_tensor = + ctx.AllocateTmpTensor(x->dims(), dev_ctx); + auto y_tesnor = + ctx.AllocateTmpTensor(y->dims(), dev_ctx); x_tensor.ShareDataWith(*x); y_tesnor.ShareDataWith(*y); if (data_layout == DataLayout::kNHWC) { @@ -89,6 +94,18 @@ class NPUBatchNormOpKernel : public framework::OpKernel { sum.mutable_data(running_mean->dims(), ctx.GetPlace()); square_sum.mutable_data(running_mean->dims(), ctx.GetPlace()); + // BNTrainingReduce ONLY support rank = 4 + if (x->dims().size() == 3) { + auto x_shape_vec = framework::vectorize(x->dims()); + if (data_layout == DataLayout::kNCHW) { + x_shape_vec.push_back(1); // expand NCL -> NCL1 + } else { + x_shape_vec.insert(x_shape_vec.begin() + 2, 1); // expand NLC -> NL1C + } + auto x_new_shape = framework::make_ddim(x_shape_vec); + x_tensor.Resize(x_new_shape); + x_tensor.Resize(x_new_shape); + } const auto &runner_reduce = NpuOpRunner("BNTrainingReduce", {x_tensor}, {sum, square_sum}, {{"epsilon", epsilon}}); @@ -127,8 +144,11 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { use_global_stats = is_test || use_global_stats; - Tensor x_tensor(x->type()); - Tensor dy_tensor(d_y->type()); + auto &dev_ctx = ctx.template device_context(); + auto x_tensor = + ctx.AllocateTmpTensor(x->dims(), dev_ctx); + auto dy_tensor = + ctx.AllocateTmpTensor(d_y->dims(), dev_ctx); x_tensor.ShareDataWith(*x); dy_tensor.ShareDataWith(*d_y); if (data_layout == DataLayout::kNHWC) { @@ -136,14 +156,14 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { dy_tensor.set_layout(DataLayout::kNHWC); } - Tensor scale_grad_tmp(scale->type()); - Tensor bias_grad_tmp(bias->type()); + auto scale_grad_tmp = + ctx.AllocateTmpTensor(scale->dims(), dev_ctx); + auto bias_grad_tmp = + ctx.AllocateTmpTensor(bias->dims(), dev_ctx); if (d_scale == nullptr) { - scale_grad_tmp.Resize(scale->dims()); d_scale = &scale_grad_tmp; } if (d_bias == nullptr) { - bias_grad_tmp.Resize(bias->dims()); d_bias = &bias_grad_tmp; } @@ -169,9 +189,26 @@ class NPUBatchNormGradOpKernel : public framework::OpKernel { } if (d_x) { d_x->mutable_data(ctx.GetPlace()); - Tensor dx_tensor(d_x->type()); + auto dx_tensor = + 
ctx.AllocateTmpTensor(d_x->dims(), dev_ctx); dx_tensor.ShareDataWith(*d_x); + if (data_layout == DataLayout::kNHWC) { + dx_tensor.set_layout(DataLayout::kNHWC); + } if (use_global_stats) { + if (x->dims().size() == 3) { + // BNInferGrad only support x rank = 4, + auto x_shape_vec = framework::vectorize(d_x->dims()); + if (data_layout == DataLayout::kNCHW) { + x_shape_vec.push_back(1); // expand NCL -> NCL1 + } else { + x_shape_vec.insert(x_shape_vec.begin() + 2, + 1); // expand NLC -> NL1C + } + auto x_new_shape = framework::make_ddim(x_shape_vec); + dx_tensor.Resize(x_new_shape); + dy_tensor.Resize(x_new_shape); + } const auto *running_var = ctx.Input("Variance"); const auto &runner_infer = NpuOpRunner("BNInferGrad", {dy_tensor, *scale, *running_var}, diff --git a/paddle/fluid/operators/bincount_op.cc b/paddle/fluid/operators/bincount_op.cc new file mode 100644 index 00000000000000..8b2fa60f8722e5 --- /dev/null +++ b/paddle/fluid/operators/bincount_op.cc @@ -0,0 +1,116 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/bincount_op.h" + +#include +#include +#include + +namespace paddle { +namespace operators { + +using framework::OpKernelType; +using framework::Tensor; + +class BincountOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of BincountOp should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of BincountOp should not be null.")); + + auto input_dim = ctx->GetInputDim("X"); + auto minlength = ctx->Attrs().Get("minlength"); + + PADDLE_ENFORCE_GE(minlength, 0, + platform::errors::InvalidArgument( + "The minlength should be greater than or equal to 0." + "But received minlength is %d", + minlength)); + + PADDLE_ENFORCE_EQ(input_dim.size(), 1, + platform::errors::InvalidArgument( + "The 'shape' of Input(X) must be 1-D tensor." + "But the dimension of Input(X) is [%d]", + input_dim.size())); + + if (ctx->HasInput("Weights")) { + auto weights_dim = ctx->GetInputDim("Weights"); + PADDLE_ENFORCE_EQ(weights_dim.size(), 1, + platform::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be 1-D tensor." + "But the dimension of Input(Weights) is [%d]", + weights_dim.size())); + + PADDLE_ENFORCE_EQ( + weights_dim[0], input_dim[0], + platform::errors::InvalidArgument( + "The 'shape' of Input(Weights) must be equal to the 'shape' of " + "Input(X)." 
+ "But received: the 'shape' of Input(Weights) is [%s]," + "the 'shape' of Input(X) is [%s]", + weights_dim, input_dim)); + } + + ctx->SetOutputDim("Out", framework::make_ddim({-1})); + ctx->ShareLoD("X", /*->*/ "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const { + auto data_type = + ctx.HasInput("Weights") + ? OperatorWithKernel::IndicateVarDataType(ctx, "Weights") + : OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class BincountOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input tensor of Bincount op,"); + AddInput("Weights", "(Tensor) The weights tensor of Bincount op,") + .AsDispensable(); + AddOutput("Out", "(Tensor) The output tensor of Bincount op,"); + AddAttr("minlength", "(int) The minimal numbers of bins") + .SetDefault(0) + .EqualGreaterThan(0); + AddComment(R"DOC( + Bincount Operator. + Computes frequency of each value in the input tensor. + Elements of input tensor should be non-negative ints. + )DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + bincount, ops::BincountOp, ops::BincountOpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL( + bincount, ops::BincountKernel, + ops::BincountKernel, + ops::BincountKernel, + ops::BincountKernel); diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu new file mode 100644 index 00000000000000..757f7286291069 --- /dev/null +++ b/paddle/fluid/operators/bincount_op.cu @@ -0,0 +1,160 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/operators/bincount_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" +#include "paddle/fluid/platform/hostdevice.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using platform::PADDLE_CUDA_NUM_THREADS; + +inline int GET_BLOCKS(const int N) { + return (N + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS; +} + +template +__global__ void KernelBincount(const InputT* input, const int total_elements, + const bool has_weights, const T* weights, + OutT* output) { + if (!has_weights) { + for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[input[i]], 1L); + } + } else { + for (int i = threadIdx.x; i < total_elements; i += blockDim.x) { + paddle::platform::CudaAtomicAdd(&output[input[i]], + static_cast(weights[i])); + } + } +} + +template +void BincountCUDAInner(const framework::ExecutionContext& context) { + const Tensor* input = context.Input("X"); + const Tensor* weights = context.Input("Weights"); + Tensor* output = context.Output("Out"); + auto& minlength = context.Attr("minlength"); + + const InputT* input_data = input->data(); + + const int input_numel = input->numel(); + + if (input_data == nullptr) { + framework::DDim out_dim{0}; + output->Resize(out_dim); + output->mutable_data(context.GetPlace()); + return; + } + auto input_x = framework::EigenVector::Flatten(*input); + + framework::Tensor input_min_t, input_max_t; + auto* input_max_data = + input_max_t.mutable_data({1}, context.GetPlace()); + auto* input_min_data = + input_min_t.mutable_data({1}, context.GetPlace()); + + auto input_max_scala = framework::EigenScalar::From(input_max_t); + auto input_min_scala = framework::EigenScalar::From(input_min_t); + + auto* place = context.template device_context().eigen_device(); + input_max_scala.device(*place) = input_x.maximum(); + input_min_scala.device(*place) = input_x.minimum(); + + Tensor input_min_cpu, input_max_cpu; + TensorCopySync(input_max_t, platform::CPUPlace(), &input_max_cpu); + TensorCopySync(input_min_t, platform::CPUPlace(), &input_min_cpu); + + InputT input_min = input_min_cpu.data()[0]; + + PADDLE_ENFORCE_GE( + input_min, static_cast(0), + platform::errors::InvalidArgument( + "The elements in input tensor must be non-negative ints")); + + int64_t output_size = + static_cast(input_max_cpu.data()[0]) + 1L; + + output_size = std::max(output_size, static_cast(minlength)); + framework::DDim out_dim{output_size}; + output->Resize(out_dim); + + bool has_weights = (weights != nullptr); + + const T* weights_data = has_weights ? 
weights->data() : nullptr; + + auto stream = + context.template device_context().stream(); + + if (!has_weights) { + int64_t* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, 0L); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } else { + const auto& weights_type = weights->type(); + + if (weights_type == framework::proto::VarType::FP32) { + float* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } else { + double* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + + KernelBincount<<>>( + input_data, input_numel, has_weights, weights_data, output_data); + } + } +} + +template +class BincountCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("X"); + const auto& input_type = input->type(); + + if (input_type == framework::proto::VarType::INT32) { + BincountCUDAInner(context); + } else if (input_type == framework::proto::VarType::INT64) { + BincountCUDAInner(context); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + bincount, ops::BincountCUDAKernel, + ops::BincountCUDAKernel, + ops::BincountCUDAKernel, + ops::BincountCUDAKernel); diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h new file mode 100644 index 00000000000000..a142332bce2669 --- /dev/null +++ b/paddle/fluid/operators/bincount_op.h @@ -0,0 +1,109 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void BincountInner(const framework::ExecutionContext& context) { + const Tensor* input = context.Input("X"); + const Tensor* weights = context.Input("Weights"); + Tensor* output = context.Output("Out"); + auto& minlength = context.Attr("minlength"); + + const InputT* input_data = input->data(); + + auto input_numel = input->numel(); + + if (input_data == nullptr) { + framework::DDim out_dim{0}; + output->Resize(out_dim); + output->mutable_data(context.GetPlace()); + return; + } + + PADDLE_ENFORCE_GE( + *std::min_element(input_data, input_data + input_numel), + static_cast(0), + platform::errors::InvalidArgument( + "The elements in input tensor must be non-negative ints")); + + int64_t output_size = static_cast(*std::max_element( + input_data, input_data + input_numel)) + + 1L; + output_size = std::max(output_size, static_cast(minlength)); + + framework::DDim out_dim{output_size}; + output->Resize(out_dim); + + bool has_weights = (weights != nullptr); + + if (has_weights) { + const T* weights_data = weights->data(); + const auto& weights_type = weights->type(); + if (weights_type == framework::proto::VarType::FP32) { + float* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += static_cast(weights_data[i]); + } + } else { + double* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, + static_cast(0)); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += static_cast(weights_data[i]); + } + } + + } else { + int64_t* output_data = output->mutable_data(context.GetPlace()); + math::SetConstant()( + context.template device_context(), output, 0L); + for (int64_t i = 0; i < input_numel; i++) { + output_data[input_data[i]] += 1L; + } + } +} + +template +class BincountKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* input = context.Input("X"); + const auto& input_type = input->type(); + + if (input_type == framework::proto::VarType::INT32) { + BincountInner(context); + } else if (input_type == framework::proto::VarType::INT64) { + BincountInner(context); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 06300817e0a128..05a110fe65b839 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -47,12 +47,12 @@ __global__ void CastCUDAKernel(const InT* in, const int64_t N, OutT* out) { } template -struct CastOpFunctor { +struct CastCUDAOpFunctor { const framework::Tensor* in_; framework::Tensor* out_; const platform::CUDADeviceContext& ctx_; - CastOpFunctor(const framework::Tensor* in, framework::Tensor* out, - const platform::CUDADeviceContext& ctx) + CastCUDAOpFunctor(const framework::Tensor* in, framework::Tensor* out, + const platform::CUDADeviceContext& ctx) : in_(in), out_(out), ctx_(ctx) {} template @@ -75,41 +75,38 @@ struct CastOpFunctor { } }; +template +class CastCUDAOpKernel : public framework::OpKernel { + public: + void Compute(const 
framework::ExecutionContext& context) const override { + auto* in = context.Input("X"); + auto* out = context.Output("Out"); + framework::VisitDataType( + static_cast( + context.Attr("out_dtype")), + CastCUDAOpFunctor( + in, out, + context.template device_context())); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; +namespace plat = paddle::platform; + +#define REGISTER_CAST_CUDA_BASE(op_name, ...) \ + REGISTER_OP_CUDA_KERNEL( \ + op_name, ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel, ops::CastCUDAOpKernel, \ + ops::CastCUDAOpKernel>, \ + ops::CastCUDAOpKernel>, ##__VA_ARGS__); -#ifdef PADDLE_WITH_HIP -REGISTER_OP_CUDA_KERNEL( - cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel>, - ops::CastOpKernel>); +#if !defined(PADDLE_WITH_HIP) +REGISTER_CAST_CUDA_BASE(cast, ops::CastCUDAOpKernel) #else -REGISTER_OP_CUDA_KERNEL( - cast, ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel, - ops::CastOpKernel>, - ops::CastOpKernel>); +REGISTER_CAST_CUDA_BASE(cast) #endif diff --git a/paddle/fluid/operators/cast_op_xpu.cc b/paddle/fluid/operators/cast_op_xpu.cc index c7c0f81f2131f7..c1a296f2b2788d 100644 --- a/paddle/fluid/operators/cast_op_xpu.cc +++ b/paddle/fluid/operators/cast_op_xpu.cc @@ -23,6 +23,9 @@ limitations under the License. */ namespace paddle { namespace operators { +using var_type = framework::proto::VarType; +namespace plat = paddle::platform; + template class CastXPUKernel : public framework::OpKernel { using XPUInTDType = typename XPUTypeTrait::Type; @@ -31,53 +34,49 @@ class CastXPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto* in = context.Input("X"); auto* out = context.Output("Out"); - auto in_type = static_cast( - context.Attr("in_dtype")); - auto out_type = static_cast( - context.Attr("out_dtype")); + auto in_type = static_cast(context.Attr("in_dtype")); + auto out_type = static_cast(context.Attr("out_dtype")); auto* in_data = in->data(); auto numel = in->numel(); auto& dev_ctx = context.template device_context(); int r = -1; - if (out_type == framework::proto::VarType::FP32) { - auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), reinterpret_cast(in_data), - out_data, numel); - } else if (out_type == framework::proto::VarType::INT32) { - auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), reinterpret_cast(in_data), - out_data, numel); - } else if (out_type == framework::proto::VarType::INT64) { - auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), reinterpret_cast(in_data), - out_data, numel); - } else if ((out_type == framework::proto::VarType::BOOL) && - (in_type == framework::proto::VarType::FP32)) { - auto* out_data = out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), (const float*)in_data, - reinterpret_cast(out_data), numel); - } else if (out_type == framework::proto::VarType::FP16) { - auto* out_data = - out->mutable_data(context.GetPlace()); - r = xpu::cast_v2( - dev_ctx.x_context(), 
reinterpret_cast(in_data), - reinterpret_cast(out_data), numel); - - } else { - PADDLE_THROW(platform::errors::Unavailable("Not supported cast %d -> %d", - in_type, out_type)); + switch (out_type) { + case var_type::FP32: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out->mutable_data(context.GetPlace()), numel); + break; + case var_type::FP16: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + reinterpret_cast( + out->mutable_data(context.GetPlace())), + numel); + break; + case var_type::INT64: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out->mutable_data(context.GetPlace()), numel); + break; + case var_type::INT32: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out->mutable_data(context.GetPlace()), numel); + break; + case var_type::BOOL: + r = xpu::cast_v2( + dev_ctx.x_context(), reinterpret_cast(in_data), + out->mutable_data(context.GetPlace()), numel); + break; + default: + PADDLE_THROW(platform::errors::Unavailable( + "Not supported cast %d -> %d", in_type, out_type)); } PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, - platform::errors::External( - "XPU API return wrong value[%d], please check whether " - "Baidu Kunlun Card is properly installed.", - r)); + platform::errors::External("XPU CAST API return wrong value[%d %s].", r, + XPUAPIErrorMsg[r])); } }; @@ -90,5 +89,6 @@ REGISTER_OP_XPU_KERNEL( ops::CastXPUKernel, ops::CastXPUKernel, - ops::CastXPUKernel); + ops::CastXPUKernel, + ops::CastXPUKernel); #endif diff --git a/paddle/fluid/operators/clip_op.cu b/paddle/fluid/operators/clip_op.cu index fd61e4ea61d4ff..846354fcb81c5f 100644 --- a/paddle/fluid/operators/clip_op.cu +++ b/paddle/fluid/operators/clip_op.cu @@ -19,10 +19,14 @@ REGISTER_OP_CUDA_KERNEL( clip, ops::ClipKernel, ops::ClipKernel, ops::ClipKernel, - ops::ClipKernel); + ops::ClipKernel, + ops::ClipKernel); REGISTER_OP_CUDA_KERNEL( clip_grad, ops::ClipGradKernel, ops::ClipGradKernel, ops::ClipGradKernel, - ops::ClipGradKernel); + ops::ClipGradKernel, + ops::ClipGradKernel); diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h index 93157ed9d47bbc..abf721936b41e3 100644 --- a/paddle/fluid/operators/clip_op.h +++ b/paddle/fluid/operators/clip_op.h @@ -54,7 +54,7 @@ class ClipGradFunctor { public: explicit ClipGradFunctor(const T min, const T max) : min_(min), max_(max) {} HOSTDEVICE T operator()(const T& x, const T& y) const { - return (y > min_ && y < max_) ? x : 0; + return (y > min_ && y < max_) ? x : static_cast(0); } private: @@ -79,7 +79,7 @@ class ClipKernel : public framework::OpKernel { } max = static_cast(max); - auto min = context.Attr("min"); + auto min = static_cast(context.Attr("min")); Tensor min_cpu; if (context.HasInput("Min")) { auto* min_t = context.Input("Min"); @@ -156,7 +156,7 @@ class ClipGradKernel : public framework::OpKernel { } max = static_cast(max); - auto min = context.Attr("min"); + auto min = static_cast(context.Attr("min")); Tensor min_cpu; if (context.HasInput("Min")) { auto* min_t = context.Input("Min"); diff --git a/paddle/fluid/operators/clip_op_xpu.cc b/paddle/fluid/operators/clip_op_xpu.cc new file mode 100644 index 00000000000000..7d4b02af418bef --- /dev/null +++ b/paddle/fluid/operators/clip_op_xpu.cc @@ -0,0 +1,78 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/clip_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ClipXPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto max = static_cast(ctx.Attr("max")); + if (ctx.HasInput("Max")) { + Tensor max_cpu; + auto* max_t = ctx.Input("Max"); + auto* max_data = max_t->data(); + if (platform::is_xpu_place(max_t->place())) { + TensorCopySync(*max_t, platform::CPUPlace(), &max_cpu); + max_data = max_cpu.data(); + } + max = max_data[0]; + } + + auto min = ctx.Attr("min"); + if (ctx.HasInput("Min")) { + Tensor min_cpu; + auto* min_t = ctx.Input("Min"); + auto* min_data = min_t->data(); + if (platform::is_xpu_place(min_t->place())) { + TensorCopySync(*min_t, platform::CPUPlace(), &min_cpu); + min_data = min_cpu.data(); + } + min = min_data[0]; + } + + using XPUDataType = typename XPUTypeTrait::Type; + auto& dev_ctx = ctx.template device_context(); + auto x_data = reinterpret_cast(x->data()); + auto out_data = reinterpret_cast(out->data()); + int r = xpu::clip_v2(dev_ctx.x_context(), x_data, out_data, x->numel(), min, + max); + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, platform::errors::External( + "XPU API(clip_v2) return wrong " + "value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(clip, ops::ClipXPUKernel); + +#endif diff --git a/paddle/fluid/operators/collective/c_embedding_op_npu.cc b/paddle/fluid/operators/collective/c_embedding_op_npu.cc index c2d607223868a2..021e5790afe579 100644 --- a/paddle/fluid/operators/collective/c_embedding_op_npu.cc +++ b/paddle/fluid/operators/collective/c_embedding_op_npu.cc @@ -68,10 +68,21 @@ void shard_index(const Tensor &table_t, const Tensor &ids_t, int64_t start_idx, ignore_tensor.Resize(ids_t.dims()); NpuOpRunner sub_runner; +#if (CANN_VERSION_CODE >= 503003) + Tensor factor_tensor(ids_t.type()); + factor_tensor.mutable_data({1}, context.GetPlace()); + TensorFromVector(std::vector{static_cast(start_idx)}, + context.device_context(), &factor_tensor); + sub_runner.SetType("Sub") + .AddInput(ids_t) + .AddInput(factor_tensor) + .AddOutput(id_t); +#else sub_runner.SetType("Sub") .AddInput(ids_t) .AddInput(std::vector{static_cast(start_idx)}) .AddOutput(id_t); +#endif sub_runner.Run(); NpuOpRunner lessequal1_runner; @@ -137,6 +148,9 @@ void NPUGetIdsEmbedding(const framework::ExecutionContext &context) { .AddInput(table_t_pad) .AddInput(ids_t_local) .AddInput(std::vector{0}) +#if (CANN_VERSION_CODE >= 503003) + .AddAttrs({{"batch_dims", 0}}) +#endif .AddOutput(*output_t); runner.Run(); } diff --git a/paddle/fluid/operators/collective/global_scatter_op.cu.cc b/paddle/fluid/operators/collective/global_scatter_op.cu.cc index 64765b549e5c1f..bec984c6b57e19 100644 --- 
a/paddle/fluid/operators/collective/global_scatter_op.cu.cc +++ b/paddle/fluid/operators/collective/global_scatter_op.cu.cc @@ -47,8 +47,8 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { if (platform::is_cpu_place(local_count->place())) { cpu_local_count_data = local_count->data(); } else { - framework::TensorCopy(*local_count, platform::CPUPlace(), - &cpu_local_count); + framework::TensorCopySync(*local_count, platform::CPUPlace(), + &cpu_local_count); cpu_local_count_data = cpu_local_count.data(); } auto global_count_len = 0; @@ -57,8 +57,8 @@ class GlobalScatterOpCUDAKernel : public framework::OpKernel { cpu_global_count_data = global_count->data(); global_count_len = global_count->numel(); } else { - framework::TensorCopy(*global_count, platform::CPUPlace(), - &cpu_global_count); + framework::TensorCopySync(*global_count, platform::CPUPlace(), + &cpu_global_count); cpu_global_count_data = cpu_global_count.data(); global_count_len = cpu_global_count.numel(); } diff --git a/paddle/fluid/operators/compat/matmul_v2.pbtxt b/paddle/fluid/operators/compat/matmul_v2.pbtxt index 5f43e1f8bf0e0c..fa2559939bbd2f 100644 --- a/paddle/fluid/operators/compat/matmul_v2.pbtxt +++ b/paddle/fluid/operators/compat/matmul_v2.pbtxt @@ -39,4 +39,12 @@ extra { name: "op_device" type: STRING } + attrs { + name: "fused_reshape_Out" + type: INTS + } + attrs { + name: "fused_transpose_Out" + type: INTS + } } diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index a400d27b798e37..e6b1f6a1c18c38 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -169,9 +169,21 @@ class ConcatOpGrad : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.GetPlace()); + auto input_data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + +#ifdef PADDLE_WITH_MKLDNN + // extra checking if attr "use_mkldnn" exist is needed because + // test_reverse_op is calling concat_grad kernel without setting + // "use_mkldnn" to any value + if (ctx.HasAttr("use_mkldnn") && + this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } framework::OpKernelType GetKernelTypeForVar( diff --git a/paddle/fluid/operators/concat_op_npu.cc b/paddle/fluid/operators/concat_op_npu.cc index d242c9f8c3fbd5..109007d737c156 100644 --- a/paddle/fluid/operators/concat_op_npu.cc +++ b/paddle/fluid/operators/concat_op_npu.cc @@ -122,8 +122,14 @@ namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL(concat, ops::ConcatNPUKernel, ops::ConcatNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ConcatNPUKernel, +#endif ops::ConcatNPUKernel); REGISTER_OP_NPU_KERNEL(concat_grad, ops::ConcatGradNPUKernel, ops::ConcatGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ConcatGradNPUKernel, +#endif ops::ConcatGradNPUKernel); diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 1a2df2a0c7ba34..d2ad93bbae9217 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -22,3 +22,9 @@ endif() file(APPEND 
${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") + +if(WITH_XPU) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(equal, XPU);\nUSE_OP_DEVICE_KERNEL(not_equal, XPU);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(less_than, XPU);\nUSE_OP_DEVICE_KERNEL(less_equal, XPU);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(greater_than, XPU);\nUSE_OP_DEVICE_KERNEL(greater_equal, XPU);\n") +endif() diff --git a/paddle/fluid/operators/controlflow/compare_op_xpu.cc b/paddle/fluid/operators/controlflow/compare_op_xpu.cc new file mode 100644 index 00000000000000..59e457caa18622 --- /dev/null +++ b/paddle/fluid/operators/controlflow/compare_op_xpu.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +template +void XPUCompare( + const framework::ExecutionContext& ctx, + std::function&, const std::vector&)> + func) { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* z = ctx.Output("Out"); + + auto x_shape = framework::vectorize(x->dims()); + auto y_shape = framework::vectorize(y->dims()); + + auto x_data = reinterpret_cast(x->data()); + auto y_data = reinterpret_cast(y->data()); + auto z_data = z->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = + ctx.template device_context(); + + int ret = func(dev_ctx.x_context(), x_data, y_data, z_data, x_shape, y_shape); + PADDLE_ENFORCE_EQ( + ret, xpu::SUCCESS, + platform::errors::External( + "XPU kernel compare op occur error[%d %s] in XPUCompare.", ret, + XPUAPIErrorMsg[ret])); +} + +template +class EqualXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_equal); + } +}; + +template +class NotEqualXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_not_equal); + } +}; + +template +class LessThanXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_less_than); + } +}; + +template +class LessEqualXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_less_equal); + } +}; + +template +class GreaterThanXPUKernel : 
public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_greater_than); + } +}; + +template +class GreaterEqualXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + XPUCompare(ctx, xpu::broadcast_greater_equal); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL(equal, + ops::EqualXPUKernel, + ops::EqualXPUKernel, + ops::EqualXPUKernel); + +REGISTER_OP_XPU_KERNEL(not_equal, + ops::NotEqualXPUKernel, + ops::NotEqualXPUKernel, + ops::NotEqualXPUKernel); + +REGISTER_OP_XPU_KERNEL(less_than, + ops::LessThanXPUKernel, + ops::LessThanXPUKernel, + ops::LessThanXPUKernel); + +REGISTER_OP_XPU_KERNEL( + less_equal, ops::LessEqualXPUKernel, + ops::LessEqualXPUKernel, + ops::LessEqualXPUKernel); + +REGISTER_OP_XPU_KERNEL( + greater_than, ops::GreaterThanXPUKernel, + ops::GreaterThanXPUKernel, + ops::GreaterThanXPUKernel); + +REGISTER_OP_XPU_KERNEL( + greater_equal, ops::GreaterEqualXPUKernel, + ops::GreaterEqualXPUKernel, + ops::GreaterEqualXPUKernel); + +#endif diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 9597dd25ec530f..bc29c92b094262 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -29,6 +26,39 @@ class OpBase; namespace paddle { namespace operators { + +// FeedVariableVisitor is to feed the variable data +// according to data type (LoDTensor or Strings). 
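FeedVariableVisitor applies the visitor pattern to the feed item: the boost variant dispatches to the overload matching the type it currently holds (LoDTensor or Strings), and each overload copies the data into the output variable in the appropriate way. A minimal standalone sketch of the same dispatch idea, using std::variant/std::visit with stand-in types (FakeTensor, FakeStrings, FeedVisitor are hypothetical names for illustration only, not Paddle APIs):

#include <iostream>
#include <string>
#include <variant>
#include <vector>

// Stand-ins for framework::LoDTensor and framework::Strings (hypothetical
// names chosen only for this sketch).
using FakeTensor = std::vector<float>;
using FakeStrings = std::vector<std::string>;
using FeedItem = std::variant<FakeTensor, FakeStrings>;

// Plays the role of FeedVariableVisitor: one overload per feedable type.
struct FeedVisitor {
  void operator()(const FakeTensor& t) const {
    std::cout << "feed tensor with " << t.size() << " elements\n";
  }
  void operator()(const FakeStrings& s) const {
    std::cout << "feed " << s.size() << " strings\n";
  }
};

int main() {
  FeedItem dense = FakeTensor{1.0f, 2.0f, 3.0f};
  FeedItem vocab = FakeStrings{"hello", "world"};
  std::visit(FeedVisitor{}, dense);  // selects the tensor overload
  std::visit(FeedVisitor{}, vocab);  // selects the strings overload
  return 0;
}

Compared with branching on the tensor type at the call site, this shape means supporting a new feedable type only requires adding another operator() overload.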
+class FeedVariableVisitor : public boost::static_visitor { + public: + explicit FeedVariableVisitor(framework::Variable *out_var, + const platform::Place &place) + : out_var_(out_var), place_(place) {} + + void operator()(const framework::LoDTensor &in_tensor) const { + framework::LoDTensor *out_tensor = + out_var_->GetMutable(); + if (platform::is_same_place(in_tensor.place(), place_)) { + out_tensor->ShareDataWith(in_tensor); + } else { + platform::DeviceContext *context = + platform::DeviceContextPool::Instance().Get(place_); + framework::TensorCopy(in_tensor, place_, *context, out_tensor); + } + out_tensor->set_lod(in_tensor.lod()); + } + + void operator()(const framework::Strings &in_str) const { + framework::Strings *out_str = out_var_->GetMutable(); + out_str->resize(in_str.size()); + *out_str = in_str; + } + + private: + framework::Variable *out_var_; + const platform::Place &place_; +}; + class FeedOp : public framework::OperatorBase { public: FeedOp(const std::string &type, const framework::VariableNameMap &inputs, @@ -79,15 +109,9 @@ class FeedOp : public framework::OperatorBase { col, feed_list.size())); auto &feed_item = feed_list.at(static_cast(col)); - auto *out_item = out_var->GetMutable(); - if (platform::is_same_place(feed_item.place(), place)) { - out_item->ShareDataWith(feed_item); - } else { - auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - framework::TensorCopy(feed_item, place, *dev_ctx, out_item); - } - out_item->set_lod(feed_item.lod()); + FeedVariableVisitor visitor(out_var, place); + boost::apply_visitor(visitor, feed_item); } }; @@ -95,17 +119,17 @@ class FeedOpInfoMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(vector) A feeding list of LoDTensor, which may have " + "(vector) " + "A feeding list of LoDTensor, which may have " "different dimension and data type."); AddOutput("Out", - "(LoDTensor) The LoDTensor which is a copy of the col-th feeding " + "(LoDTensor) The LoDTensor which is a copy " + "of the col-th feeding " "object."); AddAttr("col", "(int) The column index of current feeding object."); AddComment(R"DOC( Feed Operator. - It should not be configured by users directly. - )DOC"); } }; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index d86b6b48422d94..99b16d9b692538 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -109,6 +109,10 @@ class FetchOp : public framework::OperatorBase { auto &src_item = fetch_var->Get(); auto *dst_item = &(BOOST_GET(framework::LoDTensor, fetch_list->at(col))); DataCopy(src_item, fetch_var_name, dst_item); + } else if (fetch_var->IsType()) { + auto &src_item = fetch_var->Get(); + auto *dst_item = &(BOOST_GET(framework::Vocab, fetch_list->at(col))); + *dst_item = src_item; } else { auto &src_item = fetch_var->Get(); framework::LoDTensorArray tmp(src_item.size()); @@ -128,9 +132,11 @@ class FetchOpInfoMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(LoDTensor) The resulted LoDTensor which is expected to return " "to users."); - AddOutput("Out", - "(vector) A fetching list of LoDTensor which may have " - "different dimension, shape and data type."); + AddOutput( + "Out", + "(vector|unordered_map) A fetching list" + " of LoDTensor|unordered_map which may have " + "different dimension, shape and data type."); AddAttr("col", "(int) The column index of fetching object."); AddComment(R"DOC( Fetch Operator. 
diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index 4c0ef02074e2ed..f4183bf570926d 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/cudnn_desc.h" namespace paddle { namespace operators { @@ -480,6 +481,7 @@ struct SearchAlgorithm { static algo_t Find(const ConvArgs& args, bool exhaustive_search, bool deterministic, const framework::ExecutionContext& ctx) { + platform::CUDAGraphCaptureModeGuard guard; auto dtype = platform::CudnnDataType::type; size_t workspace_size_limit = FLAGS_conv_workspace_size_limit * 1024 * 1024; size_t workspace_size = 0; @@ -601,6 +603,7 @@ struct SearchAlgorithm { } static size_t GetWorkspaceSize(const ConvArgs& args, algo_t algo) { + platform::CUDAGraphCaptureModeGuard guard; size_t workspace_size = 0; PADDLE_ENFORCE_CUDA_SUCCESS( platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize( diff --git a/paddle/fluid/operators/conv_op_npu.cc b/paddle/fluid/operators/conv_op_npu.cc index 86724e06975ed4..47de843d1ac6f6 100644 --- a/paddle/fluid/operators/conv_op_npu.cc +++ b/paddle/fluid/operators/conv_op_npu.cc @@ -186,11 +186,6 @@ class DepthwiseConvGradNPUKernel : public framework::OpKernel { dilations[3] = dilation[1]; } - // LOG(INFO) << "strides = " << framework::make_ddim(strides).to_str(); - // LOG(INFO) << "dilations = " << framework::make_ddim(dilations).to_str(); - // LOG(INFO) << "padding = " << framework::make_ddim(padding).to_str(); - // LOG(INFO) << "data_format = " << data_format; - if (filter_grad) { filter_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/cumsum_op_npu.cc b/paddle/fluid/operators/cumsum_op_npu.cc index e8cf1a46db3cca..0c0eb1577e8029 100644 --- a/paddle/fluid/operators/cumsum_op_npu.cc +++ b/paddle/fluid/operators/cumsum_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/operators/cum_op.h" @@ -21,6 +21,38 @@ namespace operators { using Tensor = framework::Tensor; +static void CumsumImp(const Tensor& input, Tensor* output, + const framework::NPUAttributeMap& attr_input, + const framework::ExecutionContext& ctx) { + auto stream = + ctx.template device_context() + .stream(); + if (input.type() == framework::proto::VarType::INT64) { + Tensor tmp_input; + tmp_input.mutable_data(input.dims(), ctx.GetPlace()); + auto dst_acl_dtype = ConvertToNpuDtype(tmp_input.type()); + const auto& cast_runner_1 = + NpuOpRunner("Cast", {input}, {tmp_input}, + {{"dst_type", static_cast(dst_acl_dtype)}}); + cast_runner_1.Run(stream); + + Tensor tmp_output; + tmp_output.mutable_data(output->dims(), ctx.GetPlace()); + const auto& runner = + NpuOpRunner("CumsumD", {tmp_input}, {tmp_output}, attr_input); + runner.Run(stream); + + dst_acl_dtype = ConvertToNpuDtype(output->type()); + const auto& cast_runner_2 = + NpuOpRunner("Cast", {tmp_output}, {*output}, + {{"dst_type", static_cast(dst_acl_dtype)}}); + cast_runner_2.Run(stream); + } else { + const auto& runner = NpuOpRunner("CumsumD", {input}, {*output}, attr_input); + runner.Run(stream); + } +} + template class CumSumNPUKernel : public framework::OpKernel { public: @@ -36,10 +68,6 @@ class CumSumNPUKernel : public framework::OpKernel { framework::NPUAttributeMap attr_input = { {"axis", axis}, {"exclusive", exclusive}, {"reverse", reverse}}; - auto stream = - ctx.template device_context() - .stream(); - bool flatten = ctx.Attr("flatten"); if (flatten) { PADDLE_ENFORCE_EQ( @@ -53,11 +81,9 @@ class CumSumNPUKernel : public framework::OpKernel { new_x.Resize(framework::make_ddim({x->numel()})); - const auto& runner = NpuOpRunner("CumsumD", {new_x}, {*out}, attr_input); - runner.Run(stream); + CumsumImp(new_x, out, attr_input, ctx); } else { - const auto& runner = NpuOpRunner("CumsumD", {*x}, {*out}, attr_input); - runner.Run(stream); + CumsumImp(*x, out, attr_input, ctx); } } }; @@ -69,5 +95,8 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( cumsum, ops::CumSumNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::CumSumNPUKernel, +#endif ops::CumSumNPUKernel, ops::CumSumNPUKernel); diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c04d04f8413882..506ae56a126427 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -15,11 +15,17 @@ function(detection_library TARGET_NAME) PARENT_SCOPE) endfunction() +if (WITH_ASCEND_CL) + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu box_coder_op_npu.cc) + detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu density_prior_box_op_npu.cc) +else() + detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) + detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) +endif() + detection_library(bipartite_match_op SRCS bipartite_match_op.cc) -detection_library(box_coder_op SRCS box_coder_op.cc box_coder_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) -detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op 
SRCS target_assign_op.cc @@ -58,6 +64,8 @@ endif() if(WITH_XPU) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) +elseif(WITH_ASCEND_CL) + detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) else() detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) endif() diff --git a/paddle/fluid/operators/detection/box_coder_op_npu.cc b/paddle/fluid/operators/detection/box_coder_op_npu.cc new file mode 100644 index 00000000000000..9d97c7af9630c9 --- /dev/null +++ b/paddle/fluid/operators/detection/box_coder_op_npu.cc @@ -0,0 +1,373 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/box_coder_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct BoxCoderFunction { + public: + explicit BoxCoderFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + Tensor Adds(const Tensor& x, float scalar) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Adds", {x}, {y}, {{"value", scalar}}); + runner.Run(stream); + return y; + } + Tensor Muls(const Tensor& x, float scalar) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Muls", {x}, {y}, {{"value", scalar}}); + runner.Run(stream); + return y; + } + Tensor Mul(const Tensor& x, const Tensor& y) { + Tensor z; + z.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Mul", {x, y}, {z}, {}); + runner.Run(stream); + return z; + } + Tensor SubWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + z.mutable_data(shape, place); + const auto& runner = NpuOpRunner("Sub", {x, y}, {z}, {}); + runner.Run(stream); + return z; + } + void DivWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("Div", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor DivWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + DivWithBroadCastVoid(x, y, shape, &z); + return z; + } + void MulWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("Mul", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor MulWithBroadCast(const Tensor& x, const Tensor& y, + const framework::DDim& shape) { + Tensor z; + MulWithBroadCastVoid(x, y, shape, &z); + return z; + } + void AddWithBroadCastVoid(const Tensor& x, const Tensor& y, + const framework::DDim& shape, Tensor* z) { + z->mutable_data(shape, place); + const auto& runner = NpuOpRunner("AddV2", {x, y}, {*z}, {}); + runner.Run(stream); + } + Tensor AddWithBroadCast(const Tensor& x, const Tensor& y, 
+ const framework::DDim& shape) { + Tensor z; + AddWithBroadCastVoid(x, y, shape, &z); + return z; + } + Tensor Abs(const Tensor& x) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Abs", {x}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Log(const Tensor& x) { + Tensor t_x_m1 = Adds(x, -1); + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Log1p", {t_x_m1}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Exp(const Tensor& x) { + Tensor y; + y.mutable_data(x.dims(), place); + const auto& runner = NpuOpRunner("Exp", {x}, {y}, {}); + runner.Run(stream); + return y; + } + Tensor Dot(const Tensor& x, const Tensor& y) { + auto dim_x = x.dims(); + auto dim_y = y.dims(); + PADDLE_ENFORCE_EQ( + dim_x.size(), 2, + platform::errors::InvalidArgument( + "x should be a 2-dim tensor, but got %d-dim.", dim_x.size())); + PADDLE_ENFORCE_EQ( + dim_y.size(), 2, + platform::errors::InvalidArgument( + "y should be a 2-dim tensor, but got %d-dim.", dim_y.size())); + PADDLE_ENFORCE_EQ( + dim_x[1], dim_y[0], + platform::errors::InvalidArgument("Expect dim_x[1] == dim_y[0], but " + "got dim_x[1] = %d, dim_y[0] = %d.", + dim_x[1], dim_y[0])); + Tensor z; + z.mutable_data({dim_x[0], dim_y[1]}, place); + const auto& runner = + NpuOpRunner("MatMul", {x, y}, {z}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + runner.Run(stream); + return z; + } + void ConcatVoid(const std::vector& inputs, + const framework::DDim& shape_out, int axis, Tensor* output) { + output->mutable_data(shape_out, place); + std::vector names; + for (size_t i = 0; i < inputs.size(); i++) { + names.push_back("x" + std::to_string(i)); + } + NpuOpRunner runner{ + "ConcatD", + {inputs}, + {*output}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; + runner.AddInputNames(names); + runner.Run(stream); + } + Tensor Concat(const std::vector& inputs, + const framework::DDim& shape_out, int axis) { + Tensor output; + ConcatVoid(inputs, shape_out, axis, &output); + return output; + } + Tensor Slice(const Tensor& x, const std::vector& offsets, + const std::vector& size, const framework::DDim& shape) { + Tensor y; + y.mutable_data(shape, place); + const auto& runner = + NpuOpRunner("SliceD", {x}, {y}, {{"offsets", offsets}, {"size", size}}); + runner.Run(stream); + return y; + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +void Vector2Tensor(const framework::ExecutionContext& ctx, + const std::vector& vec, const framework::DDim& ddim, + Tensor* tsr) { + framework::TensorFromVector(vec, ctx.device_context(), tsr); + ctx.template device_context().Wait(); + tsr->Resize(ddim); +} + +template +void BoxCoderEnc(const framework::ExecutionContext& ctx, const Tensor* tb, + const Tensor* pb, const Tensor* pbv, const bool norm, + const std::vector& variance, Tensor* out) { + auto M = pb->dims()[0]; + auto N = tb->dims()[0]; + auto shape_0 = framework::make_ddim({4, 2}); + Tensor m_diff; + Tensor m_aver; + std::vector vec_diff = {static_cast(-1), static_cast(0), + static_cast(0), static_cast(-1), + static_cast(1), static_cast(0), + static_cast(0), static_cast(1)}; + std::vector vec_aver = {static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5), + static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5)}; + Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); + Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); + + BoxCoderFunction F(ctx); + Tensor pb_xy = F.Adds(F.Dot(*pb, 
m_aver), (norm ? 0 : 0.5)); + Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + Tensor tb_xy = F.Dot(*tb, m_aver); + Tensor tb_wh = F.Adds(F.Dot(*tb, m_diff), (norm ? 0 : 1)); + + pb_xy.Resize({1, M, 2}); + pb_wh.Resize({1, M, 2}); + tb_xy.Resize({N, 1, 2}); + tb_wh.Resize({N, 1, 2}); + + auto shape_half = framework::make_ddim({N, M, 2}); + auto shape_full = framework::make_ddim({N, M, 4}); + + Tensor out_xy_0 = F.DivWithBroadCast( + F.SubWithBroadCast(tb_xy, pb_xy, shape_half), pb_wh, shape_half); + Tensor out_wh_0 = F.Log(F.Abs(F.DivWithBroadCast(tb_wh, pb_wh, shape_half))); + Tensor out_0 = F.Concat({out_xy_0, out_wh_0}, shape_full, 2); + + if (pbv) { + F.DivWithBroadCastVoid(out_0, *pbv, shape_full, out); + } else { + Tensor t_var; + std::vector vec_var(4); + for (auto i = 0; i < 4; i++) { + vec_var[i] = static_cast(variance[i]); + } + Vector2Tensor(ctx, vec_var, framework::make_ddim({1, 1, 4}), &t_var); + F.DivWithBroadCastVoid(out_0, t_var, shape_full, out); + } +} + +template +void BoxCoderDec(const framework::ExecutionContext& ctx, const Tensor* tb, + const Tensor* pb, const Tensor* pbv, const bool norm, + const std::vector& variance, int axis, Tensor* out) { + auto shape_0 = framework::make_ddim({4, 2}); + Tensor m_diff; + Tensor m_aver; + std::vector vec_diff = {static_cast(-1), static_cast(0), + static_cast(0), static_cast(-1), + static_cast(1), static_cast(0), + static_cast(0), static_cast(1)}; + std::vector vec_aver = {static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5), + static_cast(0.5), static_cast(0), + static_cast(0), static_cast(0.5)}; + Vector2Tensor(ctx, vec_diff, shape_0, &m_diff); + Vector2Tensor(ctx, vec_aver, shape_0, &m_aver); + + BoxCoderFunction F(ctx); + Tensor pb_xy = F.Adds(F.Dot(*pb, m_aver), (norm ? 0 : 0.5)); + Tensor pb_wh = F.Adds(F.Dot(*pb, m_diff), (norm ? 0 : 1)); + auto pb_resize_shape = axis == 0 + ? framework::make_ddim({1, pb->dims()[0], 2}) + : framework::make_ddim({pb->dims()[0], 1, 2}); + pb_xy.Resize(pb_resize_shape); + pb_wh.Resize(pb_resize_shape); + + auto tbox_slice_shape = + framework::make_ddim({tb->dims()[0], tb->dims()[1], 2}); + std::vector tbox_slice_size = {static_cast(tb->dims()[0]), + static_cast(tb->dims()[1]), 2}; + Tensor tbox01 = F.Slice(*tb, {0, 0, 0}, tbox_slice_size, tbox_slice_shape); + Tensor tbox23 = F.Slice(*tb, {0, 0, 2}, tbox_slice_size, tbox_slice_shape); + + Tensor tb_xy; + Tensor tb_wh; + if (pbv) { + auto pbvt_slice_shape = framework::make_ddim({pbv->dims()[0], 2}); + auto pbvt_resize_shape = axis == 0 + ? 
framework::make_ddim({1, pbv->dims()[0], 2}) + : framework::make_ddim({pbv->dims()[0], 1, 2}); + std::vector pbvt_slice_size = {static_cast(pbv->dims()[0]), 2}; + Tensor pbv_t01 = F.Slice(*pbv, {0, 0}, pbvt_slice_size, pbvt_slice_shape); + Tensor pbv_t23 = F.Slice(*pbv, {0, 2}, pbvt_slice_size, pbvt_slice_shape); + pbv_t01.Resize(pbvt_resize_shape); + pbv_t23.Resize(pbvt_resize_shape); + + F.AddWithBroadCastVoid( + F.MulWithBroadCast(tbox01, F.Mul(pb_wh, pbv_t01), tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid( + F.Exp(F.MulWithBroadCast(pbv_t23, tbox23, tbox_slice_shape)), pb_wh, + tbox_slice_shape, &tb_wh); + } else if (variance.empty()) { + F.AddWithBroadCastVoid(F.MulWithBroadCast(tbox01, pb_wh, tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid(F.Exp(tbox23), pb_wh, tbox_slice_shape, &tb_wh); + } else { + Tensor t_var01, t_var23; + auto t_var_shape = framework::make_ddim({1, 1, 2}); + std::vector vec_var01 = {static_cast(variance[0]), + static_cast(variance[1])}; + std::vector vec_var23 = {static_cast(variance[2]), + static_cast(variance[3])}; + Vector2Tensor(ctx, vec_var01, t_var_shape, &t_var01); + Vector2Tensor(ctx, vec_var23, t_var_shape, &t_var23); + F.AddWithBroadCastVoid( + F.MulWithBroadCast(tbox01, + F.MulWithBroadCast(pb_wh, t_var01, pb_resize_shape), + tbox_slice_shape), + pb_xy, tbox_slice_shape, &tb_xy); + F.MulWithBroadCastVoid( + F.Exp(F.MulWithBroadCast(t_var23, tbox23, tbox_slice_shape)), pb_wh, + tbox_slice_shape, &tb_wh); + } + Tensor obox01 = + F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, -0.5), tbox_slice_shape); + Tensor obox23 = + F.Adds(F.AddWithBroadCast(tb_xy, F.Muls(tb_wh, 0.5), tbox_slice_shape), + (norm ? 0 : -1)); + F.ConcatVoid({obox01, obox23}, out->dims(), 2, out); +} + +template +class BoxCoderNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* prior_box = ctx.Input("PriorBox"); + auto* prior_box_var = ctx.Input("PriorBoxVar"); + auto* target_box = ctx.Input("TargetBox"); + auto* output_box = ctx.Output("OutputBox"); + std::vector variance = ctx.Attr>("variance"); + const int axis = ctx.Attr("axis"); + + if (prior_box_var) { + PADDLE_ENFORCE_EQ(variance.empty(), true, + platform::errors::InvalidArgument( + "Input 'PriorBoxVar' and attribute 'variance'" + " of BoxCoder operator should not be used at the " + "same time.")); + } + if (!(variance.empty())) { + PADDLE_ENFORCE_EQ(static_cast(variance.size()), 4, + platform::errors::InvalidArgument( + "Size of attribute 'variance' in BoxCoder operator" + " should be 4. 
But received size is %d", + variance.size())); + } + + if (target_box->lod().size()) { + PADDLE_ENFORCE_EQ(target_box->lod().size(), 1, + platform::errors::InvalidArgument( + "Input 'TargetBox' of BoxCoder operator only" + " supports LoD with one level.")); + } + + auto code_type = GetBoxCodeType(ctx.Attr("code_type")); + bool normalized = ctx.Attr("box_normalized"); + + if (code_type == BoxCodeType::kEncodeCenterSize) { + BoxCoderEnc(ctx, target_box, prior_box, prior_box_var, normalized, + variance, output_box); + } else { + BoxCoderDec(ctx, target_box, prior_box, prior_box_var, normalized, + variance, axis, output_box); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(box_coder, ops::BoxCoderNPUKernel, + ops::BoxCoderNPUKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op_npu.cc b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc new file mode 100644 index 00000000000000..cb58640056438b --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op_npu.cc @@ -0,0 +1,379 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using fp16 = paddle::platform::float16; + +template +struct DensityPriorBoxFunction { + public: + explicit DensityPriorBoxFunction(const framework::ExecutionContext& ctx) + : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context().stream(); + t0.mutable_data({1}, place); + t1.mutable_data({1}, place); + tn.mutable_data({1}, place); + FillNpuTensorWithConstant(&t0, static_cast(0)); + FillNpuTensorWithConstant(&t1, static_cast(1)); + } + void Arange(int n, Tensor* x) { + // x should be init first + FillNpuTensorWithConstant(&tn, static_cast(n)); + const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {*x}, {}); + runner.Run(stream); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Cast(const Tensor* x, Tensor* y) { + auto dst_dtype = ConvertToNpuDtype(y->type()); + const auto& runner = NpuOpRunner( + "Cast", {*x}, {*y}, {{"dst_type", static_cast(dst_dtype)}}); + runner.Run(stream); + } + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + void 
Muls(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Muls", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Concat(const std::vector& inputs, int axis, Tensor* output) { + // output should be init first + std::vector names; + for (size_t i = 0; i < inputs.size(); i++) { + names.push_back("x" + std::to_string(i)); + } + NpuOpRunner runner{ + "ConcatD", + {inputs}, + {*output}, + {{"concat_dim", axis}, {"N", static_cast(inputs.size())}}}; + runner.AddInputNames(names); + runner.Run(stream); + } + void Tile(const Tensor* x, Tensor* y, const std::vector& multiples) { + // y should be init first + if (x->dims() == y->dims()) { + framework::TensorCopy( + *x, place, ctx.template device_context(), + y); + return; + } + const auto& runner = + NpuOpRunner("TileD", {*x}, {*y}, {{"multiples", multiples}}); + runner.Run(stream); + } + void FloatVec2Tsr(const std::vector& vec, Tensor* tsr_dst) { + // + framework::TensorFromVector(vec, ctx.device_context(), tsr_dst); + ctx.template device_context().Wait(); + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; + Tensor t0; + Tensor t1; + Tensor tn; +}; + +template <> +void DensityPriorBoxFunction::Arange(int n, Tensor* x) { + Tensor x_fp32(framework::proto::VarType::FP32); + x_fp32.mutable_data(x->dims(), place); + FillNpuTensorWithConstant(&tn, static_cast(n)); + const auto& runner = NpuOpRunner("Range", {t0, tn, t1}, {x_fp32}, {}); + runner.Run(stream); + Cast(&x_fp32, x); +} + +template <> +void DensityPriorBoxFunction::FloatVec2Tsr(const std::vector& vec, + Tensor* tsr_dst) { + Tensor tsr_fp32(framework::proto::VarType::FP32); + tsr_fp32.mutable_data(tsr_dst->dims(), place); + framework::TensorFromVector(vec, ctx.device_context(), &tsr_fp32); + ctx.template device_context().Wait(); + Cast(&tsr_fp32, tsr_dst); +} + +template +class DensityPriorBoxOpNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + float step_w = ctx.Attr("step_w"); + float step_h = ctx.Attr("step_h"); + float offset = ctx.Attr("offset"); + + int image_w = image->dims()[3]; + int image_h = image->dims()[2]; + int layer_w = input->dims()[3]; + int layer_h = input->dims()[2]; + + auto _type = input->type(); + auto place = ctx.GetPlace(); + DensityPriorBoxFunction F(ctx); + + Tensor h(_type); + h.mutable_data({layer_h}, place); + Tensor w(_type); + w.mutable_data({layer_w}, place); + F.Arange(layer_h, &h); + F.Arange(layer_w, &w); + h.Resize({layer_h, 1, 1, 1}); + w.Resize({1, layer_w, 1, 1}); + + step_w = step_w > 0 ? step_w : static_cast(image_w) / layer_w; + step_h = step_h > 0 ? 
step_h : static_cast(image_h) / layer_h; + int step_average = static_cast((step_w + step_h) * 0.5); + + int ratios_size = fixed_ratios.size(); + int num_priors_per_ratio = 0; + for (size_t i = 0; i < densities.size(); ++i) { + num_priors_per_ratio += densities[i] * densities[i]; + } + Tensor di(_type); + Tensor dj(_type); + Tensor shifts(_type); + Tensor box_w_ratio(_type); + Tensor box_h_ratio(_type); + di.mutable_data({ratios_size * num_priors_per_ratio}, place); + dj.mutable_data({ratios_size * num_priors_per_ratio}, place); + shifts.mutable_data({ratios_size * num_priors_per_ratio}, place); + box_w_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); + box_h_ratio.mutable_data({ratios_size * num_priors_per_ratio}, place); + + int64_t start = 0; + std::vector vec_tile = {0, 0, 0}; + for (size_t i = 0; i < densities.size(); ++i) { + // Range = start:start+ratios_size*density_sqr, density = densities[i] + int density_sqr = densities[i] * densities[i]; + // shifts[Range] = [step_average/density]*ratios_size*density_sqr + Tensor shifts_part = + shifts.Slice(start, start + ratios_size * density_sqr); + FillNpuTensorWithConstant(&shifts_part, + static_cast(step_average / densities[i])); + + // di[Range] = [ i // density for i in range(density_sqr) ] * ratios_size + // dj[Range] = [ i % density for i in range(density_sqr) ] * ratios_size + Tensor di_part = di.Slice(start, start + ratios_size * density_sqr); + Tensor dj_part = dj.Slice(start, start + ratios_size * density_sqr); + if (densities[i] > 1) { + di_part.Resize({ratios_size, densities[i], densities[i]}); + dj_part.Resize({ratios_size, densities[i], densities[i]}); + Tensor range_n(_type); + range_n.mutable_data({densities[i]}, place); + F.Arange(densities[i], &range_n); + range_n.Resize({1, densities[i], 1}); + vec_tile[0] = ratios_size; + vec_tile[1] = 1; + vec_tile[2] = densities[i]; + F.Tile(&range_n, &di_part, vec_tile); + range_n.Resize({1, 1, densities[i]}); + vec_tile[1] = densities[i]; + vec_tile[2] = 1; + F.Tile(&range_n, &dj_part, vec_tile); + } else { + FillNpuTensorWithConstant(&di_part, static_cast(0)); + FillNpuTensorWithConstant(&dj_part, static_cast(0)); + } + + int start_box_ratio = start; + for (float ar : fixed_ratios) { + // Range_mini = start_box_ratio:start_box_ratio+density_sqr + // box_h_ratio[Range_mini] = [fixed_sizes[i] * sqrt(ar)] * density_sqr + // box_w_ratio[Range_mini] = [fixed_sizes[i] / sqrt(ar)] * density_sqr + Tensor box_h_ratio_part = + box_h_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); + Tensor box_w_ratio_part = + box_w_ratio.Slice(start_box_ratio, start_box_ratio + density_sqr); + FillNpuTensorWithConstant(&box_w_ratio_part, + static_cast(fixed_sizes[i] * sqrt(ar))); + FillNpuTensorWithConstant(&box_h_ratio_part, + static_cast(fixed_sizes[i] / sqrt(ar))); + start_box_ratio += density_sqr; + } + start = start_box_ratio; + } + di.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + dj.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + shifts.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + box_w_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + box_h_ratio.Resize({1, 1, ratios_size * num_priors_per_ratio, 1}); + + // c_x = (w+offset)*step_w - 0.5*step_average + 0.5*shifts + dj*shifts + // c_y = (h+offset)*step_h - 0.5*step_average + 0.5*shifts + di*shifts + Tensor c_x(_type); + Tensor c_y(_type); + auto dim0 = framework::make_ddim( + {1, layer_w, ratios_size * num_priors_per_ratio, 1}); + auto dim1 = framework::make_ddim( + {layer_h, 1, 
ratios_size * num_priors_per_ratio, 1}); + c_x.mutable_data(dim0, place); + c_y.mutable_data(dim1, place); + F.Adds(&w, offset, &w); + F.Muls(&w, step_w, &w); + F.Adds(&w, static_cast(-step_average) * static_cast(0.5), &w); + F.Adds(&h, offset, &h); + F.Muls(&h, step_h, &h); + F.Adds(&h, static_cast(-step_average) * static_cast(0.5), &h); + F.Mul(&di, &shifts, &di); + F.Mul(&dj, &shifts, &dj); + F.Muls(&shifts, static_cast(0.5), &shifts); + F.Add(&di, &shifts, &di); + F.Add(&dj, &shifts, &dj); + F.Add(&dj, &w, &c_x); + F.Add(&di, &h, &c_y); + + // box_w_ratio = box_w_ratio / 2 + // box_h_ratio = box_h_ratio / 2 + F.Muls(&box_w_ratio, static_cast(0.5), &box_w_ratio); + F.Muls(&box_h_ratio, static_cast(0.5), &box_h_ratio); + + Tensor zero_t(_type); + Tensor one_t(_type); + zero_t.mutable_data({1}, place); + one_t.mutable_data({1}, place); + FillNpuTensorWithConstant(&zero_t, static_cast(0)); + FillNpuTensorWithConstant(&one_t, static_cast(1)); + + Tensor outbox0(_type); + Tensor outbox1(_type); + Tensor outbox2(_type); + Tensor outbox3(_type); + outbox0.mutable_data(dim0, place); + outbox1.mutable_data(dim1, place); + outbox2.mutable_data(dim0, place); + outbox3.mutable_data(dim1, place); + + // outbox0 = max ( (c_x - box_w_ratio)/image_w, 0 ) + // outbox1 = max ( (c_y - box_h_ratio)/image_h, 0 ) + // outbox2 = min ( (c_x + box_w_ratio)/image_w, 1 ) + // outbox3 = min ( (c_y + box_h_ratio)/image_h, 1 ) + F.Sub(&c_x, &box_w_ratio, &outbox0); + F.Sub(&c_y, &box_h_ratio, &outbox1); + F.Add(&c_x, &box_w_ratio, &outbox2); + F.Add(&c_y, &box_h_ratio, &outbox3); + F.Muls(&outbox0, static_cast(1.0 / image_w), &outbox0); + F.Muls(&outbox1, static_cast(1.0 / image_h), &outbox1); + F.Muls(&outbox2, static_cast(1.0 / image_w), &outbox2); + F.Muls(&outbox3, static_cast(1.0 / image_h), &outbox3); + + F.Maximum(&outbox0, &zero_t, &outbox0); + F.Maximum(&outbox1, &zero_t, &outbox1); + F.Minimum(&outbox2, &one_t, &outbox2); + F.Minimum(&outbox3, &one_t, &outbox3); + if (clip) { + // outbox0 = min ( outbox0, 1 ) + // outbox1 = min ( outbox1, 1 ) + // outbox2 = max ( outbox2, 0 ) + // outbox3 = max ( outbox3, 0 ) + F.Minimum(&outbox0, &one_t, &outbox0); + F.Minimum(&outbox1, &one_t, &outbox1); + F.Maximum(&outbox2, &zero_t, &outbox2); + F.Maximum(&outbox3, &zero_t, &outbox3); + } + + auto out_dim = framework::make_ddim( + {layer_h, layer_w, ratios_size * num_priors_per_ratio, 4}); + boxes->mutable_data(place); + vars->mutable_data(place); + Tensor boxes_share(_type); + Tensor vars_share(_type); + boxes_share.ShareDataWith(*boxes); + boxes_share.Resize(out_dim); + vars_share.ShareDataWith(*vars); + vars_share.Resize(out_dim); + + Tensor box0(_type); + Tensor box1(_type); + Tensor box2(_type); + Tensor box3(_type); + // out_dim = {layer_h, layer_w, ratios_size*num_priors_per_ratio, 1} + out_dim[3] = 1; + box0.mutable_data(out_dim, place); + box1.mutable_data(out_dim, place); + box2.mutable_data(out_dim, place); + box3.mutable_data(out_dim, place); + + std::vector vec_exp_out02 = {layer_h, 1, 1, 1}; + std::vector vec_exp_out13 = {1, layer_w, 1, 1}; + F.Tile(&outbox0, &box0, vec_exp_out02); + F.Tile(&outbox1, &box1, vec_exp_out13); + F.Tile(&outbox2, &box2, vec_exp_out02); + F.Tile(&outbox3, &box3, vec_exp_out13); + F.Concat({box0, box1, box2, box3}, 3, &boxes_share); + + std::vector multiples = {layer_h, layer_w, + ratios_size * num_priors_per_ratio, 1}; + Tensor variances_t(_type); + // variances.size() == 4 + variances_t.mutable_data({4}, place); + F.FloatVec2Tsr(variances, &variances_t); + F.Tile(&variances_t, 
&vars_share, multiples); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(density_prior_box, + ops::DensityPriorBoxOpNPUKernel, + ops::DensityPriorBoxOpNPUKernel); diff --git a/paddle/fluid/operators/detection/iou_similarity_op_npu.cc b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc new file mode 100644 index 00000000000000..9a91d4bd8fac13 --- /dev/null +++ b/paddle/fluid/operators/detection/iou_similarity_op_npu.cc @@ -0,0 +1,192 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/iou_similarity_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +struct IouFunction { + public: + explicit IouFunction(const framework::ExecutionContext& ctx) : ctx(ctx) { + place = ctx.GetPlace(); + stream = ctx.template device_context() + .stream(); + } + void Transpose(const Tensor* x, Tensor* y, const std::vector& axis) { + // y should be init first + const auto& runner = + NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}}); + runner.Run(stream); + } + void Add(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Sub(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Mul(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) { + // y should be init first + const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Adds(const Tensor* x, float scalar, Tensor* y) { + // y should be init first + const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}}); + runner.Run(stream); + } + void Maximum(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("Maximum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + void Minimum(const Tensor* x, const Tensor* y, Tensor* z) { + // z should be init first + const auto& runner = NpuOpRunner("Minimum", {*x, *y}, {*z}, {}); + runner.Run(stream); + } + + private: + platform::Place place; + aclrtStream stream; + const framework::ExecutionContext& ctx; +}; + +template +class IouSimilarityNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + bool normalized = ctx.Attr("box_normalized"); + auto* out = ctx.Output("Out"); + + auto _type = x->type(); + auto place = ctx.GetPlace(); + + IouFunction F(ctx); + + auto N = x->dims()[0]; + auto 
M = y->dims()[0]; + + out->mutable_data({N, M}, place); + Tensor xt(_type); + Tensor yt(_type); + xt.mutable_data({4, N}, place); + yt.mutable_data({4, M}, place); + std::vector vec_trans = {1, 0}; + F.Transpose(x, &xt, vec_trans); + F.Transpose(y, &yt, vec_trans); + Tensor xmin1 = xt.Slice(0, 1); + Tensor ymin1 = xt.Slice(1, 2); + Tensor xmax1 = xt.Slice(2, 3); + Tensor ymax1 = xt.Slice(3, 4); + Tensor xmin2 = yt.Slice(0, 1); + Tensor ymin2 = yt.Slice(1, 2); + Tensor xmax2 = yt.Slice(2, 3); + Tensor ymax2 = yt.Slice(3, 4); + xmin1.Resize({N, 1}); + ymin1.Resize({N, 1}); + xmax1.Resize({N, 1}); + ymax1.Resize({N, 1}); + xmin2.Resize({1, M}); + ymin2.Resize({1, M}); + xmax2.Resize({1, M}); + ymax2.Resize({1, M}); + + Tensor w1(_type); + Tensor h1(_type); + Tensor w2(_type); + Tensor h2(_type); + Tensor area1(_type); + Tensor area2(_type); + w1.mutable_data({N, 1}, place); + h1.mutable_data({N, 1}, place); + w2.mutable_data({1, M}, place); + h2.mutable_data({1, M}, place); + area1.mutable_data({N, 1}, place); + area2.mutable_data({1, M}, place); + F.Sub(&xmax1, &xmin1, &w1); + F.Sub(&ymax1, &ymin1, &h1); + F.Sub(&xmax2, &xmin2, &w2); + F.Sub(&ymax2, &ymin2, &h2); + if (!normalized) { + F.Adds(&w1, 1.0f, &w1); + F.Adds(&h1, 1.0f, &h1); + F.Adds(&w2, 1.0f, &w2); + F.Adds(&h2, 1.0f, &h2); + } + F.Mul(&w1, &h1, &area1); + F.Mul(&w2, &h2, &area2); + + Tensor inter_xmax(_type); + Tensor inter_ymax(_type); + Tensor inter_xmin(_type); + Tensor inter_ymin(_type); + inter_xmax.mutable_data({N, M}, place); + inter_ymax.mutable_data({N, M}, place); + inter_xmin.mutable_data({N, M}, place); + inter_ymin.mutable_data({N, M}, place); + F.Minimum(&xmax1, &xmax2, &inter_xmax); + F.Minimum(&ymax1, &ymax2, &inter_ymax); + F.Maximum(&xmin1, &xmin2, &inter_xmin); + F.Maximum(&ymin1, &ymin2, &inter_ymin); + + Tensor inter_w(_type); + Tensor inter_h(_type); + inter_w.mutable_data({N, M}, place); + inter_h.mutable_data({N, M}, place); + F.Sub(&inter_xmax, &inter_xmin, &inter_w); + F.Sub(&inter_ymax, &inter_ymin, &inter_h); + + if (!normalized) { + F.Adds(&inter_w, 1.0f, &inter_w); + F.Adds(&inter_h, 1.0f, &inter_h); + } + Tensor zeros(_type); + zeros.mutable_data({1}, place); + FillNpuTensorWithConstant(&zeros, static_cast(0)); + F.Maximum(&inter_w, &zeros, &inter_w); + F.Maximum(&inter_h, &zeros, &inter_h); + + F.Mul(&inter_w, &inter_h, out); + Tensor union_area(_type); + union_area.mutable_data({N, M}, place); + F.Add(&area1, &area2, &union_area); + F.Sub(&union_area, out, &union_area); + F.DivNoNan(out, &union_area, out); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(iou_similarity, ops::IouSimilarityNPUKernel, + ops::IouSimilarityNPUKernel); diff --git a/paddle/fluid/operators/dropout_impl.cu.h b/paddle/fluid/operators/dropout_impl.cu.h index 4261a5f2534c85..695d29b294a51a 100644 --- a/paddle/fluid/operators/dropout_impl.cu.h +++ b/paddle/fluid/operators/dropout_impl.cu.h @@ -30,6 +30,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/dropout_impl_util.h" #include "paddle/fluid/operators/dropout_op.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/gpu_launch_config.h" @@ -196,28 +197,9 @@ void DropoutFwGPUKernelDriver(const platform::CUDADeviceContext& dev_ctx, config.thread_per_block.x * vec_size) + 1) * vec_size; - int device_id = - BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); - auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); - - if ((seed) && platform::is_gpu_place(seed->place())) { - framework::Tensor seed_cpu_tensor; - TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); - seed_data = static_cast(seed_cpu_tensor.data()[0]); - increment = offset; - } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { - auto seed_offset = gen_cuda->IncrementOffset(offset); - seed_data = seed_offset.first; - increment = seed_offset.second; - } else { - if (seed) { - seed_data = *(seed->data()); - } else { - std::random_device rnd; - seed_data = is_fix_seed ? seed_val : rnd(); - } - increment = offset; - } + + GetSeedDataAndIncrement(dev_ctx, seed, is_fix_seed, seed_val, offset, + &seed_data, &increment); #ifdef __HIPCC__ if (vec_size == 4 && size % 4 == 0) { diff --git a/paddle/fluid/operators/dropout_impl_util.h b/paddle/fluid/operators/dropout_impl_util.h new file mode 100644 index 00000000000000..e11640d070625e --- /dev/null +++ b/paddle/fluid/operators/dropout_impl_util.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { + +inline void GetSeedDataAndIncrement(const platform::CUDADeviceContext& dev_ctx, + const framework::Tensor* seed, + const bool is_fix_seed, const int seed_val, + const int offset, uint64_t* seed_data, + uint64_t* increment) { + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + + if (seed) { + framework::Tensor seed_cpu_tensor; + TensorCopySync(*seed, platform::CPUPlace(), &seed_cpu_tensor); + *seed_data = static_cast(seed_cpu_tensor.data()[0]); + *increment = offset; + } else if (seed && platform::is_cpu_place(seed->place())) { + *seed_data = *(seed->data()); + *increment = offset; + } else if (gen_cuda->GetIsInitPy() && (!is_fix_seed)) { + auto seed_offset = gen_cuda->IncrementOffset(offset); + *seed_data = seed_offset.first; + *increment = seed_offset.second; + } else { + std::random_device rnd; + *seed_data = is_fix_seed ? 
seed_val : rnd(); + *increment = offset; + } +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/dropout_op.cc b/paddle/fluid/operators/dropout_op.cc index 9700b9a2f7a1c2..cbfb795d6a23e1 100644 --- a/paddle/fluid/operators/dropout_op.cc +++ b/paddle/fluid/operators/dropout_op.cc @@ -42,6 +42,19 @@ class DropoutOp : public framework::OperatorWithKernel { return framework::OpKernelType( OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + if (var_name == "Seed") { + VLOG(10) << "var_name:" << var_name + << " does not need to transform in dropout op"; + return expected_kernel_type; + } + + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } }; class DropoutOpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/dropout_op_npu.cc b/paddle/fluid/operators/dropout_op_npu.cc index b5c8bfff0dc39f..50d247d9c05906 100644 --- a/paddle/fluid/operators/dropout_op_npu.cc +++ b/paddle/fluid/operators/dropout_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include #include diff --git a/paddle/fluid/operators/eig_op.cc b/paddle/fluid/operators/eig_op.cc new file mode 100644 index 00000000000000..c1aac4546e36e3 --- /dev/null +++ b/paddle/fluid/operators/eig_op.cc @@ -0,0 +1,168 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
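The `GetSeedDataAndIncrement` helper introduced above boils down to a short decision table: an explicit `Seed` tensor always wins and keeps the caller-supplied `offset` as the increment; otherwise the default CUDA generator supplies both values when it was initialized from Python and the seed is not fixed; otherwise the kernel falls back to the `seed_val` attribute or a `std::random_device` draw. A minimal host-side sketch of that policy follows; the plain parameters stand in for the tensor and generator plumbing, so this is an illustration of the branching, not the Paddle API.

#include <cstdint>
#include <random>
#include <utility>

// Minimal sketch of the seed/increment policy in GetSeedDataAndIncrement.
// The Seed input is reduced to an optional host integer and the generator
// state to the two flags that actually drive the decision (assumptions).
std::pair<uint64_t, uint64_t> SelectDropoutSeed(
    const int* seed_tensor,   // nullptr when no Seed input is fed
    bool generator_ready,     // stands in for gen_cuda->GetIsInitPy()
    bool is_fix_seed, int seed_val, uint64_t offset,
    uint64_t gen_seed, uint64_t gen_offset) {
  if (seed_tensor != nullptr) {
    // An explicit Seed input wins; the increment stays at the caller's offset.
    return {static_cast<uint64_t>(*seed_tensor), offset};
  }
  if (generator_ready && !is_fix_seed) {
    // The global generator hands out both the seed and the increment.
    return {gen_seed, gen_offset};
  }
  // Fall back to the attribute seed, or a fresh random draw if it is not fixed.
  std::random_device rnd;
  return {is_fix_seed ? static_cast<uint64_t>(seed_val) : rnd(), offset};
}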
+ +#include "paddle/fluid/operators/eig_op.h" +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class EigOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eig"); + OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", + "Eig"); + OP_INOUT_CHECK(ctx->HasOutput("Eigenvectors"), "Output", "Eigenvectors", + "Eig"); + + auto x_dims = ctx->GetInputDim("X"); + int rank = x_dims.size(); + PADDLE_ENFORCE_GE(rank, 2, platform::errors::InvalidArgument( + "Expects input tensor x to be not less than " + "2 dimentions, but got dimention %d", + rank)); + PADDLE_ENFORCE_EQ(x_dims[rank - 2], x_dims[rank - 1], + platform::errors::InvalidArgument( + "The input matrix must be a square matrix, " + "but receive a matrix with %d rows and %d colums", + x_dims[rank - 2], x_dims[rank - 1])); + + std::vector batch_dims_vec{}; + for (int i = 0; i < rank - 1; ++i) { + batch_dims_vec.emplace_back(x_dims[i]); + } + + ctx->SetOutputDim("Eigenvectors", x_dims); + ctx->SetOutputDim("Eigenvalues", framework::make_ddim(batch_dims_vec)); + } + + protected: + // The output of eig is always complex-valued even for real-valued inputs + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + if (dtype != framework::proto::VarType::FP32 && + dtype != framework::proto::VarType::FP64 && + dtype != framework::proto::VarType::COMPLEX64 && + dtype != framework::proto::VarType::COMPLEX128) { + PADDLE_THROW(platform::errors::InvalidArgument( + "unsupported data type: %s!", dtype)); + } + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +class EigOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "X", + "(Tensor), A complex-valued or real-valued tensor with shape (*, " + "n, n). The accepted datatype is one of float32, float64, complex64 " + "or complex128"); + AddOutput("Eigenvalues", + "(Tensor), The output eigenvalues tensor with shape (*, n). The " + "datatype is complex64 or complex128"); + AddOutput("Eigenvectors", + "(Tensor), The output eigenvectors tensor with shape (*, n, n). " + "The datatype is complex64 or complex128"); + + AddComment(R"DOC( + Eig Operator. + +This API processes eigen decomposition for general square matrices. 
+ +)DOC"); + } +}; + +class EigGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Eigenvalues"), "Input", "Eigenvalues", + "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput("Eigenvectors"), "Input", "Eigenvectors", + "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvalues")), + "Input", "Eigenvalues@GRAD", "EigGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvectors")), + "Input", "Eigenvectors@GRAD", "EigGrad"); + + auto dims = ctx->GetInputDim("Eigenvectors"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Eigenvectors")), + ctx.device_context()); + } +}; + +template +class EigGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("Eigenvalues", this->Output("Eigenvalues")); + op->SetInput("Eigenvectors", this->Output("Eigenvectors")); + op->SetInput(framework::GradVarName("Eigenvalues"), + this->OutputGrad("Eigenvalues")); + op->SetInput(framework::GradVarName("Eigenvectors"), + this->OutputGrad("Eigenvectors")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +using complex64 = paddle::platform::complex; +using complex128 = paddle::platform::complex; + +namespace ops = paddle::operators; +REGISTER_OPERATOR(eig, ops::EigOp, ops::EigOpMaker, + ops::EigGradOpMaker, + ops::EigGradOpMaker); + +REGISTER_OPERATOR(eig_grad, ops::EigGradOp); + +REGISTER_OP_CPU_KERNEL( + eig, ops::EigKernel, + ops::EigKernel, + ops::EigKernel, + ops::EigKernel); + +REGISTER_OP_CPU_KERNEL( + eig_grad, + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel, + ops::EigGradKernel); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h new file mode 100644 index 00000000000000..b9a3cb300b4c21 --- /dev/null +++ b/paddle/fluid/operators/eig_op.h @@ -0,0 +1,330 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/operators/math/lapack_function.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/matrix_solve.h" +#include "paddle/fluid/operators/svd_helper.h" +#include "paddle/fluid/operators/transpose_op.h" +#include "paddle/fluid/platform/for_range.h" +#define EPSILON 1e-6 + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +inline int BatchCount(const Tensor& matrix) { + int count = 1; + int num_dims = matrix.dims().size(); + for (int i = 0; i < num_dims - 2; ++i) { + count *= matrix.dims()[i]; + } + return count; +} + +inline int MatrixStride(const Tensor& matrix) { + framework::DDim dims_list = matrix.dims(); + int num_dims = dims_list.size(); + return dims_list[num_dims - 1] * dims_list[num_dims - 2]; +} + +// Transpose two axis of a Tensor +template +void TransposeTwoAxis(const Tensor& input, Tensor* transposed_input, + const int axis1, const int axis2, + const framework::ExecutionContext& context) { + std::vector permute(input.dims().size()); + std::iota(permute.begin(), permute.end(), 0); + permute[axis1] = axis2; + permute[axis2] = axis1; + + transposed_input->mutable_data(input.dims(), context.GetPlace()); + auto& dev_ctx = context.template device_context(); + + TransCompute(input.dims().size(), dev_ctx, input, + transposed_input, permute); +} + +// Apply eig to a batch of matrices, values, vectors and (intermidiate +// tensor) info are overritten +template +void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info, + const framework::ExecutionContext& context) { + char jobvl = 'N'; + char jobvr = 'V'; // only right eigenvectors are computed + int num_dims = input->dims().size(); + int order = input->dims()[num_dims - 1]; + + T* input_data = input->data(); + int lda = std::max(1, order); + T* values_data = values->mutable_data(context.GetPlace()); + T* lvector_data = nullptr; + int ldvl = 1; + T* rvector_data = vectors->mutable_data(context.GetPlace()); + int ldvr = lda; + int lwork = -1; + + int batch_count = BatchCount(*input); + int matrix_stride = MatrixStride(*input); + int values_stride = values->dims()[values->dims().size() - 1]; + + Tensor rwork; + math::Real* rwork_data = nullptr; + + rwork.Resize(framework::make_ddim({lda * 2})); + rwork_data = rwork.mutable_data>(context.GetPlace()); + + // call lapackEig once to compute the size of work; + T computed_work_size; + math::lapackEig>( + jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl, + rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info); + + lwork = std::max(1, static_cast(math::Real(computed_work_size))); + Tensor work; + work.Resize(framework::make_ddim({lwork})); + T* work_data = work.mutable_data(context.GetPlace()); + + for (auto i = 0; i < batch_count; ++i) { + T* current_matrix = &input_data[i * matrix_stride]; + T* current_values = &values_data[i * values_stride]; + T* current_rvectors = &rvector_data[i * matrix_stride]; + + math::lapackEig>( + jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data, + ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info); + PADDLE_ENFORCE_EQ( + info, 0, + platform::errors::PreconditionNotMet( + "current info is not 0, computation failed. " + "= 0: successful exit." + "< 0: if INFO = -i, the i-th argument had an illegal value." 
+ "> 0: if INFO = i, the QR algorithm failed to compute all the " + "eigenvalues, and no eigenvectors have been computed; " + "elements i+1:N of WR and WI contain eigenvalues which " + "have converged.")); + } +} + +template +void ApplyEigKernel(const Tensor& input, Tensor* values, Tensor* vectors, + const framework::ExecutionContext& context) { + Tensor input_column_major; + Tensor vectors_row_major; + int num_dims = input.dims().size(); + + // transfer to column-major memory layout i.e. make_ddim from tranposed_input: + // [batch,row,col]->[batch,col,row] + TransposeTwoAxis(input, &input_column_major, num_dims - 1, + num_dims - 2, context); + // make sure 'vectors_row_major' holds memory before passed to LapackEig() + vectors_row_major.Resize(input.dims()); + int info = 0; + LapackEig(&input_column_major, values, &vectors_row_major, info, context); + + // transfer column-major layout back + // vectors_row_major: column-major layout + // vector: original layout + TransposeTwoAxis(vectors_row_major, vectors, num_dims - 1, + num_dims - 2, context); +} + +template +void ConstructComplexVectors(Tensor* c_vectors, const Tensor& c_values, + const Tensor& r_vectors, + const framework::ExecutionContext& ctx, + int batch_count, int order) { + int matrix_stride = MatrixStride(r_vectors); + + auto* c_vectors_data = c_vectors->mutable_data(ctx.GetPlace()); + auto* c_values_data = c_values.data(); + auto* r_v_data = r_vectors.data(); + + for (int b = 0; b < batch_count; b++) { + auto* vecs = &r_v_data[b * matrix_stride]; + auto* res = &c_vectors_data[b * matrix_stride]; + auto* vals = &c_values_data[b * order]; + + for (int j = 0; j < order; j++) { + if (vals[j].imag < EPSILON) { + for (int i = 0; i < order; i++) { + res[j * order + i] = platform::complex(vecs[j * order + i], 0); + } + } else { + for (int i = 0; i < order; i++) { + res[j * order + i] = platform::complex(vecs[j * order + i], + vecs[(j + 1) * order + i]); + res[(j + 1) * order + i] = platform::complex( + vecs[j * order + i], -vecs[(j + 1) * order + i]); + } + j++; + } + } + } +} + +template +class EigKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_values = context.Output("Eigenvalues"); + auto* out_vectors = context.Output("Eigenvectors"); + + if (!framework::IsComplexType(x->type())) { + out_values->mutable_data(context.GetPlace()); + out_vectors->mutable_data(context.GetPlace()); + + int batch_count = BatchCount(*x); + int order = x->dims()[x->dims().size() - 1]; + + Tensor real_values; + Tensor real_vectors; + // double the size of real_values, the first half stores the real part, + // the next half stores the imag part + std::vector origin_dim = + framework::vectorize(out_values->dims()); + int last_item = origin_dim.back(); + origin_dim.pop_back(); + origin_dim.push_back(last_item * 2); + framework::DDim big_dim = framework::make_ddim(origin_dim); + + real_values.mutable_data>(big_dim, context.GetPlace()); + real_vectors.mutable_data>(x->dims(), context.GetPlace()); + + ApplyEigKernel>(*x, &real_values, + &real_vectors, context); + auto dito = + math::DeviceIndependenceTensorOperations, + Tout>(context); + + // 1. extract real part & imag part from real_values + Tensor real_part = dito.Slice(real_values, {-1}, {0}, {order}); + Tensor imag_part = dito.Slice(real_values, {-1}, {order}, {order * 2}); + + // 2. 
construct complex values + auto* real_part_data = real_part.data>(); + auto* imag_part_data = imag_part.data>(); + int out_values_numel = out_values->numel(); + platform::ForRange for_range( + context.template device_context(), out_values_numel); + math::RealImagToComplexFunctor functor( + real_part_data, imag_part_data, + out_values->mutable_data(context.GetPlace()), out_values_numel); + for_range(functor); + + // 3. construct complex vectors + Tensor real_vector_trans = dito.Transpose(real_vectors); + Tensor out_vectors_trans; + out_vectors_trans.mutable_data(x->dims(), context.GetPlace()); + ConstructComplexVectors, Tout>( + &out_vectors_trans, *out_values, real_vector_trans, context, + batch_count, order); + TransposeTwoAxis(out_vectors_trans, out_vectors, + x->dims().size() - 1, + x->dims().size() - 2, context); + } else { + out_values->mutable_data(context.GetPlace()); + out_vectors->mutable_data(context.GetPlace()); + + ApplyEigKernel(*x, out_values, out_vectors, context); + } + } +}; + +template +void ComputeBackwardForComplexInput( + const Tensor& V, const Tensor& L, const Tensor& gL, const Tensor& gV, + Tout* x_grad_data, int batch_count, int order, + const framework::ExecutionContext& context) { + auto dito = + math::DeviceIndependenceTensorOperations( + context); + + Tensor trans_v = dito.Transpose(V); + Tensor Vh = dito.Conj(trans_v); + Tensor Lconj = dito.Conj(L); + Tensor Econj = dito.Sub(dito.Unsqueeze(Lconj, -2), dito.Unsqueeze(Lconj, -1)); + Tensor VhgV = dito.Matmul(Vh, gV); + Tensor diag_real = dito.Real(VhgV); + Tensor diag_res = dito.BatchDiag(diag_real, batch_count); + Tensor diag_unsqueezed = dito.Unsqueeze(diag_res, -2); + + // turn diag_unsqueezed into complex + auto numel = diag_unsqueezed.numel(); + Tensor diag_unsqueezed_complex; + auto* data_diag_un = diag_unsqueezed.data>(); + auto* data_diag_un_com = diag_unsqueezed_complex.mutable_data( + diag_unsqueezed.dims(), context.GetPlace(), + static_cast(numel * sizeof(Tout))); + auto& dev_ctx = context.template device_context(); + platform::ForRange for_range(dev_ctx, numel); + math::RealToComplexFunctor functor(data_diag_un, data_diag_un_com, + numel); + for_range(functor); + // real tensor multiply complex tensor in broadcast manner + Tensor res1 = dito.RealMulComplex(V, diag_unsqueezed_complex); + Tensor res2 = dito.Matmul(Vh, res1); + Tensor result = dito.Sub(VhgV, res2); + + result.mutable_data(V.dims(), context.GetPlace()); + result = dito.Div(result, Econj); + result = dito.DiagFill(order, order, order, 0, gL, result); + Tensor rhs = dito.Matmul(result, Vh); + + // solve linear system + // solve(Vh, rhs, out, m, k) + // Vh: matrix with shape [m,m] + // rhs: rhs with shape [m,k] + // x_grad: out + int m = Vh.dims()[Vh.dims().size() - 1]; + int k = rhs.dims()[rhs.dims().size() - 1]; + auto* matrix_data = Vh.data(); + auto* rhs_data = rhs.data(); + math::SolveLinearSystem(matrix_data, rhs_data, x_grad_data, m, k, + batch_count); +} + +template +class EigGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto& L = *context.Input("Eigenvalues"); + auto& V = *context.Input("Eigenvectors"); + auto& gL = *context.Input(framework::GradVarName("Eigenvalues")); + auto& gV = *context.Input(framework::GradVarName("Eigenvectors")); + + auto& x_grad = *context.Output(framework::GradVarName("X")); + auto* x_grad_data = x_grad.mutable_data(context.GetPlace()); + + auto& dims = V.dims(); + framework::DDim dim_origin = dims; + int num_dims = 
dim_origin.size(); + int batch_count = BatchCount(V); + const int order = dim_origin[num_dims - 1]; + + ComputeBackwardForComplexInput( + V, L, gL, gV, x_grad_data, batch_count, order, context); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/eigvalsh_op.cc b/paddle/fluid/operators/eigvalsh_op.cc new file mode 100644 index 00000000000000..fd5893df0c449d --- /dev/null +++ b/paddle/fluid/operators/eigvalsh_op.cc @@ -0,0 +1,163 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/eigvalsh_op.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class EigvalshOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Eigvalsh"); + OP_INOUT_CHECK(ctx->HasOutput("Eigenvalues"), "Output", "Eigenvalues", + "Eigvalsh"); + + auto input_dim = ctx->GetInputDim("X"); + auto rank = input_dim.size(); + + PADDLE_ENFORCE_GE(rank, 2, + platform::errors::InvalidArgument( + "The Input(X) should have at least 2 dimensions." + "But received a %d dimension tensor.", + rank)); + PADDLE_ENFORCE_EQ( + input_dim[rank - 2], input_dim[rank - 1], + platform::errors::InvalidArgument( + "Eigvalsh op is designed for square matrix, consequently" + "inner-most 2 dimensions of Input(X) should be symmetric." + "But received X's shape[-2] = %d and shape[-1] = %d.", + input_dim[rank - 2], input_dim[rank - 1])); + + std::vector values_dim; + + for (auto i = 0; i < rank - 1; i++) { + values_dim.emplace_back(input_dim[i]); + } + + ctx->SetOutputDim("Eigenvalues", framework::make_ddim(values_dim)); + + if (ctx->HasOutput("Eigenvectors")) { + ctx->SetOutputDim("Eigenvectors", input_dim); + } + } +}; + +class EigvalshOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), Hermitian or real symmetric matrices." + "Its shape should be [*, N, N] where * is zero or" + "more batch dimensions. The data type is float32 ," + "float64, complex64, complex128."); + AddOutput("Eigenvalues", + "(Tensor), The eigenvalues in ascending order." + "The data type is float32 or float64."); + AddOutput( + "Eigenvectors", + "(Tensor), The column is the normalized eigenvector " + "corresponding to the eigenvalue. The data type is the same as ``X``." + "Eigenvectors are required to calculate gradient when backward."); + AddAttr( + "UPLO", + "(string, default 'L'), 'L' represents the lower triangular matrix," + "'U' represents the upper triangular matrix.") + .SetDefault("L"); + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training.") + .SetDefault(false); + AddComment(R"DOC( +Eigvalsh Operator. + +Computes the eigenvalues of a complex Hermitian + (conjugate symmetric) or a real symmetric matrix. 
+ +)DOC"); + } +}; + +class EigvalshGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Eigenvectors"), "Input", "Eigenvectors", + "EigvalshGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Eigenvalues")), + "Input", "Eigenvalues@GRAD", "EigvalshGrad"); + auto dims = ctx->GetInputDim("Eigenvectors"); + auto x_grad_name = framework::GradVarName("X"); + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Eigenvectors"), + ctx.device_context()); + } +}; + +template +class EigvalshGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType(this->ForwardOpType() + "_grad"); + op->SetInput("Eigenvectors", this->Output("Eigenvectors")); + op->SetInput(framework::GradVarName("Eigenvalues"), + this->OutputGrad("Eigenvalues")); + op->SetAttrMap(this->Attrs()); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(eigvalsh, ops::EigvalshOp, ops::EigvalshOpMaker, + ops::EigvalshGradOpMaker, + ops::EigvalshGradOpMaker); +REGISTER_OPERATOR(eigvalsh_grad, ops::EigvalshGradOp); + +REGISTER_OP_CPU_KERNEL( + eigvalsh, + ops::EigvalshKernel, + ops::EigvalshKernel, + ops::EigvalshKernel>, + ops::EigvalshKernel>); + +REGISTER_OP_CPU_KERNEL( + eigvalsh_grad, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel>, + ops::EigvalshGradKernel>); diff --git a/paddle/fluid/operators/eigvalsh_op.cu b/paddle/fluid/operators/eigvalsh_op.cu new file mode 100644 index 00000000000000..a6233078570942 --- /dev/null +++ b/paddle/fluid/operators/eigvalsh_op.cu @@ -0,0 +1,36 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/eigvalsh_op.h" + +namespace ops = paddle::operators; + +REGISTER_OP_CUDA_KERNEL( + eigvalsh, + ops::EigvalshKernel, + ops::EigvalshKernel, + ops::EigvalshKernel>, + ops::EigvalshKernel>); + +REGISTER_OP_CUDA_KERNEL( + eigvalsh_grad, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel, + ops::EigvalshGradKernel>, + ops::EigvalshGradKernel>); diff --git a/paddle/fluid/operators/eigvalsh_op.h b/paddle/fluid/operators/eigvalsh_op.h new file mode 100644 index 00000000000000..6c40ce107a317f --- /dev/null +++ b/paddle/fluid/operators/eigvalsh_op.h @@ -0,0 +1,79 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/eigen_values_vectors.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +using EigenVector = framework::EigenVector; + +template +class EigvalshKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto input = ctx.Input("X"); + auto output_w = ctx.Output("Eigenvalues"); + + std::string lower = ctx.Attr("UPLO"); + bool is_lower = (lower == "L"); + bool is_test = ctx.Attr("is_test"); + math::MatrixEighFunctor functor; + if (is_test) { + functor(ctx, *input, output_w, nullptr, is_lower, false); + } else { + auto output_v = ctx.Output("Eigenvectors"); + functor(ctx, *input, output_w, output_v, is_lower, true); + } + } +}; + +template +class EigvalshGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& x_grad = *ctx.Output(framework::GradVarName("X")); + auto& output_v = *ctx.Input("Eigenvectors"); + auto& output_w_grad = + *ctx.Input(framework::GradVarName("Eigenvalues")); + + auto dito = + math::DeviceIndependenceTensorOperations( + ctx); + auto tV = dito.Transpose(dito.Conj(output_v)); + + // compute elementwise multiply of output_v and output_w_grad + x_grad.mutable_data(output_v.dims(), ctx.GetPlace()); + auto output_v_vector = EigenVector::Flatten(output_v); + auto output_w_grad_vector = EigenVector::Flatten(output_w_grad); + auto result_vector = EigenVector::Flatten(x_grad); + auto& place = *ctx.template device_context().eigen_device(); + std::vector broadcast_factor; + broadcast_factor.push_back(output_v.dims().at(output_v.dims().size() - 1)); + result_vector.device(place) = + output_v_vector * output_w_grad_vector.broadcast(broadcast_factor); + + x_grad = dito.Matmul(x_grad, tV); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc index 67e2e3a1e96772..d66d6b66a05824 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc @@ -110,6 +110,25 @@ class ElementwiseAddDoubleGradMaker : public framework::SingleGradOpMaker { } }; +template +class ElementwiseAddTripleGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("elementwise_add_triple_grad"); + op->SetInput("DDX", this->Input("DDX")); + op->SetInput("DDY", this->Input("DDY")); + op->SetInput("D_DDOut", this->OutputGrad("DDOut")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput("D_DDX", this->InputGrad("DDX")); + op->SetOutput("D_DDY", this->InputGrad("DDY")); + } +}; + } // namespace operators } // namespace paddle @@ -123,10 +142,16 @@ 
REGISTER_OPERATOR( ops::ElementwiseAddDoubleGradMaker, ops::ElementwiseAddDoubleGradMaker); -REGISTER_OPERATOR(elementwise_add_grad_grad, - ops::ElementwiseOpDoubleGradWithoutDXDY, - ops::ElementwiseDoubleGradOpInplaceInferer, - ops::ElementwiseDoubleGradNoBufVarsInferer); +REGISTER_OPERATOR( + elementwise_add_grad_grad, ops::ElementwiseOpDoubleGradWithoutDXDY, + ops::ElementwiseDoubleGradOpInplaceInferer, + ops::ElementwiseDoubleGradNoBufVarsInferer, + ops::ElementwiseAddTripleGradMaker, + ops::ElementwiseAddTripleGradMaker); + +REGISTER_OPERATOR(elementwise_add_triple_grad, ops::ElementwiseOpTripleGrad, + ops::ElementwiseTripleGradOpInplaceInferer, + ops::ElementwiseTripleGradNoBufVarsInferer); REGISTER_OP_CPU_KERNEL( elementwise_add, @@ -162,6 +187,20 @@ REGISTER_OP_CPU_KERNEL( paddle::platform::complex>, ops::ElementwiseAddDoubleGradKernel>); +REGISTER_OP_CPU_KERNEL( + elementwise_add_triple_grad, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel>, + ops::ElementwiseAddTripleGradKernel>); // A specialization elementwise_add operator, used in gradient accumulation with // inplace addto. diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cu b/paddle/fluid/operators/elementwise/elementwise_add_op.cu index 331867617bd78a..0b78aa4a01a741 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.cu +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cu @@ -196,6 +196,17 @@ REGISTER_OP_CUDA_KERNEL( plat::complex>, ops::ElementwiseAddDoubleGradKernel>); +REGISTER_OP_CUDA_KERNEL( + elementwise_add_triple_grad, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel, + ops::ElementwiseAddTripleGradKernel>, + ops::ElementwiseAddTripleGradKernel>); REGISTER_OP_CUDA_KERNEL( grad_add, ops::ElementwiseAddKernel, diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index 6c61ce61eecd57..0ce4ca665dd9d1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -205,5 +205,44 @@ class ElementwiseAddDoubleGradKernel : public framework::OpKernel { } }; +template +class ElementwiseAddTripleGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using Tensor = framework::Tensor; + auto *ddx = ctx.Input("DDX"); + auto *ddy = ctx.Input("DDY"); + auto *d_ddout = ctx.Input("D_DDOut"); + auto *d_ddx = ctx.Output("D_DDX"); + auto *d_ddy = ctx.Output("D_DDY"); + // skip out + auto *out = d_ddout; + + // Special case when d_ddy is not needed and d_ddx doesn't reduce + if (d_ddx != nullptr && d_ddy == nullptr && + d_ddx->dims() == d_ddout->dims()) { + VLOG(4) << "Special case when d_ddy is not needed and d_ddx doesn't " + "reduce"; + framework::TensorCopy( + *d_ddout, ctx.GetPlace(), + ctx.template device_context(), d_ddx); + } else if (d_ddx == nullptr && d_ddy != nullptr && + d_ddy->dims() == d_ddout->dims()) { + VLOG(4) << "Special case when d_ddx is not needed and d_ddy doesn't " + "reduce"; + framework::TensorCopy( + *d_ddout, ctx.GetPlace(), + ctx.template device_context(), d_ddy); + } else if (d_ddx != nullptr && d_ddy != nullptr && + (d_ddx->dims() == d_ddy->dims())) { + 
elementwise_add_grad(ctx, ddx, ddy, out, d_ddout, d_ddx, + d_ddy); + } else { + default_elementwise_add_grad(ctx, ddx, ddy, out, + d_ddout, d_ddx, d_ddy); + } + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index cd1d50a017c363..41d5d718c24209 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -146,6 +146,9 @@ namespace ops = paddle::operators; namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL(elementwise_add, ops::ElementwiseAddNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseAddNPUKernel, +#endif ops::ElementwiseAddNPUKernel); REGISTER_OP_NPU_KERNEL(elementwise_add_grad, diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc index 47aa7e2521f76a..b2030ad21e8d1f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op_npu.cc @@ -12,67 +12,127 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_ASCEND_CL -#include -#include - #include "paddle/fluid/operators/elementwise/elementwise_mul_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; - -template +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const int axis, + const framework::DDim& ddims, + const framework::DDim& brd_ddims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t brd_size = brd_ddims.size(); + int64_t org_size = ddims.size(); + // int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < brd_size; ++i) { + if (i < axis || i >= org_size + axis) { + axes.push_back(i); + continue; + } + if (brd_ddims[i] > ddims[i - axis]) { + axes.push_back(i); + } + } + // LOG(INFO) << "axes = " << framework::make_ddim(axes).to_str(); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class ElementwiseMulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + int axis = ctx.Attr("axis"); + + bool direct_compute = false; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = (axis == -1 ? 
std::abs(x_dims.size() - y_dims.size()) : axis); + if (x_dims.size() >= y_dims.size()) { + direct_compute = x_dims.size() == (y_dims.size() + axis); + } else { + direct_compute = y_dims.size() == (x_dims.size() + axis); + } - auto place = ctx.GetPlace(); - - out->mutable_data(place); - - auto stream = - ctx.template device_context() - .stream(); + auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); - runner.Run(stream); + if (direct_compute) { + const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*out}, {}); + runner.Run(stream); + } else { + Tensor trans_x, trans_y; + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); + const auto& runner = NpuOpRunner("Mul", {trans_x, trans_y}, {*out}, {}); + runner.Run(stream); + } } }; -template +template class ElementwiseMulGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); - auto place = ctx.GetPlace(); + axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); + auto stream = ctx.template device_context().stream(); - auto stream = - ctx.template device_context() - .stream(); + Tensor trans_x, trans_y; + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &trans_x, &trans_y); if (dx) { - dx->mutable_data(place); - const auto& runner_dx = NpuOpRunner("Mul", {*dout, *y}, {*dx}, {}); - runner_dx.Run(stream); + if (dx->dims() == dout->dims()) { + dx->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {*dout, trans_y}, {*dx}, {}); + runner_dx.Run(stream); + } else { + Tensor dx_temp(x->type()); + dx_temp.Resize(trans_x.dims()); + dx_temp.mutable_data(ctx.GetPlace()); + const auto& runner_dx = + NpuOpRunner("Mul", {*dout, trans_y}, {dx_temp}, {}); + runner_dx.Run(stream); + ReduceDims(ctx, stream, axis, dx->dims(), trans_x.dims(), dx_temp, + dx); + } } - if (dy) { - dy->mutable_data(place); - const auto& runner_dy = NpuOpRunner("Mul", {*x, *dout}, {*dy}, {}); - runner_dy.Run(stream); + if (dy->dims() == dout->dims()) { + dy->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {trans_x, *dout}, {*dy}, {}); + runner_dy.Run(stream); + } else { + Tensor dy_temp(y->type()); + dy_temp.Resize(trans_y.dims()); + dy_temp.mutable_data(ctx.GetPlace()); + const auto& runner_dy = + NpuOpRunner("Mul", {trans_x, *dout}, {dy_temp}, {}); + runner_dy.Run(stream); + ReduceDims(ctx, stream, axis, dy->dims(), trans_y.dims(), dy_temp, + dy); + } } } }; @@ -82,15 +142,9 @@ class ElementwiseMulGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - elementwise_mul, - ops::ElementwiseMulNPUKernel, - ops::ElementwiseMulNPUKernel); +REGISTER_OP_NPU_KERNEL(elementwise_mul, ops::ElementwiseMulNPUKernel, + ops::ElementwiseMulNPUKernel); REGISTER_OP_NPU_KERNEL( - elementwise_mul_grad, - ops::ElementwiseMulGradNPUKernel, - ops::ElementwiseMulGradNPUKernel); -#endif + elementwise_mul_grad, ops::ElementwiseMulGradNPUKernel, + ops::ElementwiseMulGradNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 3614602156f4d9..13e4624ef717fc 100644 --- 
a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -426,6 +426,51 @@ class ElementwiseOpDoubleGradWithoutDXDY } }; +class ElementwiseOpTripleGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + using Tensor = framework::Tensor; + + void InferShape(framework::InferShapeContext *ctx) const override { + if (ctx->HasOutput("D_DDX")) { + ctx->ShareDim("DDX", "D_DDX"); + ctx->ShareLoD("DDX", "D_DDX"); + } + if (ctx->HasOutput("D_DDY")) { + ctx->ShareDim("DDY", "D_DDY"); + ctx->ShareLoD("DDY", "D_DDY"); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + framework::proto::VarType::Type input_data_type; + input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "D_DDOut"); + +#ifdef PADDLE_WITH_MKLDNN + if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + return framework::OpKernelType(input_data_type, ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); + } +#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, const framework::Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const { + if (framework::IsComplexType(expected_kernel_type.data_type_)) { + // only promote inputs’s types when contains complex input + return framework::OpKernelType(tensor.type(), tensor.place(), + tensor.layout()); + } else { + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); + } + } +}; + template class ElemwiseGradKernel : public framework::OpKernel { public: @@ -447,9 +492,14 @@ DECLARE_INPLACE_OP_INFERER(ElementwiseGradOpInplaceInferer, DECLARE_INPLACE_OP_INFERER(ElementwiseDoubleGradOpInplaceInferer, {"DDX", "DDOut"}); +DECLARE_INPLACE_OP_INFERER(ElementwiseTripleGradOpInplaceInferer, + {"D_DDOut", "D_DDX"}); + DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseGradNoBufVarsInferer, "X", "Y"); DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseDoubleGradNoBufVarsInferer, "Y", "DOut"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(ElementwiseTripleGradNoBufVarsInferer, + "DDX", "DDY"); } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 53ac85802a6f43..549a6be0b4507e 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -171,7 +171,7 @@ __device__ __forceinline__ void LoadData( // num: how many data will be deal with in this time if (need_broadcast) { kps::ReadDataBc(dst, src, block_offset, - config, numel, 1, 1); + config, numel); } else { kps::ReadData(dst, src + block_offset, num); } diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 312978a010b30c..2df7dd06f2cc89 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -240,7 +240,7 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, x_dims, y_dims, x_dims_array[i], y_dims_array[i], i)); if ((x_dims_array[i] > 1 || y_dims_array[i] > 1) || (x_dims_array[i] == 1 && y_dims_array[i] == 1)) { - out_dims_array[i] = std::max(x_dims_array[i], 
y_dims_array[i]); + out_dims_array[i] = (std::max)(x_dims_array[i], y_dims_array[i]); } else { out_dims_array[i] = -1; } @@ -1779,7 +1779,7 @@ void CommonElementwiseBroadcastForward( const framework::Tensor *y, framework::Tensor *z, const framework::DDim &x_dims, const framework::DDim &y_dims, Functor func, int axis, const bool is_xsize_larger = true) { - int max_dim = std::max(x_dims.size(), y_dims.size()); + int max_dim = (std::max)(x_dims.size(), y_dims.size()); axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); PADDLE_ENFORCE_GE( axis, 0, diff --git a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc index 94e78defbbee5d..4cc4228b164298 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op_npu.cc @@ -166,9 +166,17 @@ class ElementwiseSubGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, +REGISTER_OP_NPU_KERNEL(elementwise_sub, ops::ElementwiseSubNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseSubNPUKernel, +#endif + ops::ElementwiseSubNPUKernel, ops::ElementwiseSubNPUKernel); REGISTER_OP_NPU_KERNEL(elementwise_sub_grad, + ops::ElementwiseSubGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::ElementwiseSubGradNPUKernel, +#endif ops::ElementwiseSubGradNPUKernel, ops::ElementwiseSubGradNPUKernel); diff --git a/paddle/fluid/operators/expand_v2_op_npu.cc b/paddle/fluid/operators/expand_v2_op_npu.cc index 85fe86a9e606f3..4b0e0770573a6f 100644 --- a/paddle/fluid/operators/expand_v2_op_npu.cc +++ b/paddle/fluid/operators/expand_v2_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/expand_v2_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/fake_quantize_op.cu b/paddle/fluid/operators/fake_quantize_op.cu index 583ff157a0d398..8f2235c7e3d21f 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu +++ b/paddle/fluid/operators/fake_quantize_op.cu @@ -216,14 +216,14 @@ __global__ void ClipAndQuantDequantKernel(const T* in, const T* scale, int tid = threadIdx.x; T s = scale[0]; + T inv_s = inverse(s); T bin_cnt_t = static_cast(bin_cnt); for (int i = bid; i < n; i += blockDim.x * gridDim.x) { T x = in[i]; x = x > s ? s : x; x = x < -s ? -s : x; - x = (bin_cnt_t / s) * x; - + x = bin_cnt_t * inv_s * x; x = static_cast(round(static_cast(x))); out[i] = (x * s) / bin_cnt_t; } diff --git a/paddle/fluid/operators/fake_quantize_op.h b/paddle/fluid/operators/fake_quantize_op.h index 11a2d2de8bcf73..21e7079ff62334 100644 --- a/paddle/fluid/operators/fake_quantize_op.h +++ b/paddle/fluid/operators/fake_quantize_op.h @@ -28,8 +28,9 @@ namespace operators { template inline HOSTDEVICE T inverse(T s) { - T eps = 1e-6; - return s <= 1e-30 ? 1.0 / (s + eps) : 1.0 / s; + T eps = static_cast(1e-6); + T one = static_cast(1.0); + return s <= static_cast(1e-30) ? 
one / (s + eps) : one / s; } template diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc index d5204f5cacfc68..566b265bfdba9b 100644 --- a/paddle/fluid/operators/fill_any_like_op_npu.cc +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -63,9 +63,12 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { .stream(); auto shape = out->dims(); - const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out}, - {{"dims", framework::vectorize(shape)}}); - runner.Run(stream); + NpuOpRunner runner; + runner.SetType("Fill") + .AddInput(framework::vectorize(shape)) + .AddInput(tensor_tmp) + .AddOutput(*out) + .Run(stream); } }; @@ -75,5 +78,8 @@ class FillAnyLikeNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_NPU_KERNEL(fill_any_like, ops::FillAnyLikeNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::FillAnyLikeNPUKernel, +#endif ops::FillAnyLikeNPUKernel, ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op_xpu.cc b/paddle/fluid/operators/fill_any_like_op_xpu.cc new file mode 100644 index 00000000000000..76cf339fbf5cca --- /dev/null +++ b/paddle/fluid/operators/fill_any_like_op_xpu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/fill_any_like_op.h" + +namespace paddle { +namespace operators { + +template +class FillAnyLikeXPUKernel : public framework::OpKernel { + public: + using CommonType = typename std::common_type< + float, + typename std::conditional::value, + float, T>::type>::type; + using XPUInTDType = typename XPUTypeTrait::Type; + + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + float value = context.Attr("value"); + + auto common_type_value = static_cast(value); + + PADDLE_ENFORCE_EQ( + (common_type_value >= + static_cast(std::numeric_limits::lowest())) && + (common_type_value <= + static_cast(std::numeric_limits::max())), + true, + platform::errors::InvalidArgument( + "The filled value is out of range for target type, " + "current kernel type is %s, the range should between %f " + "and %f, but now value is %f.", + typeid(T).name(), + static_cast(std::numeric_limits::lowest()), + static_cast(std::numeric_limits::max()), value)); + + PADDLE_ENFORCE_EQ( + std::isnan(value), false, + platform::errors::InvalidArgument("The filled value is NaN.")); + + auto& dev_ctx = + context.template device_context(); + auto out_data = reinterpret_cast(out->data()); + int ret = xpu::constant(dev_ctx.x_context(), out_data, out->numel(), + static_cast(value)); + PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, + platform::errors::External( + "XPU CONSTANT API return wrong value[%d %s].", ret, + XPUAPIErrorMsg[ret])); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL(fill_any_like, ops::FillAnyLikeXPUKernel, + ops::FillAnyLikeXPUKernel, + ops::FillAnyLikeXPUKernel, + ops::FillAnyLikeXPUKernel); + +#endif diff --git a/paddle/fluid/operators/fill_constant_op_npu.cc b/paddle/fluid/operators/fill_constant_op_npu.cc index ae0148a9bf5132..16a2433f5cad6f 100644 --- a/paddle/fluid/operators/fill_constant_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_op_npu.cc @@ -66,11 +66,21 @@ class FillConstantNPUKernel : public framework::OpKernel { out_var->mutable_data(shape, ctx.GetPlace()); NpuOpRunner runner; +#if (CANN_VERSION_CODE >= 503003) + runner.SetType("FillD") + .AddInput(tensor_value) + .AddOutput(*out_var) + .AddAttrs( + {{ "dims", + framework::vectorize(shape) }}) + .Run(stream); +#else runner.SetType("Fill") .AddInput(framework::vectorize(shape)) .AddInput(tensor_value) .AddOutput(*out_var) .Run(stream); +#endif } }; } // namespace operators diff --git a/paddle/fluid/operators/fill_diagonal_op.cc b/paddle/fluid/operators/fill_diagonal_op.cc index db55c3e99693ae..be3239d5048442 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cc +++ b/paddle/fluid/operators/fill_diagonal_op.cc @@ -108,8 +108,15 @@ class FillIDiagonalKernel : public framework::OpKernel { size = std::min(size, out_dims[1] * out_dims[1]); } - for (int64_t i = offset; i < size; i += strides) { - out_data[i] = temp_var; + for (int64_t i = 0; i < size; i += strides) { + // to check if the new position with offset is still in the same line; + // this modify should not affect across lines. 
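For reference, the row-boundary guard being introduced in this fill_diagonal hunk can be exercised on its own. The sketch below is a hypothetical standalone illustration, not the Paddle kernel: FillDiagonalWithOffset and its arguments are invented for the example. It fills the shifted diagonal of a row-major rows x cols buffer and, like the i % out_dims[1] + offset check added in this hunk, skips any position that the offset would push onto a neighbouring row.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone illustration of the same idea: fill the diagonal of a row-major
// rows x cols buffer, shifted by offset columns, without ever writing into a
// neighbouring row. (std::min) is parenthesised for the same reason the
// elementwise change above switches to (std::max): to dodge Windows' min/max
// macros.
void FillDiagonalWithOffset(std::vector<float>* data, int64_t rows,
                            int64_t cols, int64_t offset, float value) {
  const int64_t strides = cols + 1;  // step from one diagonal cell to the next
  const int64_t size = (std::min)(rows, cols) * strides;
  for (int64_t i = 0; i < size && i < rows * cols; i += strides) {
    const int64_t col = i % cols;  // column of the un-shifted diagonal cell
    if (col + offset >= 0 && col + offset < cols) {  // stays on the same row
      (*data)[i + offset] = value;
    }
  }
}

int main() {
  const int64_t rows = 3, cols = 4;
  std::vector<float> buf(rows * cols, 0.0f);
  FillDiagonalWithOffset(&buf, rows, cols, /*offset=*/1, 5.0f);
  for (int64_t r = 0; r < rows; ++r) {
    for (int64_t c = 0; c < cols; ++c) std::cout << buf[r * cols + c] << " ";
    std::cout << "\n";
  }
  return 0;
}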
+ // out_dims[1] is also work for tensor with dim>2, for which the dims must + // be the same number + if (i % out_dims[1] + offset >= 0 && + i % out_dims[1] + offset < out_dims[1]) { + out_data[i + offset] = temp_var; + } } } }; @@ -176,8 +183,11 @@ class FillIDiagonalGradKernel : public framework::OpKernel { wrapsize = size; } - for (int64_t i = offset; i < wrapsize; i += strides) { - data[i] = T(0); + for (int64_t i = 0; i < wrapsize; i += strides) { + if (i % dx_dims[1] + offset >= 0 && + i % dx_dims[1] + offset < dx_dims[1]) { + data[i + offset] = T(0); + } } } } diff --git a/paddle/fluid/operators/fill_diagonal_op.cu b/paddle/fluid/operators/fill_diagonal_op.cu index 5047059fb364d3..15eabd4216d0bb 100644 --- a/paddle/fluid/operators/fill_diagonal_op.cu +++ b/paddle/fluid/operators/fill_diagonal_op.cu @@ -22,11 +22,19 @@ using CUDADeviceContext = paddle::platform::CUDADeviceContext; template __global__ void fill_constant_kernel(const int64_t featuresize, T* in_data, - int64_t strides, int offset, T fillvar) { + int64_t strides, int offset, T fillvar, + int dims) { for (int64_t idx = blockIdx.x * featuresize + threadIdx.x; idx * strides + offset < (blockIdx.x + 1) * featuresize; idx += blockDim.x) { - in_data[idx * strides + offset] = fillvar; + // to check if the new position with offset is still in the same line; + // this modify should not affect across lines. + // out_dims[1] is also work for tensor with dim>2, for which the dims must + // be the same number + if ((idx * strides) % dims + offset < dims && + (idx * strides) % dims + offset >= 0) { + in_data[idx * strides + offset] = fillvar; + } } } @@ -62,7 +70,7 @@ class FillIDiagonalCUDAKernel : public framework::OpKernel { int64_t kBlockDim = std::min(int64_t(size / strides), kMaxBlockDim); fill_constant_kernel<<<1, kBlockDim, 0>>>(size, out_data, strides, - offset, temp_var); + offset, temp_var, out_dims[1]); } }; @@ -96,7 +104,7 @@ class FillIDiagonalGradCUDAKernel : public framework::OpKernel { int64_t kBlockDim = std::min(int64_t(size), kMaxBlockDim); fill_constant_kernel<<<1, kBlockDim, 0>>>(wrapsize, in_data, strides, - offset, T(0)); + offset, T(0), out_dims[1]); } }; diff --git a/paddle/fluid/operators/flatten_op.cc b/paddle/fluid/operators/flatten_op.cc index 0858a43838b964..14f2e9061b742f 100644 --- a/paddle/fluid/operators/flatten_op.cc +++ b/paddle/fluid/operators/flatten_op.cc @@ -77,9 +77,17 @@ class FlattenOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "X"), - ctx.device_context()); + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -101,6 +109,14 @@ class FlattenOpMaker : public framework::OpProtoAndCheckerMaker { "tensor is (1, (d_0 X d_1 ... d_n), where the shape of the" "input tensor is (d_0, d_1, ... d_n).") .SetDefault(1); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). 
Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); AddComment(R"DOC( Flatten Operator @@ -139,9 +155,17 @@ class FlattenGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); + auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -198,6 +222,21 @@ class Flatten2Op : public framework::OperatorWithKernel { ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims)); ctx->ShareLoD("X", "XShape"); } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input_data_type = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } }; class Flatten2OpMaker : public FlattenOpMaker { @@ -244,9 +283,17 @@ class Flatten2GradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( - ctx, framework::GradVarName("Out")), - ctx.device_context()); + auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif + return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/flatten_op_xpu.cc b/paddle/fluid/operators/flatten_op_xpu.cc new file mode 100644 index 00000000000000..53c0c688fd9e9d --- /dev/null +++ b/paddle/fluid/operators/flatten_op_xpu.cc @@ -0,0 +1,67 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef PADDLE_WITH_XPU + +#include "paddle/fluid/operators/flatten_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_XPU_KERNEL( + flatten, ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel, + ops::FlattenKernel); +REGISTER_OP_XPU_KERNEL( + flatten_grad, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel, + ops::FlattenGradKernel); +REGISTER_OP_XPU_KERNEL( + flatten2, ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel, + ops::Flatten2Kernel); +REGISTER_OP_XPU_KERNEL( + flatten2_grad, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel, + ops::Flatten2GradKernel); +REGISTER_OP_XPU_KERNEL( + flatten_contiguous_range, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel, + ops::FlattenContiguousRangeKernel); +REGISTER_OP_XPU_KERNEL( + flatten_contiguous_range_grad, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel, + ops::FlattenContiguousRangeGradKernel); +#endif diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 599be6912b760e..eec925b2c057b7 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -16,7 +16,10 @@ register_operators(EXCLUDES fusion_gru_op fusion_lstm_op fused_bn_add_activation_op - fused_transformer_op) + fused_attention_op + fused_transformer_op + fused_feedforward_op + resnet_unit_op) # fusion_gru_op does not have CUDA kernel op_library(fusion_gru_op) @@ -77,8 +80,18 @@ if (WITH_GPU OR WITH_ROCM) nv_test(test_fused_residual_dropout_bias SRCS fused_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) nv_test(test_fused_dropout_act_bias SRCS fused_dropout_act_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) nv_test(test_fused_layernorm_residual_dropout_bias SRCS fused_layernorm_residual_dropout_bias_test.cu DEPS tensor op_registry dropout_op layer_norm_op device_context generator memory) + + op_library(fused_feedforward_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_feedforward);\n") + # fused_attention_op + op_library(fused_attention_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_attention);\n") endif() + # resnet_unit needs cudnn 8.0 above if ((NOT WITH_ROCM) AND (NOT ${CUDNN_VERSION} VERSION_LESS 8000)) + op_library(resnet_unit_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(resnet_unit);\n") cc_test(test_cudnn_norm_conv SRCS cudnn_norm_conv_test.cc DEPS conv_op blas im2col vol2col depthwise_conv eigen_function tensor op_registry device_context generator memory) + cc_test(test_cudnn_bn_add_relu SRCS cudnn_bn_add_relu_test.cc DEPS batch_norm_op fused_bn_add_activation_op tensor op_registry device_context generator memory) endif() endif() diff --git a/paddle/fluid/operators/fused/attn_bias_add.cu.h b/paddle/fluid/operators/fused/attn_bias_add.cu.h index fa3eb19b29995a..18ae932c9325a9 100644 --- a/paddle/fluid/operators/fused/attn_bias_add.cu.h +++ b/paddle/fluid/operators/fused/attn_bias_add.cu.h @@ -72,14 +72,14 @@ __global__ void BroadcastKernelBinary( // load in0 if (use_broadcast[0]) { kernel_primitives::ReadDataBc( - arg0, in0, fix, configlists[0], numel, 1, 1); + arg0, in0, fix, configlists[0], 
numel); } else { kernel_primitives::ReadData(arg0, in0 + fix, num); } // load in1 if (use_broadcast[1]) { kernel_primitives::ReadDataBc( - arg1, in1, fix, configlists[1], numel, 1, 1); + arg1, in1, fix, configlists[1], numel); } else { kernel_primitives::ReadData(arg1, in1 + fix, num); } diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc new file mode 100644 index 00000000000000..c5995fe3554b4e --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -0,0 +1,784 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" +#include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/float16.h" + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace op = paddle::operators; +using Tensor = paddle::framework::Tensor; + +USE_OP(batch_norm); +USE_CUDA_ONLY_OP(fused_bn_add_activation); +USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); + +template +void InitRandomTensor(const std::vector &dims, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + std::default_random_engine random(0); + std::uniform_real_distribution dis(-1.0, 1.0); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = static_cast(dis(random)); + } +} + +template +void InitConstantTensor(const std::vector &dims, T value, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = value; + } +} + +template +void CheckOutput(std::string name, const framework::Tensor &cpu_res, + const framework::Tensor &cpu_base, float diff, + bool is_relative_atol = false) { + if (cpu_res.dims().size() == cpu_base.dims().size()) { + EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); + } else { + EXPECT_EQ(cpu_res.numel(), cpu_base.numel()); + } + + const T *cpu_res_ptr = cpu_res.data(); + const T *cpu_base_ptr = cpu_base.data(); + float max_diff = 0; + int index = 0; + for (int i = 0; i < cpu_res.numel(); ++i) { + float cur_diff; + if (is_relative_atol) { + cur_diff = static_cast( + std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / cpu_base_ptr[i])); + EXPECT_LT(static_cast(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / + cpu_base_ptr[i])), + diff); + } else { + cur_diff = static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])); + EXPECT_LT(static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])), + diff); + } + if (cur_diff 
> max_diff) { + max_diff = cur_diff; + index = i; + } + } + std::string error_type = is_relative_atol ? "relative" : "absolute"; + LOG(INFO) << "[" << name << "] The dims is [" << cpu_res.dims() + << "], maximum " << error_type << " error is " << max_diff << ": " + << cpu_res_ptr[index] << " vs " << cpu_base_ptr[index]; +} + +template +void ComputeSumAndSquareSum(const framework::Tensor &cpu_x, + framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + // x is in NHWC format. + auto dims = cpu_x.dims(); + int64_t c = dims[3]; + + const T *cpu_x_ptr = cpu_x.data(); + float *cpu_sum_ptr = + cpu_sum->mutable_data({1, 1, 1, c}, platform::CPUPlace()); + float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data( + {1, 1, 1, c}, platform::CPUPlace()); + + for (int j = 0; j < c; ++j) { + float tmp_sum = 0.0f; + float tmp_sum_of_squares = 0.0f; + for (int i = 0; i < cpu_x.numel() / c; ++i) { + float tmp_x = static_cast(cpu_x_ptr[i * c + j]); + tmp_sum += tmp_x; + tmp_sum_of_squares += tmp_x * tmp_x; + } + cpu_sum_ptr[j] = tmp_sum; + cpu_sum_square_ptr[j] = tmp_sum_of_squares; + } +} + +template +void ComputeInplaceAdd(const framework::Tensor &cpu_x, + framework::Tensor *cpu_y) { + EXPECT_EQ(cpu_x.dims(), cpu_y->dims()); + + const T *cpu_x_ptr = cpu_x.data(); + T *cpu_y_ptr = cpu_y->data(); + for (int64_t i = 0; i < cpu_x.numel(); ++i) { + cpu_y_ptr[i] += cpu_x_ptr[i]; + } +} + +template +void ComputeInplaceRelu(framework::Tensor *cpu_x) { + T *cpu_x_ptr = cpu_x->data(); + for (int64_t i = 0; i < cpu_x->numel(); ++i) { + cpu_x_ptr[i] = + cpu_x_ptr[i] > static_cast(0) ? cpu_x_ptr[i] : static_cast(0); + } +} + +void ComputeBatchNormForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + std::string data_layout = "NHWC"; + attrs.insert({"data_layout", data_layout}); + + auto op = framework::OpRegistry::CreateOp( + "batch_norm", {{"X", {"X"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"Mean", {"Mean"}}, + {"Variance", {"Variance"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), 
cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + // reserved_space will stay on GPU and used in grad op. + saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, const Tensor &cpu_z, + const Tensor &cpu_scale, + const Tensor &cpu_bias, Tensor *cpu_mean, + Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *z = scope.Var("Z")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *mean = scope.Var("Mean")->GetMutable(); + auto *var = scope.Var("Variance")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_z, place, z); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(*cpu_mean, place, mean); + TensorCopySync(*cpu_var, place, var); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + mean->Resize({channels}); + var->Resize({channels}); + + framework::AttributeMap attrs; + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation", + {{"X", {"X"}}, {"Z", {"Z"}}, {"Scale", {"Scale"}}, {"Bias", {"Bias"}}}, + {{"Y", {"Y"}}, + {"MeanOut", {"Mean"}}, + {"VarianceOut", {"Variance"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*y, platform::CPUPlace(), cpu_y); + TensorCopySync(*mean, platform::CPUPlace(), cpu_mean); + TensorCopySync(*var, platform::CPUPlace(), cpu_var); + TensorCopySync(*saved_mean, platform::CPUPlace(), cpu_saved_mean); + TensorCopySync(*saved_var, platform::CPUPlace(), cpu_saved_var); + // reserved_space will stay on GPU and used in grad op. 
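As a plain-CPU cross-check of what this test expects fused_bn_add_activation to produce in training mode, the helper below spells out the arithmetic for NHWC data: per-channel batch statistics, normalisation with scale and bias, an element-wise add of the shortcut tensor, then ReLU. It is a minimal sketch under the standard batch-norm definition; BnAddReluReference and its signature are invented for the example and are not part of the test file.

#include <cmath>
#include <cstdint>
#include <vector>

// Reference arithmetic for BN(x) + z followed by ReLU on NHWC data.
// nhw = N * H * W, c = number of channels.
void BnAddReluReference(const std::vector<float>& x,
                        const std::vector<float>& z,
                        const std::vector<float>& scale,
                        const std::vector<float>& bias, int64_t nhw, int64_t c,
                        float eps, std::vector<float>* y) {
  y->assign(nhw * c, 0.0f);
  for (int64_t j = 0; j < c; ++j) {
    // Batch statistics over the N*H*W positions of channel j.
    float mean = 0.0f, sq_mean = 0.0f;
    for (int64_t i = 0; i < nhw; ++i) {
      const float v = x[i * c + j];
      mean += v;
      sq_mean += v * v;
    }
    mean /= static_cast<float>(nhw);
    sq_mean /= static_cast<float>(nhw);
    const float inv_std = 1.0f / std::sqrt(sq_mean - mean * mean + eps);
    for (int64_t i = 0; i < nhw; ++i) {
      const float bn = scale[j] * (x[i * c + j] - mean) * inv_std + bias[j];
      const float added = bn + z[i * c + j];          // the "Add" part
      (*y)[i * c + j] = added > 0.0f ? added : 0.0f;  // the "ReLU" part
    }
  }
}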
+ saved_reserve_space->ShareDataWith(*reserve_space); +} + +void ComputeFusedBNAddReluBackward( + const platform::CUDADeviceContext &ctx, const Tensor &cpu_dy, + const Tensor &cpu_x, const Tensor &cpu_scale, const Tensor &cpu_bias, + const Tensor &cpu_saved_mean, const Tensor &cpu_saved_var, + const Tensor &cpu_y, const Tensor &saved_reserve_space, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Scope scope; + auto *x = scope.Var("X")->GetMutable(); + auto *y = scope.Var("Y")->GetMutable(); + auto *dy = scope.Var("Y@GRAD")->GetMutable(); + auto *scale = scope.Var("Scale")->GetMutable(); + auto *bias = scope.Var("Bias")->GetMutable(); + auto *saved_mean = scope.Var("SavedMean")->GetMutable(); + auto *saved_var = + scope.Var("SavedVariance")->GetMutable(); + auto *reserve_space = + scope.Var("ReserveSpace")->GetMutable(); + auto *dx = scope.Var("X@GRAD")->GetMutable(); + auto *dz = scope.Var("Z@GRAD")->GetMutable(); + auto *dscale = scope.Var("Scale@GRAD")->GetMutable(); + auto *dbias = scope.Var("Bias@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x, place, x); + TensorCopySync(cpu_y, place, y); + TensorCopySync(cpu_dy, place, dy); + TensorCopySync(cpu_scale, place, scale); + TensorCopySync(cpu_bias, place, bias); + TensorCopySync(cpu_saved_mean, place, saved_mean); + TensorCopySync(cpu_saved_var, place, saved_var); + reserve_space->ShareDataWith(saved_reserve_space); + + int64_t channels = x->dims()[3]; + scale->Resize({channels}); + bias->Resize({channels}); + saved_mean->Resize({channels}); + saved_var->Resize({channels}); + + framework::AttributeMap attrs; + float momentum = 0.9; + float epsilon = 1e-5; + std::string act_type = "relu"; + attrs.insert({"momentum", momentum}); + attrs.insert({"epsilon", epsilon}); + attrs.insert({"act_type", act_type}); + + auto op = framework::OpRegistry::CreateOp( + "fused_bn_add_activation_grad", {{"X", {"X"}}, + {"Y", {"Y"}}, + {"Y@GRAD", {"Y@GRAD"}}, + {"Scale", {"Scale"}}, + {"Bias", {"Bias"}}, + {"SavedMean", {"SavedMean"}}, + {"SavedVariance", {"SavedVariance"}}, + {"ReserveSpace", {"ReserveSpace"}}}, + {{"X@GRAD", {"X@GRAD"}}, + {"Z@GRAD", {"Z@GRAD"}}, + {"Scale@GRAD", {"Scale@GRAD"}}, + {"Bias@GRAD", {"Bias@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(*dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(*dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(*dbias, platform::CPUPlace(), cpu_dbias); +} + +template +class CudnnBNAddReluTester { + public: + CudnnBNAddReluTester(int batch_size, int height, int width, int channels, + std::string act_type, bool fuse_add, bool has_shortcut) { + batch_size_ = batch_size; + height_ = height; + width_ = width; + channels_ = channels; + ele_count_ = batch_size_ * height_ * width_; + act_type_ = act_type; + fuse_add_ = fuse_add; + has_shortcut_ = has_shortcut; + SetUp(); + } + + ~CudnnBNAddReluTester() {} + + void CheckForward(float diff, bool is_relative_atol = false) { + LOG(INFO) << "[CheckForward, diff=" << diff + << ", is_relative_atol=" << is_relative_atol + << "] act_type=" << act_type_ << ", fuse_add=" << fuse_add_ + << ", has_shortcut=" << has_shortcut_; + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + auto select = [&](Tensor *in) { return has_shortcut_ ? 
in : nullptr; }; + + framework::Tensor cpu_mean_base_x; + framework::Tensor cpu_var_base_x; + framework::Tensor cpu_mean_base_z; + framework::Tensor cpu_var_base_z; + if (!has_shortcut_ && fuse_add_ && (act_type_ == "relu")) { + BaselineForwardFusedBNAddRelu( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_); + } else { + BaselineForward( + *ctx, &cpu_mean_base_x, &cpu_var_base_x, &cpu_saved_mean_base_x_, + &cpu_saved_var_base_x_, &cpu_y_base_, &saved_reserve_space_x_, + select(&cpu_mean_base_z), select(&cpu_var_base_z), + select(&cpu_saved_mean_base_z_), select(&cpu_saved_var_base_z_), + select(&saved_reserve_space_z_)); + } + + framework::Tensor cpu_mean_x; + framework::Tensor cpu_var_x; + framework::Tensor cpu_y; + framework::Tensor cpu_mean_z; + framework::Tensor cpu_var_z; + FusedForward(*ctx, &cpu_mean_x, &cpu_var_x, &cpu_saved_mean_x_, + &cpu_saved_var_x_, &cpu_y, &cpu_bitmask_, select(&cpu_mean_z), + select(&cpu_var_z), select(&cpu_saved_mean_z_), + select(&cpu_saved_var_z_)); + + CheckOutput("Mean", cpu_mean_x, cpu_mean_base_x, diff, + is_relative_atol); + CheckOutput("Variance", cpu_var_x, cpu_var_base_x, diff, + is_relative_atol); + CheckOutput("SavedMean", cpu_saved_mean_x_, cpu_saved_mean_base_x_, + diff, is_relative_atol); + CheckOutput("SavedVariance", cpu_saved_var_x_, cpu_saved_var_base_x_, + diff, is_relative_atol); + if (has_shortcut_) { + CheckOutput("MeanZ", cpu_mean_z, cpu_mean_base_z, diff, + is_relative_atol); + CheckOutput("VarianceZ", cpu_var_z, cpu_var_base_z, diff, + is_relative_atol); + CheckOutput("SavedMeanZ", cpu_saved_mean_z_, + cpu_saved_mean_base_z_, diff, is_relative_atol); + CheckOutput("SavedVarianceZ", cpu_saved_var_z_, + cpu_saved_var_base_z_, diff, is_relative_atol); + } + CheckOutput("Y", cpu_y, cpu_y_base_, diff, is_relative_atol); + } + + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_dx_base; + framework::Tensor cpu_dz_base; + framework::Tensor cpu_dscale_base; + framework::Tensor cpu_dbias_base; + BaselineBackwardFusedBNAddRelu(*ctx, &cpu_dx_base, &cpu_dz_base, + &cpu_dscale_base, &cpu_dbias_base); + + framework::Tensor cpu_dx; + framework::Tensor cpu_dz; + framework::Tensor cpu_dscale; + framework::Tensor cpu_dbias; + FusedBackward(*ctx, &cpu_dx, &cpu_dz, &cpu_dscale, &cpu_dbias); + + CheckOutput("DX", cpu_dx, cpu_dx_base, diff, is_relative_atol); + CheckOutput("DZ", cpu_dz, cpu_dz_base, diff, is_relative_atol); + CheckOutput("DScale", cpu_dscale, cpu_dscale_base, diff, + is_relative_atol); + CheckOutput("DBias", cpu_dbias, cpu_dbias_base, diff, + is_relative_atol); + } + + private: + void SetUp() { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_x_); + InitRandomTensor({channels_}, &cpu_bn_scale_x_); + InitRandomTensor({channels_}, &cpu_bn_bias_x_); + + if (has_shortcut_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + InitRandomTensor({channels_}, &cpu_bn_scale_z_); + InitRandomTensor({channels_}, &cpu_bn_bias_z_); + } else { + if (fuse_add_) { + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_z_); + } + } + + InitRandomTensor({batch_size_, height_, width_, channels_}, &cpu_dy_); + } + + void InitMeanVar(Tensor *cpu_mean, Tensor *cpu_var, Tensor *cpu_saved_mean, + Tensor *cpu_saved_var) { + InitConstantTensor({channels_}, 
static_cast(0.0f), cpu_mean); + InitConstantTensor({channels_}, static_cast(1.0f), cpu_var); + InitConstantTensor({channels_}, static_cast(0.0f), + cpu_saved_mean); + InitConstantTensor({channels_}, static_cast(0.0f), + cpu_saved_var); + } + + void BaselineForward(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean_x, Tensor *cpu_var_x, + Tensor *cpu_saved_mean_x, Tensor *cpu_saved_var_x, + Tensor *cpu_y, Tensor *saved_reserve_space_x, + Tensor *cpu_mean_z = nullptr, + Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr, + Tensor *saved_reserve_space_z = nullptr) { + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + ComputeBatchNormForward(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_mean_x, cpu_var_x, cpu_saved_mean_x, + cpu_saved_var_x, cpu_y, saved_reserve_space_x); + if (has_shortcut_) { + framework::Tensor cpu_z_out; + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + ComputeBatchNormForward( + ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, cpu_mean_z, cpu_var_z, + cpu_saved_mean_z, cpu_saved_var_z, &cpu_z_out, saved_reserve_space_z); + ComputeInplaceAdd(cpu_z_out, cpu_y); + } else { + if (fuse_add_) { + ComputeInplaceAdd(cpu_z_, cpu_y); + } + } + if (act_type_ == "relu") { + ComputeInplaceRelu(cpu_y); + } + } + + void BaselineForwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_mean, Tensor *cpu_var, + Tensor *cpu_saved_mean, + Tensor *cpu_saved_var, Tensor *cpu_y, + Tensor *saved_reserve_space) { + InitMeanVar(cpu_mean, cpu_var, cpu_saved_mean, cpu_saved_var); + ComputeFusedBNAddReluForward( + ctx, cpu_x_, cpu_z_, cpu_bn_scale_x_, cpu_bn_bias_x_, cpu_mean, cpu_var, + cpu_saved_mean, cpu_saved_var, cpu_y, saved_reserve_space); + } + + void BaselineBackwardFusedBNAddRelu(const platform::CUDADeviceContext &ctx, + Tensor *cpu_dx, Tensor *cpu_dz, + Tensor *cpu_dscale, Tensor *cpu_dbias) { + ComputeFusedBNAddReluBackward( + ctx, cpu_dy_, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + cpu_saved_mean_base_x_, cpu_saved_var_base_x_, cpu_y_base_, + saved_reserve_space_x_, cpu_dx, cpu_dz, cpu_dscale, cpu_dbias); + } + + void ComputeFusedBNStatsFinalize(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_x, + const Tensor &cpu_bn_scale, + const Tensor &cpu_bn_bias, Tensor *sum, + Tensor *sum_of_square, Tensor *bn_scale, + Tensor *bn_bias, Tensor *mean, Tensor *var, + Tensor *saved_mean, Tensor *saved_var, + Tensor *equiv_scale, Tensor *equiv_bias) { + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + ComputeSumAndSquareSum(cpu_x, &cpu_sum, &cpu_sum_of_square); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_sum, place, sum); + TensorCopySync(cpu_sum_of_square, place, sum_of_square); + TensorCopySync(cpu_bn_scale, place, bn_scale); + TensorCopySync(cpu_bn_bias, place, bn_bias); + + bn_scale->Resize({1, 1, 1, channels_}); + bn_bias->Resize({1, 1, 1, channels_}); + + // input + mean->Resize({1, 1, 1, channels_}); + var->Resize({1, 1, 1, channels_}); + + // output + equiv_scale->Resize({1, 1, 1, channels_}); + equiv_bias->Resize({1, 1, 1, channels_}); + saved_mean->Resize({1, 1, 1, channels_}); + saved_var->Resize({1, 1, 1, channels_}); + + auto param_shape = framework::vectorize(bn_scale->dims()); + op::CudnnBNStatsFinalize bn_op(ctx, param_shape); + bn_op.Forward(ctx, *sum, *sum_of_square, *bn_scale, *bn_bias, saved_mean, + saved_var, mean, var, equiv_scale, equiv_bias, eps_, + momentum_, ele_count_, true); + } + + // Get forward results 
of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedForward(const platform::CUDADeviceContext &ctx, Tensor *cpu_mean_x, + Tensor *cpu_var_x, Tensor *cpu_saved_mean_x, + Tensor *cpu_saved_var_x, Tensor *cpu_y, Tensor *cpu_bitmask, + Tensor *cpu_mean_z = nullptr, Tensor *cpu_var_z = nullptr, + Tensor *cpu_saved_mean_z = nullptr, + Tensor *cpu_saved_var_z = nullptr) { + framework::Tensor x; + framework::Tensor sum_x; + framework::Tensor sum_of_square_x; + framework::Tensor bn_scale_x; + framework::Tensor bn_bias_x; + + framework::Tensor z; + framework::Tensor sum_z; + framework::Tensor sum_of_square_z; + framework::Tensor bn_scale_z; + framework::Tensor bn_bias_z; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_x_, place, &x); + if (fuse_add_ || has_shortcut_) { + TensorCopySync(cpu_z_, place, &z); + } + + framework::Tensor mean_x; + framework::Tensor var_x; + framework::Tensor saved_mean_x; + framework::Tensor saved_var_x; + framework::Tensor equiv_scale_x; + framework::Tensor equiv_bias_x; + + framework::Tensor mean_z; + framework::Tensor var_z; + framework::Tensor saved_mean_z; + framework::Tensor saved_var_z; + framework::Tensor equiv_scale_z; + framework::Tensor equiv_bias_z; + + framework::Tensor y; + framework::Tensor bitmask; + + InitMeanVar(cpu_mean_x, cpu_var_x, cpu_saved_mean_x, cpu_saved_var_x); + TensorCopySync(*cpu_mean_x, place, &mean_x); + TensorCopySync(*cpu_var_x, place, &var_x); + if (has_shortcut_) { + InitMeanVar(cpu_mean_z, cpu_var_z, cpu_saved_mean_z, cpu_saved_var_z); + TensorCopySync(*cpu_mean_z, place, &mean_z); + TensorCopySync(*cpu_var_z, place, &var_z); + } + + // 1. BN Stats Finalize + ComputeFusedBNStatsFinalize(ctx, cpu_x_, cpu_bn_scale_x_, cpu_bn_bias_x_, + &sum_x, &sum_of_square_x, &bn_scale_x, + &bn_bias_x, &mean_x, &var_x, &saved_mean_x, + &saved_var_x, &equiv_scale_x, &equiv_bias_x); + if (has_shortcut_) { + ComputeFusedBNStatsFinalize(ctx, cpu_z_, cpu_bn_scale_z_, cpu_bn_bias_z_, + &sum_z, &sum_of_square_z, &bn_scale_z, + &bn_bias_z, &mean_z, &var_z, &saved_mean_z, + &saved_var_z, &equiv_scale_z, &equiv_bias_z); + } + + y.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); + + int c = channels_; + int64_t nhw = ele_count_; + int32_t c_int32_elems = ((c + 63) & ~63) / 32; + int32_t nhw_int32_elems = (nhw + 31) & ~31; + bitmask.Resize(framework::make_ddim({nhw_int32_elems, c_int32_elems, 1})); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale_x.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + // 2. 
Scale Bias + Relu + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type_, fuse_add_, + has_shortcut_, data_shape, param_shape, + bitmask_shape); + sbar_op.Forward(ctx, x, equiv_scale_x, equiv_bias_x, &z, &equiv_scale_z, + &equiv_bias_z, &y, &bitmask); + + TensorCopySync(mean_x, platform::CPUPlace(), cpu_mean_x); + TensorCopySync(var_x, platform::CPUPlace(), cpu_var_x); + TensorCopySync(saved_mean_x, platform::CPUPlace(), cpu_saved_mean_x); + TensorCopySync(saved_var_x, platform::CPUPlace(), cpu_saved_var_x); + if (has_shortcut_) { + TensorCopySync(mean_z, platform::CPUPlace(), cpu_mean_z); + TensorCopySync(var_z, platform::CPUPlace(), cpu_var_z); + TensorCopySync(saved_mean_z, platform::CPUPlace(), cpu_saved_mean_z); + TensorCopySync(saved_var_z, platform::CPUPlace(), cpu_saved_var_z); + } + TensorCopySync(y, platform::CPUPlace(), cpu_y); + TensorCopySync(bitmask, platform::CPUPlace(), cpu_bitmask); + } + + // Get backward results of CudnnBNStatsFinalize + CudnnScaleBiasAddRelu + void FusedBackward(const platform::CUDADeviceContext &ctx, Tensor *cpu_dx, + Tensor *cpu_dz, Tensor *cpu_dscale, Tensor *cpu_dbias) { + framework::Tensor dy; + framework::Tensor x; + framework::Tensor bn_scale; + framework::Tensor bn_bias; + framework::Tensor saved_mean; + framework::Tensor saved_var; + framework::Tensor bitmask; + framework::Tensor dx; + framework::Tensor dz; + framework::Tensor dscale; + framework::Tensor dbias; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_dy_, place, &dy); + TensorCopySync(cpu_x_, place, &x); + TensorCopySync(cpu_bn_scale_x_, place, &bn_scale); + TensorCopySync(cpu_bn_bias_x_, place, &bn_bias); + TensorCopySync(cpu_saved_mean_x_, place, &saved_mean); + TensorCopySync(cpu_saved_var_x_, place, &saved_var); + TensorCopySync(cpu_bitmask_, place, &bitmask); + + bn_scale.Resize({1, 1, 1, channels_}); + bn_bias.Resize({1, 1, 1, channels_}); + saved_mean.Resize({1, 1, 1, channels_}); + saved_var.Resize({1, 1, 1, channels_}); + + dx.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); + dz.Resize(framework::make_ddim({batch_size_, height_, width_, channels_})); + dscale.Resize(framework::make_ddim({1, 1, 1, channels_})); + dbias.Resize(framework::make_ddim({1, 1, 1, channels_})); + + auto data_shape = framework::vectorize(x.dims()); + auto param_shape = framework::vectorize(bn_scale.dims()); + auto bitmask_shape = framework::vectorize(bitmask.dims()); + + std::string act_type = "relu"; + op::CudnnScaleBiasAddRelu sbar_op(ctx, act_type, true, false, data_shape, + param_shape, bitmask_shape); + sbar_op.Backward(ctx, dy, x, bn_scale, bn_bias, saved_mean, saved_var, + &bitmask, &dx, &dz, &dscale, &dbias, eps_); + + TensorCopySync(dx, platform::CPUPlace(), cpu_dx); + TensorCopySync(dz, platform::CPUPlace(), cpu_dz); + TensorCopySync(dscale, platform::CPUPlace(), cpu_dscale); + TensorCopySync(dbias, platform::CPUPlace(), cpu_dbias); + } + + private: + int batch_size_; + int height_; + int width_; + int channels_; + int ele_count_; + + std::string act_type_; + bool fuse_add_; + bool has_shortcut_; + + // Forward input + framework::Tensor cpu_x_; + framework::Tensor cpu_bn_scale_x_; + framework::Tensor cpu_bn_bias_x_; + framework::Tensor cpu_z_; + framework::Tensor cpu_bn_scale_z_; + framework::Tensor cpu_bn_bias_z_; + + // Backward input + framework::Tensor cpu_dy_; + framework::Tensor cpu_bitmask_; + framework::Tensor cpu_saved_mean_x_; + framework::Tensor cpu_saved_var_x_; + framework::Tensor cpu_saved_mean_z_; + framework::Tensor cpu_saved_var_z_; + 
framework::Tensor cpu_saved_mean_base_x_; + framework::Tensor cpu_saved_var_base_x_; + framework::Tensor saved_reserve_space_x_; + framework::Tensor cpu_saved_mean_base_z_; + framework::Tensor cpu_saved_var_base_z_; + framework::Tensor saved_reserve_space_z_; + framework::Tensor cpu_y_base_; + + double eps_ = 1e-5; + float momentum_ = 0.9; +}; + +TEST(CudnnBNAddReluFp16, BNAdd) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = ""; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + } +} + +TEST(CudnnBNAddReluFp16, BNAddRelu) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = "relu"; + bool has_shortcut = false; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + for (auto fuse_add : {false, true}) { + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(2e-3); + if (fuse_add) { + test.CheckBackward(2e-4); + } + } +} + +TEST(CudnnBNAddReluFp16, HasShortcut) { + int batch_size = 4; + int height = 8; + int width = 8; + int channels = 64; + std::string act_type = ""; + bool fuse_add = false; + bool has_shortcut = true; + FLAGS_cudnn_batchnorm_spatial_persistent = true; + CudnnBNAddReluTester test( + batch_size, height, width, channels, act_type, fuse_add, has_shortcut); + test.CheckForward(5e-3); +} diff --git a/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h new file mode 100644 index 00000000000000..dc703f9a822b5b --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h @@ -0,0 +1,193 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +namespace dynload = platform::dynload; +template +using BatchNormParamType = + typename platform::CudnnDataType::BatchNormParamType; + +#if CUDNN_VERSION >= 8000 + +template +struct BNStatsFinalizeArgs { + BNStatsFinalizeArgs() { + dtype = platform::CudnnDataType::type; + param_dtype = platform::CudnnDataType>::type; + format = CUDNN_TENSOR_NHWC; + } + + void Set(const std::vector ¶m_shape) { + PADDLE_ENFORCE_EQ( + param_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of param_shape is expected to 4. 
But recieved " + "param_shape's size is %d, param_shape is [%s].", + param_shape.size(), framework::make_ddim(param_shape))); + + in_desc.set(param_shape, format, param_dtype); + out_desc.set(param_shape, format, dtype); + } + + cudnnDataType_t dtype; + cudnnDataType_t param_dtype; + cudnnTensorFormat_t format; + + platform::TensorDescriptor in_desc; + platform::TensorDescriptor out_desc; +}; + +template +class CudnnBNStatsFinalize { + public: + CudnnBNStatsFinalize(const platform::CUDADeviceContext &ctx, + const std::vector ¶m_shape) + : train_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_TRAINING), + inference_op_(CUDNN_FUSED_BN_FINALIZE_STATISTICS_INFERENCE) { + args_.Set(param_shape); + } + ~CudnnBNStatsFinalize() {} + + void Forward(const platform::CUDADeviceContext &ctx, const Tensor &sum, + const Tensor &sum_of_squares, const Tensor &scale, + const Tensor &bias, Tensor *saved_mean, Tensor *saved_invstd, + Tensor *running_mean, Tensor *running_var, Tensor *equiv_scale, + Tensor *equiv_bias, double eps, float momentum, + int64_t ele_count, bool is_train) { + auto place = ctx.GetPlace(); + if (is_train) { + TrainInit(ctx); + } else { + InferenceInit(ctx); + } + auto &op = is_train ? train_op_ : inference_op_; + + // Set variant_param for both inference_op_ and train_op_ + float *sum_ptr = const_cast(sum.data()); + float *sum_of_squares_ptr = + const_cast(sum_of_squares.data()); + float *scale_ptr = const_cast(scale.data()); + float *bias_ptr = const_cast(bias.data()); + float *saved_mean_ptr = saved_mean->mutable_data(place); + float *saved_invstd_ptr = saved_invstd->mutable_data(place); + float *running_mean_ptr = running_mean->mutable_data(place); + float *running_var_ptr = running_var->mutable_data(place); + T *equiv_scale_ptr = equiv_scale->mutable_data(place); + T *equiv_bias_ptr = equiv_bias->mutable_data(place); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_MEAN, running_mean_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_RUNNING_VAR, running_var_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, equiv_scale_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, equiv_bias_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, &eps); + + // Set extra variant_param only for train_op_: + if (is_train) { + op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr); + op.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, saved_invstd_ptr); + double avg_factor = 1.0 - momentum; + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_INT64_T_BN_ACCUMULATION_COUNT, + &ele_count); + op.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EXP_AVG_FACTOR, + &avg_factor); + } + // fused op execute + auto handle = ctx.cudnn_handle(); + op.Execute(handle); + } + + private: + void TrainInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param for train op + train_op_.SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER, + CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + // Set input and output desc for train 
op + train_op_.SetOpConstParamDesc( + {CUDNN_PARAM_YSTATS_DESC, CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC}, + args_.in_desc.desc()); + train_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.out_desc.desc()); + + // Get workspace + auto handle = ctx.cudnn_handle(); + train_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + // Check workspace size, also creates plan. + size_t workspace_size_bytes = train_op_.GetWorkspaceSizeInBytes(handle); + PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U, + platform::errors::InvalidArgument( + "Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); + train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + static_cast(nullptr)); + train_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + &workspace_size_bytes); + } + + void InferenceInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param for inference op + inference_op_.SetOpConstParamAttr( + {CUDNN_PARAM_BN_SCALE_PLACEHOLDER, CUDNN_PARAM_BN_BIAS_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_RUNNING_VAR_PLACEHOLDER, + CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + // Set input and output desc for inference op + inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC, + args_.in_desc.desc()); + inference_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.out_desc.desc()); + + // Get workspace + auto handle = ctx.cudnn_handle(); + inference_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + // Check workspace size, also creates plan. + size_t workspace_size_bytes = inference_op_.GetWorkspaceSizeInBytes(handle); + PADDLE_ENFORCE_EQ(workspace_size_bytes, 0U, + platform::errors::InvalidArgument( + "Unexpected non-zero workspace size for " + "CudnnBNStatsFinalize.")); + inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + static_cast(nullptr)); + inference_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + &workspace_size_bytes); + } + + BNStatsFinalizeArgs args_; + CudnnFusionOp train_op_; + CudnnFusionOp inference_op_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_fusion_helper.h b/paddle/fluid/operators/fused/cudnn_fusion_helper.h index 4434681e60b3b1..1de64cf5ad947d 100644 --- a/paddle/fluid/operators/fused/cudnn_fusion_helper.h +++ b/paddle/fluid/operators/fused/cudnn_fusion_helper.h @@ -14,10 +14,8 @@ limitations under the License. */ #pragma once -#include #include -#include "paddle/fluid/platform/cudnn_desc.h" -#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/framework/operator_kernel_configs.h" #include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/enforce.h" @@ -40,8 +38,7 @@ class CudnnFusionOp { &op_variant_params_, op_id)); } - ~CudnnFusionOp() { - // New 'fused op' descriptor destruction + ~CudnnFusionOp() PADDLE_MAY_THROW { PADDLE_ENFORCE_CUDA_SUCCESS( dynload::cudnnDestroyFusedOpsVariantParamPack(op_variant_params_)); PADDLE_ENFORCE_CUDA_SUCCESS( @@ -121,41 +118,49 @@ class CudnnFusionOp { // Get the workspace, which is required before Execute(). 
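The GetWorkspaceSizeInBytes change immediately below turns plan creation into a one-time, memoised step. The class here is a hedged, framework-free sketch of that pattern, with MakePlan standing in for the cudnnMakeFusedOpsPlan call; it is illustrative only and assumes the workspace size never changes once the constant parameters of the fused op are fixed.

#include <cstddef>
#include <functional>
#include <utility>

// Memoisation pattern mirrored by the CudnnFusionOp change below: the
// expensive plan construction runs once, and every later query returns the
// cached workspace size.
class LazyPlan {
 public:
  explicit LazyPlan(std::function<size_t()> make_plan)
      : make_plan_(std::move(make_plan)) {}

  size_t GetWorkspaceSizeInBytes() {
    if (!plan_created_) {
      workspace_bytes_ = make_plan_();  // stand-in for cudnnMakeFusedOpsPlan
      plan_created_ = true;
    }
    return workspace_bytes_;  // cheap on all subsequent calls
  }

 private:
  std::function<size_t()> make_plan_;
  bool plan_created_ = false;
  size_t workspace_bytes_ = 0;
};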
size_t GetWorkspaceSizeInBytes(cudnnHandle_t cudnn_handle) { - size_t workspace_bytes = 0U; - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( - cudnn_handle, op_, op_const_params_, &workspace_bytes)); - plan_created_ = true; - return workspace_bytes; + if (!plan_created_) { + workspace_bytes_ = 0U; + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnMakeFusedOpsPlan( + cudnn_handle, op_, op_const_params_, &workspace_bytes_)); + plan_created_ = true; + } + return workspace_bytes_; } private: bool plan_created_; + size_t workspace_bytes_; cudnnFusedOpsPlan_t op_; cudnnFusedOpsConstParamPack_t op_const_params_; cudnnFusedOpsVariantParamPack_t op_variant_params_; }; -static inline std::vector GetStrides(const std::vector &shape) { - if (shape.size() < 1) { - return {}; +class CudnnFusionOpCache { + public: + static CudnnFusionOpCache &Instance() { + static CudnnFusionOpCache instance; + return instance; } - int dim = static_cast(shape.size()); - std::vector pro_shape(shape); - std::vector strides(dim); - int temp = pro_shape[1]; - pro_shape.erase(pro_shape.begin() + 1); - pro_shape.push_back(temp); - strides.back() = 1; - for (int i = dim - 2; i >= 0; --i) { - strides[i] = strides[i + 1] * pro_shape[i + 1]; + + framework::AlgorithmsCache *GetForward() { + return &forward_cache_; + } + framework::AlgorithmsCache *GetBackward() { + return &backward_cache_; } - strides.pop_back(); - strides.insert(strides.begin() + 1, 1); - return strides; -} -static inline int64_t AlignUp(int64_t a, int64_t b) { return (a + b - 1) / b; } + private: + CudnnFusionOpCache() {} + ~CudnnFusionOpCache() { + // Need to delete the memory of cache. + } + CudnnFusionOpCache(const CudnnFusionOpCache &) {} + + private: + framework::AlgorithmsCache forward_cache_; + framework::AlgorithmsCache backward_cache_; +}; #endif // CUDNN_VERSION >= 8000 } // namespace operators diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index 1ead78b8b64e18..9b9328a5ca6208 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -15,125 +15,374 @@ limitations under the License. */ #pragma once #include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; namespace dynload = platform::dynload; +template +using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; + #if CUDNN_VERSION >= 8000 + +static size_t RoundUp(int64_t a, int64_t b) { return (a + b - 1) / b * b; } + template -class CudnnNormConvolutionOp { +struct NormConvolutionArgs { + NormConvolutionArgs() { + dtype = platform::CudnnDataType::type; + format = CUDNN_TENSOR_NHWC; + compute_type = platform::CudnnDataType::type; + } + + void Set(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, int padding, int stride, + int dilation, int group) { + PADDLE_ENFORCE_EQ( + input_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of input_shape is expected to 4. But recieved " + "input_shape's size is %d, input_shape is [%s].", + input_shape.size(), framework::make_ddim(input_shape))); + PADDLE_ENFORCE_EQ( + filter_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of filter_shape is expected to 4. 
But recieved " + "filter_shape's size is %d, filter_shape is [%s].", + filter_shape.size(), framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ(filter_shape[1] == filter_shape[2] && + (filter_shape[1] == 1 || filter_shape[1] == 3), + true, + platform::errors::InvalidArgument( + "The filter_shape is expected to store as nhwc, and " + "h = w = 1 or 3. But recieved filter_shape is [%s].", + framework::make_ddim(filter_shape))); + PADDLE_ENFORCE_EQ((filter_shape[0] % 32 == 0 && filter_shape[3] % 8 == 0), + true, + platform::errors::InvalidArgument( + "The input channel is expected to be multiple of 8, " + "and the output channel is expected to be multiple " + "of 32. But recieved input channel is %d, output " + "channel is %d.", + filter_shape[3], filter_shape[0])); + PADDLE_ENFORCE_EQ( + output_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of output_shape is expected to 4. But recieved " + "filter_shape's size is %d, filter_shape is [%s].", + output_shape.size(), framework::make_ddim(output_shape))); + is_support = IsSupport(ctx, filter_shape, stride, dilation, group); + PADDLE_ENFORCE_EQ( + is_support, true, + platform::errors::InvalidArgument( + "Current test is only supported in the platforms with " + "compatiblity greater than or equal to 70 and the kernel size " + "must be equal to 1 or 3. When the kernel size is 1, " + "the stride must be 1 if the compatiblity is equal to 70. " + "Besides, the dilation and group must be equal to 1. But recieved " + "compatiblity is %d, kernel size is %d, stride is %d, " + "dilation is %d, group is %d", + ctx.GetComputeCapability(), filter_shape[1], stride, dilation, + group)); + + for (size_t i = 0; i < input_shape.size(); ++i) { + in_dims.push_back(input_shape[i]); + } + for (size_t i = 0; i < filter_shape.size(); ++i) { + filter_dims.push_back(filter_shape[i]); + } + paddings = {padding, padding}; + strides = {stride, stride}; + dilations = {dilation, dilation}; + + in_desc.set(input_shape, format, dtype); + filter_desc.set(filter_shape, format, dtype, group); + out_desc.set(output_shape, format, dtype); + + int output_channel = filter_shape[0]; + std::vector stats_shape = {1, 1, 1, output_channel}; + out_stats_desc.set(stats_shape, format, compute_type); + + conv_desc.set(dtype, paddings, strides, dilations, false, group); + } + + bool IsSupport(const platform::CUDADeviceContext &ctx, + const std::vector &filter_shape, int stride, int dilation, + int group) { + int kernel_size = filter_shape[1]; + if (dilation != 1 || group != 1) { + return false; + } + if (ctx.GetComputeCapability() == 70) { + if ((kernel_size == 3) || ((kernel_size == 1) && (stride == 1))) { + return true; + } + } else if (ctx.GetComputeCapability() > 70) { + if ((kernel_size == 3) || (kernel_size == 1)) { + return true; + } + } + return false; + } + + cudnnDataType_t dtype; + cudnnTensorFormat_t format; + cudnnDataType_t compute_type; + + std::vector in_dims; + std::vector filter_dims; + std::vector strides; + std::vector paddings; + std::vector dilations; + + platform::TensorDescriptor in_desc; + platform::FilterDescriptor filter_desc; + platform::TensorDescriptor out_desc; + platform::TensorDescriptor out_stats_desc; + platform::ConvolutionDescriptor conv_desc; + + bool is_support; +}; + +template +class CudnnNormConvolution { public: - CudnnNormConvolutionOp() - : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS) {} - ~CudnnNormConvolutionOp() {} - - void Init(const platform::CUDADeviceContext &ctx, - const std::vector &input_shape, - const 
std::vector &filter_shape, - const std::vector &output_shape, const int &pad, - const int &stride, const int &dilate, const int &group) { - cudnn_fwd_compute_type_ = platform::CudnnDataType::type; - dtype_ = platform::CudnnDataType::type; - format_ = CUDNN_TENSOR_NHWC; - - InitDescriptors(ctx, input_shape, filter_shape, output_shape, pad, stride, - dilate, group); - GetWorkspaceSize(ctx); + CudnnNormConvolution(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, const int &padding, + const int &stride, const int &dilation, + const int &group) { + args_.Set(ctx, input_shape, filter_shape, output_shape, padding, stride, + dilation, group); } + ~CudnnNormConvolution() {} + + void Forward(const platform::CUDADeviceContext &ctx, const Tensor &input, + const Tensor &filter, Tensor *output, Tensor *sum, + Tensor *sum_of_squares) { + auto cudnn_handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); + + CudnnFusionOp *fwd_op = GetForwardOp(ctx); + size_t workspace_size = RoundUp( + static_cast(fwd_op->GetWorkspaceSizeInBytes(cudnn_handle)), + 512); - void Forward(const platform::CUDADeviceContext &ctx, T *input_ptr, - T *filter_ptr, T *output_ptr, float *sum_ptr, - float *sum_of_squares_ptr) { - auto handle = ctx.cudnn_handle(); - auto workspace_handle = ctx.cudnn_workspace_handle(); // Set variant_param // input ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); - fwd_op_.SetOpVariantParamAttrPtr( - CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + T *input_ptr = const_cast(input.data()); + T *filter_ptr = const_cast(filter.data()); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WDATA, filter_ptr); + fwd_op->SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); + // output ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); - workspace_handle.RunFunc( + T *output_ptr = output->mutable_data(place); + float *sum_ptr = sum->mutable_data(place); + float *sum_of_squares_ptr = sum_of_squares->mutable_data(place); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, output_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSUM, sum_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_YSQSUM, sum_of_squares_ptr); + + ctx.cudnn_workspace_handle().RunFunc( [&](void *workspace_ptr) { // workspace ptr - fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + fwd_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); // fused op execute - fwd_op_.Execute(handle); + fwd_op->Execute(cudnn_handle); }, - fwd_workspace_byte_); + workspace_size); } - // TBD - void Backward(const platform::CUDADeviceContext &ctx) {} + private: + CudnnFusionOp *GetForwardOp(const platform::CUDADeviceContext &ctx) { + framework::AlgorithmsCache &cache = + *(CudnnFusionOpCache::Instance().GetForward()); + + CudnnFusionOp *fwd_op = cache.GetAlgorithm( + args_.in_dims, args_.filter_dims, args_.strides, args_.paddings, + args_.dilations, 0, static_cast(args_.dtype), [&]() { + CudnnFusionOp *fwd_op = + new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_CONV_BNSTATS); + + // Set constant_param + fwd_op->SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, 
CUDNN_PARAM_WDATA_PLACEHOLDER, + CUDNN_PARAM_YDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + fwd_op->SetOpConstParamAttr( + {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + + // conv desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, + args_.conv_desc.desc()); + // input desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + // filter desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_WDESC, + args_.filter_desc.desc()); + // output desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc()); + // output_stats desc + fwd_op->SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, + args_.out_stats_desc.desc()); + // batch_norm mode + fwd_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + + // Make cudnn fused ops plan + fwd_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle()); + return fwd_op; + }); + return fwd_op; + } private: - void InitDescriptors(const platform::CUDADeviceContext &ctx, - const std::vector &input_shape, - const std::vector &filter_shape, - const std::vector &output_shape, const int &pad, - const int &stride, const int &dilate, const int &group) { - // Set constant_param - fwd_op_.SetOpConstParamAttr( - {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_WDATA_PLACEHOLDER, - CUDNN_PARAM_YDATA_PLACEHOLDER}, - CUDNN_PTR_16B_ALIGNED); - fwd_op_.SetOpConstParamAttr( - {CUDNN_PARAM_YSUM_PLACEHOLDER, CUDNN_PARAM_YSQSUM_PLACEHOLDER}, - CUDNN_PTR_16B_ALIGNED); - - std::vector pad_vec = {pad, pad}; - std::vector stride_vec = {stride, stride}; - std::vector dilate_vec = {dilate, dilate}; - int output_channel = filter_shape[0]; - std::vector stats_shape = {1, 1, 1, output_channel}; + NormConvolutionArgs args_; +}; + +template +class CudnnNormConvolutionGrad { + public: + CudnnNormConvolutionGrad(const platform::CUDADeviceContext &ctx, + const std::vector &input_shape, + const std::vector &filter_shape, + const std::vector &output_shape, + const int &padding, const int &stride, + const int &dilation, const int &group) { + args_.Set(ctx, input_shape, filter_shape, output_shape, padding, stride, + dilation, group); + dgrad_algo_ = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + } + ~CudnnNormConvolutionGrad() {} - // set conv desc - conv_desc_.set(dtype_, pad_vec, stride_vec, dilate_vec, false, group); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, conv_desc_.desc()); + void Backward(const platform::CUDADeviceContext &ctx, const Tensor &input, + const Tensor &filter, const Tensor &output_grad, + Tensor *input_grad, Tensor *filter_grad, + bool use_addto = false) { + auto place = ctx.GetPlace(); + T *input_ptr = const_cast(input.data()); + T *filter_ptr = const_cast(filter.data()); + T *output_grad_ptr = const_cast(output_grad.data()); - // set input desc - in_desc_.set(input_shape, format_, dtype_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, in_desc_.desc()); + if (filter_grad) { + T *filter_grad_ptr = filter_grad->mutable_data(place); + BackwardFilter(ctx, output_grad_ptr, input_ptr, filter_grad_ptr); + } + if (input_grad) { + T *input_grad_ptr = input_grad->mutable_data(place); + BackwardData(ctx, output_grad_ptr, filter_ptr, input_grad_ptr, use_addto); + } + } - // set filter desc - filter_desc_.set(filter_shape, format_, dtype_, group); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_WDESC, filter_desc_.desc()); + private: + void BackwardFilter(const platform::CUDADeviceContext &ctx, + T *output_grad_ptr, T *input_ptr, T *filter_grad_ptr) { + auto cudnn_handle = ctx.cudnn_handle(); - // set 
output desc - out_desc_.set(output_shape, format_, dtype_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, out_desc_.desc()); + CudnnFusionOp *wgrad_op = GetBackwardFilterOp(ctx); + size_t workspace_size = RoundUp( + static_cast(wgrad_op->GetWorkspaceSizeInBytes(cudnn_handle)), + 512); - // set output_stats desc - out_stats_desc_.set(stats_shape, format_, cudnn_fwd_compute_type_); - fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YSTATS_DESC, - out_stats_desc_.desc()); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, input_ptr); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, output_grad_ptr); + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_DWDATA, filter_grad_ptr); + wgrad_op->SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &workspace_size); - fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, CUDNN_BATCHNORM_SPATIAL); + ctx.cudnn_workspace_handle().RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + wgrad_op->SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, + workspace_ptr); + // fused op execute + wgrad_op->Execute(cudnn_handle); + }, + workspace_size); } - void GetWorkspaceSize(const platform::CUDADeviceContext &ctx) { - auto handle = ctx.cudnn_handle(); - fwd_workspace_byte_ = fwd_op_.GetWorkspaceSizeInBytes(handle); + void BackwardData(const platform::CUDADeviceContext &ctx, T *output_grad_ptr, + T *filter_ptr, T *input_grad_ptr, bool use_addto = false) { + auto cudnn_handle = ctx.cudnn_handle(); + size_t workspace_size = GetWorkspaceSizeBwdData(ctx); + + // Convolution dgrad followed optionally by batchnorm dgrad + ScalingParamType alpha = 1.0f; + ScalingParamType beta = use_addto ? 1.0f : 0.0f; + ctx.cudnn_workspace_handle().RunFunc( + [&](void *cudnn_workspace_ptr) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnConvolutionBackwardData( + cudnn_handle, &alpha, args_.filter_desc.desc(), filter_ptr, + args_.out_desc.desc(), output_grad_ptr, + args_.conv_desc.desc(), dgrad_algo_, cudnn_workspace_ptr, + workspace_size, &beta, args_.in_desc.desc(), input_grad_ptr)); + }, + workspace_size); } - size_t fwd_workspace_byte_ = 0; + CudnnFusionOp *GetBackwardFilterOp(const platform::CUDADeviceContext &ctx) { + framework::AlgorithmsCache &cache = + *(CudnnFusionOpCache::Instance().GetBackward()); + + CudnnFusionOp *wgrad_op = cache.GetAlgorithm( + args_.in_dims, args_.filter_dims, args_.strides, args_.paddings, + args_.dilations, 0, static_cast(args_.dtype), [&]() { + CudnnFusionOp *wgrad_op = + new CudnnFusionOp(CUDNN_FUSED_SCALE_BIAS_ACTIVATION_WGRAD); + + wgrad_op->SetOpConstParamAttr( + {CUDNN_PARAM_DYDATA_PLACEHOLDER, CUDNN_PARAM_XDATA_PLACEHOLDER, + CUDNN_PARAM_DWDATA_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); - cudnnDataType_t dtype_; - cudnnDataType_t cudnn_fwd_compute_type_; - platform::TensorDescriptor in_desc_; - platform::FilterDescriptor filter_desc_; - platform::TensorDescriptor out_desc_; - platform::TensorDescriptor out_stats_desc_; - platform::ConvolutionDescriptor conv_desc_; - cudnnTensorFormat_t format_; + // conv desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_CONV_DESC, + args_.conv_desc.desc()); + // input desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_XDESC, + args_.in_desc.desc()); + // filter desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DWDESC, + args_.filter_desc.desc()); + // output desc + wgrad_op->SetOpConstParamDesc(CUDNN_PARAM_DYDESC, + args_.out_desc.desc()); + wgrad_op->SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); - CudnnFusionOp fwd_op_; + // Make cudnn fused ops plan + 
wgrad_op->GetWorkspaceSizeInBytes(ctx.cudnn_handle()); + return wgrad_op; + }); + return wgrad_op; + } + + size_t GetWorkspaceSizeBwdData(const platform::CUDADeviceContext &ctx) { + size_t workspace_size = 0U; + auto handle = ctx.cudnn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( + handle, args_.filter_desc.desc(), args_.out_desc.desc(), + args_.conv_desc.desc(), args_.in_desc.desc(), dgrad_algo_, + &workspace_size)); + return RoundUp(workspace_size, 512); + } + + private: + NormConvolutionArgs args_; + cudnnConvolutionBwdDataAlgo_t dgrad_algo_; }; + #endif } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 125ed856422920..23983d447e4788 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ + #include #include @@ -29,56 +30,182 @@ namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; USE_OP(conv2d); +USE_OP(conv2d_grad); USE_OP_DEVICE_KERNEL(conv2d, CUDNN); +USE_OP_DEVICE_KERNEL(conv2d_grad, CUDNN); + +template +void InitRandomTensor(const std::vector &dims, + framework::Tensor *cpu_out) { + T *cpu_out_ptr = cpu_out->mutable_data(framework::make_ddim(dims), + platform::CPUPlace()); + + std::default_random_engine random(0); + std::uniform_real_distribution dis(0.0, 1.0); + for (int i = 0; i < cpu_out->numel(); ++i) { + cpu_out_ptr[i] = static_cast(dis(random)); + } +} -// get paddle conv2d op results as baseline template -void Conv2DForwardCompute(const Tensor &x, const Tensor &w, Tensor *y, - const platform::CUDADeviceContext &ctx) { +void TransposeNchwToNhwc(const framework::Tensor &cpu_in, + framework::Tensor *cpu_out) { + auto in_dims = cpu_in.dims(); + EXPECT_EQ(cpu_in.dims().size(), 4); + + const T *cpu_in_ptr = cpu_in.data(); + T *cpu_out_ptr = cpu_out->mutable_data( + {in_dims[0], in_dims[2], in_dims[3], in_dims[1]}, platform::CPUPlace()); + + int64_t n = in_dims[0]; + int64_t c = in_dims[1]; + int64_t hw = in_dims[2] * in_dims[3]; + for (int i = 0; i < n; ++i) { + for (int j = 0; j < hw; ++j) { + for (int k = 0; k < c; ++k) { + int dst_idx = i * hw * c + j * c + k; + int src_idx = i * c * hw + k * hw + j; + cpu_out_ptr[dst_idx] = cpu_in_ptr[src_idx]; + } + } + } +} + +template +void CheckOutput(const framework::Tensor &cpu_res, + const framework::Tensor &cpu_base, float diff, + bool is_relative_atol = false) { + EXPECT_EQ(cpu_res.dims(), cpu_base.dims()); + + const T *cpu_res_ptr = cpu_res.data(); + const T *cpu_base_ptr = cpu_base.data(); + for (int i = 0; i < cpu_res.numel(); ++i) { + if (is_relative_atol) { + EXPECT_LT(static_cast(std::abs((cpu_res_ptr[i] - cpu_base_ptr[i]) / + cpu_base_ptr[i])), + diff); + } else { + EXPECT_LT(static_cast(std::abs(cpu_res_ptr[i] - cpu_base_ptr[i])), + diff); + } + } +} + +// Use Paddle conv2d op results as baseline +void ComputeConv2DForward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_input, const Tensor &cpu_filter, + Tensor *cpu_output, int stride, int padding) { framework::Scope scope; - auto var_x = scope.Var("Input"); - auto tensor_x = var_x->GetMutable(); - auto var_w = scope.Var("Filter"); - 
auto tensor_w = var_w->GetMutable(); - auto var_y = scope.Var("Output"); - auto tensor_y = var_y->GetMutable(); + auto *input = scope.Var("Input")->GetMutable(); + auto *filter = scope.Var("Filter")->GetMutable(); + auto *output = scope.Var("Output")->GetMutable(); auto place = ctx.GetPlace(); - TensorCopySync(x, place, tensor_x); - TensorCopySync(w, place, tensor_w); + TensorCopySync(cpu_input, place, input); + TensorCopySync(cpu_filter, place, filter); framework::AttributeMap attrs; bool use_cudnn = true; std::string data_format = "NHWC"; - std::string padding_algorithm = "SAME"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); attrs.insert({"use_cudnn", use_cudnn}); attrs.insert({"data_format", data_format}); - attrs.insert({"padding_algorithm", padding_algorithm}); auto op = framework::OpRegistry::CreateOp( "conv2d", {{"Input", {"Input"}}, {"Filter", {"Filter"}}}, {{"Output", {"Output"}}}, attrs); op->Run(scope, ctx.GetPlace()); - TensorCopySync(*tensor_y, place, y); - ctx.Wait(); + TensorCopySync(*output, platform::CPUPlace(), cpu_output); +} + +// Use Paddle conv2d_grad op results as baseline +void ComputeConv2DBackward(const platform::CUDADeviceContext &ctx, + const Tensor &cpu_input, const Tensor &cpu_filter, + const Tensor &cpu_output_grad, + framework::Tensor *cpu_input_grad, + framework::Tensor *cpu_filter_grad, int stride, + int padding, int dilation) { + framework::Scope scope; + auto *input = scope.Var("Input")->GetMutable(); + auto *filter = scope.Var("Filter")->GetMutable(); + auto *output_grad = + scope.Var("Output@GRAD")->GetMutable(); + auto *input_grad = + scope.Var("Input@GRAD")->GetMutable(); + auto *filter_grad = + scope.Var("Filter@GRAD")->GetMutable(); + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input, place, input); + TensorCopySync(cpu_filter, place, filter); + TensorCopySync(cpu_output_grad, place, output_grad); + + framework::AttributeMap attrs; + bool use_cudnn = true; + std::string data_format = "NHWC"; + std::string padding_algorithm = "EXPLICIT"; + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector dilations = {dilation, dilation}; + int groups = 1; + bool exhaustive_search = false; + bool use_addto = false; + attrs.insert({"use_cudnn", use_cudnn}); + attrs.insert({"data_format", data_format}); + attrs.insert({"padding_algorithm", padding_algorithm}); + attrs.insert({"strides", strides}); + attrs.insert({"paddings", paddings}); + attrs.insert({"dilations", dilations}); + attrs.insert({"groups", groups}); + attrs.insert({"exhaustive_search", exhaustive_search}); + attrs.insert({"use_addto", use_addto}); + + auto op = framework::OpRegistry::CreateOp( + "conv2d_grad", {{"Input", {"Input"}}, + {"Filter", {"Filter"}}, + {"Output@GRAD", {"Output@GRAD"}}}, + {{"Input@GRAD", {"Input@GRAD"}}, {"Filter@GRAD", {"Filter@GRAD"}}}, + attrs); + op->Run(scope, ctx.GetPlace()); + + TensorCopySync(*input_grad, platform::CPUPlace(), cpu_input_grad); + TensorCopySync(*filter_grad, platform::CPUPlace(), cpu_filter_grad); } template -class TestCudnnNormConvOpForward { - public: - TestCudnnNormConvOpForward() { - batch_size_ = 2; - height_ = 8; - width_ = 8; - input_channels_ = 8; - output_channels_ = 32; - kernel_size_ = 1; - stride_ = 1; - pad_ = 0; +void ComputeSumAndSquareSum(const framework::Tensor &cpu_out, + framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + auto dims = 
cpu_out.dims(); + int64_t c = dims[3]; + + const T *cpu_out_ptr = cpu_out.data(); + float *cpu_sum_ptr = + cpu_sum->mutable_data({1, 1, 1, c}, platform::CPUPlace()); + float *cpu_sum_square_ptr = cpu_sum_of_square->mutable_data( + {1, 1, 1, c}, platform::CPUPlace()); + + for (int j = 0; j < c; ++j) { + float tmp_sum = 0.0f; + float tmp_sum_of_squares = 0.0f; + for (int i = 0; i < cpu_out.numel() / c; ++i) { + float tmp_out = static_cast(cpu_out_ptr[i * c + j]); + tmp_sum += tmp_out; + tmp_sum_of_squares += tmp_out * tmp_out; + } + cpu_sum_ptr[j] = tmp_sum; + cpu_sum_square_ptr[j] = tmp_sum_of_squares; } +} - TestCudnnNormConvOpForward(int batch_size, int height, int width, +template +class CudnnNormConvolutionTester { + public: + CudnnNormConvolutionTester(int batch_size, int height, int width, int input_channels, int output_channels, int kernel_size, int stride) { batch_size_ = batch_size; @@ -88,133 +215,180 @@ class TestCudnnNormConvOpForward { output_channels_ = output_channels; kernel_size_ = kernel_size; stride_ = stride; - pad_ = (kernel_size_ - 1) / 2; + padding_ = (kernel_size_ - 1) / 2; + out_height_ = (height_ + 2 * padding_ - kernel_size_) / stride_ + 1; + out_width_ = (width_ + 2 * padding_ - kernel_size_) / stride_ + 1; + SetUp(); + } + + ~CudnnNormConvolutionTester() {} + + void CheckForward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_output_base; + framework::Tensor cpu_sum_base; + framework::Tensor cpu_sum_of_square_base; + BaselineForward(*ctx, &cpu_output_base, &cpu_sum_base, + &cpu_sum_of_square_base); + + framework::Tensor cpu_output; + framework::Tensor cpu_sum; + framework::Tensor cpu_sum_of_square; + FusedForward(*ctx, &cpu_output, &cpu_sum, &cpu_sum_of_square); + + // Check forward correctness between baseline and results of normconv. + CheckOutput(cpu_output, cpu_output_base, diff, is_relative_atol); + CheckOutput(cpu_sum, cpu_sum_base, diff, is_relative_atol); + CheckOutput(cpu_sum_of_square, cpu_sum_of_square_base, diff, + is_relative_atol); } - ~TestCudnnNormConvOpForward() {} + void CheckBackward(float diff, bool is_relative_atol = false) { + platform::CUDADeviceContext *ctx = + static_cast( + platform::DeviceContextPool::Instance().Get( + platform::CUDAPlace(0))); + + framework::Tensor cpu_input_grad_base; + framework::Tensor cpu_filter_nchw_grad_base; + framework::Tensor cpu_filter_nhwc_grad_base; + BaselineBackward(*ctx, &cpu_input_grad_base, &cpu_filter_nchw_grad_base); + TransposeNchwToNhwc(cpu_filter_nchw_grad_base, + &cpu_filter_nhwc_grad_base); + framework::Tensor cpu_input_grad; + framework::Tensor cpu_filter_nhwc_grad; + FusedBackward(*ctx, &cpu_input_grad, &cpu_filter_nhwc_grad); + + // Check backward correctness between baseline and results of normconv. 
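The CheckOutput helper defined earlier in this test file applies either a relative or an absolute per-element tolerance, selected by is_relative_atol. Below is a minimal standalone sketch of that comparison rule for reference; WithinTolerance is a hypothetical helper name, and the relative branch assumes a non-zero baseline value, mirroring the division in CheckOutput.

#include <cmath>

// Sketch only: the per-element tolerance rule used by CheckOutput.
static bool WithinTolerance(float result, float baseline, float diff,
                            bool is_relative_atol) {
  return is_relative_atol ? std::abs((result - baseline) / baseline) < diff
                          : std::abs(result - baseline) < diff;
}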
+ CheckOutput(cpu_input_grad, cpu_input_grad_base, diff, is_relative_atol); + CheckOutput(cpu_filter_nhwc_grad, cpu_filter_nhwc_grad_base, diff, + is_relative_atol); + } + + private: void SetUp() { - input_size_ = batch_size_ * height_ * width_ * input_channels_; - filter_size_ = - output_channels_ * input_channels_ * kernel_size_ * kernel_size_; - output_size_ = batch_size_ * height_ * width_ * output_channels_; - param_size_ = output_channels_; - - input_vec_.resize(input_size_); - filter_raw_vec_.resize(filter_size_); - filter_pro_vec_.resize(filter_size_); - - std::default_random_engine random(0); - std::uniform_real_distribution dis(0.0, 1.0); - for (int i = 0; i < input_size_; ++i) { - input_vec_[i] = static_cast(dis(random)); - } - for (int i = 0; i < filter_size_; ++i) { - filter_raw_vec_[i] = static_cast(dis(random)); - } - // transpoes for filter - // NCHW->NHWC - for (int oc = 0; oc < output_channels_; ++oc) { - for (int kh = 0; kh < kernel_size_; ++kh) { - for (int kw = 0; kw < kernel_size_; ++kw) { - for (int ic = 0; ic < input_channels_; ++ic) { - int dst_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + - kh * kernel_size_ * input_channels_ + - kw * input_channels_ + ic; - int src_idx = oc * kernel_size_ * kernel_size_ * input_channels_ + - ic * kernel_size_ * kernel_size_ + kh * kernel_size_ + - kw; - filter_pro_vec_[dst_idx] = filter_raw_vec_[src_idx]; - } - } - } - } + InitRandomTensor({batch_size_, height_, width_, input_channels_}, + &cpu_input_); + InitRandomTensor( + {output_channels_, input_channels_, kernel_size_, kernel_size_}, + &cpu_filter_nchw_); + // transpoes for filter, NCHW -> NHWC + TransposeNchwToNhwc(cpu_filter_nchw_, &cpu_filter_nhwc_); + InitRandomTensor( + {batch_size_, out_height_, out_width_, output_channels_}, + &cpu_output_grad_); + } - framework::TensorFromVector(input_vec_, *ctx_, &input_); - input_.Resize({batch_size_, height_, width_, input_channels_}); - framework::TensorFromVector(filter_raw_vec_, *ctx_, &filter_raw_); - filter_raw_.Resize( - {output_channels_, input_channels_, kernel_size_, kernel_size_}); - framework::TensorFromVector(filter_pro_vec_, *ctx_, &filter_pro_); - filter_pro_.Resize( - {output_channels_, kernel_size_, kernel_size_, input_channels_}); - output_.Resize({batch_size_, height_, width_, output_channels_}); - base_output_.Resize({batch_size_, height_, width_, output_channels_}); - sum_.Resize({1, 1, 1, output_channels_}); - sum_of_squares_.Resize({1, 1, 1, output_channels_}); - ctx_->Wait(); + void BaselineForward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_output_base, + framework::Tensor *cpu_sum_base, + framework::Tensor *cpu_sum_of_square_base) { + ComputeConv2DForward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_base, + stride_, padding_); + ComputeSumAndSquareSum(*cpu_output_base, cpu_sum_base, + cpu_sum_of_square_base); } - void BaselineForward() { - Conv2DForwardCompute(input_, filter_raw_, &base_output_, *ctx_); - ctx_->Wait(); + void BaselineBackward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_input_grad_base, + framework::Tensor *cpu_filter_grad_base) { + ComputeConv2DBackward(ctx, cpu_input_, cpu_filter_nchw_, cpu_output_grad_, + cpu_input_grad_base, cpu_filter_grad_base, stride_, + padding_, dilation_); } // get forward results of cudnn_norm_conv - void FusedForward() { - auto input_shape = framework::vectorize(input_.dims()); - auto filter_shape = framework::vectorize(filter_pro_.dims()); - auto output_shape = framework::vectorize(output_.dims()); - T 
*input_ptr = input_.data(); - T *filter_ptr = filter_pro_.data(); - T *output_ptr = output_.mutable_data(place_); - float *sum_ptr = sum_.mutable_data(place_); - float *sum_of_squares_ptr = sum_of_squares_.mutable_data(place_); - - std::shared_ptr> conv_op( - new op::CudnnNormConvolutionOp()); - conv_op->Init(*ctx_, input_shape, filter_shape, output_shape, pad_, stride_, - dilate_, group_); - conv_op->Forward(*ctx_, input_ptr, filter_ptr, output_ptr, sum_ptr, - sum_of_squares_ptr); - ctx_->Wait(); - } + void FusedForward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_output, framework::Tensor *cpu_sum, + framework::Tensor *cpu_sum_of_square) { + framework::Tensor input; + framework::Tensor filter_nhwc; + framework::Tensor output; + framework::Tensor sum; + framework::Tensor sum_of_square; - void Run() { - SetUp(); - BaselineForward(); - FusedForward(); + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input_, place, &input); + TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + + output.Resize(framework::make_ddim( + {batch_size_, out_height_, out_width_, output_channels_})); + sum.Resize(framework::make_ddim({1, 1, 1, output_channels_})); + sum_of_square.Resize(framework::make_ddim({1, 1, 1, output_channels_})); + + auto input_shape = framework::vectorize(input.dims()); + auto filter_shape = framework::vectorize(filter_nhwc.dims()); + auto output_shape = framework::vectorize(output.dims()); + op::CudnnNormConvolution conv_op(ctx, input_shape, filter_shape, + output_shape, padding_, stride_, + dilation_, group_); + conv_op.Forward(ctx, input, filter_nhwc, &output, &sum, &sum_of_square); + + TensorCopySync(output, platform::CPUPlace(), cpu_output); + TensorCopySync(sum, platform::CPUPlace(), cpu_sum); + TensorCopySync(sum_of_square, platform::CPUPlace(), cpu_sum_of_square); } - // check forward correctness between baseline and results of normconv. 
- void CheckOut(const T diff, bool is_relative_atol = false) { - std::vector base_output_vec, output_vec; - output_vec.resize(output_size_); - base_output_vec.resize(output_size_); - TensorToVector(base_output_, *ctx_, &base_output_vec); - TensorToVector(output_, *ctx_, &output_vec); - ctx_->Wait(); - - for (int i = 0; i < output_size_; ++i) { - if (is_relative_atol) { - EXPECT_LT( - std::abs((output_vec[i] - base_output_vec[i]) / base_output_vec[i]), - diff); - } else { - EXPECT_LT(std::abs(output_vec[i] - base_output_vec[i]), diff); - } - } + void FusedBackward(const platform::CUDADeviceContext &ctx, + framework::Tensor *cpu_input_grad, + framework::Tensor *cpu_filter_grad) { + framework::Tensor input; + framework::Tensor filter_nhwc; + framework::Tensor output_grad; + framework::Tensor input_grad; + framework::Tensor filter_grad; + + auto place = ctx.GetPlace(); + TensorCopySync(cpu_input_, place, &input); + TensorCopySync(cpu_filter_nhwc_, place, &filter_nhwc); + TensorCopySync(cpu_output_grad_, place, &output_grad); + + input_grad.Resize(input.dims()); + filter_grad.Resize(filter_nhwc.dims()); + + auto input_shape = framework::vectorize(input.dims()); + auto filter_shape = framework::vectorize(filter_nhwc.dims()); + auto output_shape = framework::vectorize(output_grad.dims()); + op::CudnnNormConvolutionGrad conv_grad_op(ctx, input_shape, filter_shape, + output_shape, padding_, + stride_, dilation_, group_); + conv_grad_op.Backward(ctx, input, filter_nhwc, output_grad, &input_grad, + &filter_grad); + + TensorCopySync(input_grad, platform::CPUPlace(), cpu_input_grad); + TensorCopySync(filter_grad, platform::CPUPlace(), cpu_filter_grad); } private: - int batch_size_, height_, width_, input_channels_, output_channels_; - int kernel_size_, stride_, pad_; - const int dilate_ = 1; + int batch_size_; + int height_; + int width_; + int out_height_; + int out_width_; + int input_channels_; + int output_channels_; + int kernel_size_; + int stride_; + int padding_; + const int dilation_ = 1; const int group_ = 1; - int input_size_, filter_size_, output_size_, param_size_; - framework::Tensor input_, filter_raw_, filter_pro_, output_, base_output_; - framework::Tensor sum_, sum_of_squares_; - std::vector input_vec_, filter_raw_vec_, filter_pro_vec_; + // Forward input + framework::Tensor cpu_input_; + framework::Tensor cpu_filter_nchw_; + framework::Tensor cpu_filter_nhwc_; - platform::CUDAPlace place_ = platform::CUDAPlace(0); - platform::CUDADeviceContext *ctx_ = - static_cast( - platform::DeviceContextPool::Instance().Get(place_)); + // Backward input + framework::Tensor cpu_output_grad_; }; // test for fp16, kernel = 1, output_channels = input_channels -TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { +TEST(CudnnNormConvFp16, K1S1) { int batch_size = 4; int height = 56; int width = 56; @@ -222,15 +396,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward1Fp16) { int output_channels = 32; int kernel_size = 1; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } // test for fp16, kernel = 3, output_channels = input_channels -TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { +TEST(CudnnNormConvFp16, K3S1) { int batch_size = 4; int height = 56; int width = 56; @@ -238,15 +412,15 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward2Fp16) { 
int output_channels = 32; int kernel_size = 3; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); } // test for fp16, kernel = 1, output_channels = input_channels * 4 -TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { +TEST(CudnnNormConvFp16, K1S1O4) { int batch_size = 4; int height = 56; int width = 56; @@ -254,9 +428,34 @@ TEST(CudnnNormConvForward, GPUCudnnNormConvForward3Fp16) { int output_channels = 128; int kernel_size = 1; int stride = 1; - TestCudnnNormConvOpForward test( + CudnnNormConvolutionTester test( batch_size, height, width, input_channels, output_channels, kernel_size, stride); - test.Run(); - test.CheckOut(static_cast(1e-3), true); + test.CheckForward(1e-3, true); + test.CheckBackward(1e-3, true); +} + +// test for fp16, kernel = 1, stride = 2, output_channels = input_channels * 4 +TEST(CudnnNormConvFp16, K1S2O4) { + int batch_size = 4; + int height = 8; + int width = 8; + int input_channels = 32; + int output_channels = 128; + int kernel_size = 1; + int stride = 2; + CudnnNormConvolutionTester test( + batch_size, height, width, input_channels, output_channels, kernel_size, + stride); + platform::CUDADeviceContext *ctx = static_cast( + platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); + + if (ctx->GetComputeCapability() <= 70) { + ASSERT_THROW(test.CheckForward(1e-3, true), + paddle::platform::EnforceNotMet); + ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet); + } else { + ASSERT_NO_THROW(test.CheckForward(1e-3, true)); + ASSERT_NO_THROW(test.CheckBackward(1e-3)); + } } diff --git a/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h new file mode 100644 index 00000000000000..5166ff27234f23 --- /dev/null +++ b/paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h @@ -0,0 +1,317 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/fluid/operators/fused/cudnn_fusion_helper.h" +#include "paddle/fluid/platform/cudnn_desc.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +template +using CudnnDataType = platform::CudnnDataType; +namespace dynload = platform::dynload; +template +using BatchNormParamType = + typename platform::CudnnDataType::BatchNormParamType; + +#if CUDNN_VERSION >= 8000 + +template +struct ScaleBiasAddReluArgs { + ScaleBiasAddReluArgs() { + dtype = platform::CudnnDataType::type; + param_dtype = platform::CudnnDataType>::type; + format = CUDNN_TENSOR_NHWC; + } + + void Set(const std::string &act_type, const std::vector &data_shape, + const std::vector &param_shape, + const std::vector &bitmask_shape) { + PADDLE_ENFORCE_EQ( + data_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of data_shape is expected to be 4. But received " + "data_shape's size is %d, data_shape is [%s].", + data_shape.size(), framework::make_ddim(data_shape))); + PADDLE_ENFORCE_EQ( + param_shape.size(), 4U, + platform::errors::InvalidArgument( + "The size of param_shape is expected to be 4. But received " + "param_shape's size is %d, param_shape is [%s].", + param_shape.size(), framework::make_ddim(param_shape))); + PADDLE_ENFORCE_EQ( + bitmask_shape.size(), 3U, + platform::errors::InvalidArgument( + "The size of bitmask_shape is expected to be 3. But received " + "bitmask_shape's size is %d, bitmask_shape is [%s].", + bitmask_shape.size(), framework::make_ddim(bitmask_shape))); + + in_desc.set(data_shape, format, dtype); + out_desc.set(data_shape, format, dtype); + equiv_scale_bias_desc.set(param_shape, format, dtype); + scale_bias_mean_var_desc.set(param_shape, format, param_dtype); + bitmask_desc.set(bitmask_shape, format, CUDNN_DATA_INT32); + // set activation desc + cudnnActivationMode_t mode = CUDNN_ACTIVATION_IDENTITY; + if (act_type != "") { + PADDLE_ENFORCE_EQ( + act_type, "relu", + platform::errors::InvalidArgument( + "Only relu activation is supported in normalized convolution.")); + mode = CUDNN_ACTIVATION_RELU; + } + double dummy_clip = 0.0; + activation_desc.set(mode, dummy_clip); + } + + cudnnDataType_t dtype; + cudnnDataType_t param_dtype; + cudnnTensorFormat_t format; + + platform::TensorDescriptor in_desc; + platform::TensorDescriptor out_desc; + platform::TensorDescriptor equiv_scale_bias_desc; + platform::TensorDescriptor scale_bias_mean_var_desc; + platform::TensorDescriptor bitmask_desc; + platform::ActivationDescriptor activation_desc; +}; + +template +class CudnnScaleBiasAddRelu { + public: + CudnnScaleBiasAddRelu(const platform::CUDADeviceContext &ctx, + const std::string &act_type, bool fuse_add, + bool has_shortcut, const std::vector &data_shape, + const std::vector &param_shape, + const std::vector &bitmask_shape) + : fwd_op_(CUDNN_FUSED_SCALE_BIAS_ADD_ACTIVATION_GEN_BITMASK), + bwd_op_(CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM) { + fuse_add_ = fuse_add; + has_shortcut_ = has_shortcut; + args_.Set(act_type, data_shape, param_shape, bitmask_shape); + } + + ~CudnnScaleBiasAddRelu() {} + + void Forward(const platform::CUDADeviceContext &ctx, const Tensor &x, + const Tensor &x_scale, const Tensor &x_bias, const Tensor *z, + const Tensor *z_scale, const Tensor *z_bias, Tensor *out, + Tensor *bitmask) { + ForwardInit(ctx); + auto handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + fwd_workspace_byte_ = 
fwd_op_.GetWorkspaceSizeInBytes(handle); + // Set variant_param + // input ptr + T *x_ptr = const_cast(x.data()); + T *x_scale_ptr = const_cast(x_scale.data()); + T *x_bias_ptr = const_cast(x_bias.data()); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQSCALE, x_scale_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_EQBIAS, x_bias_ptr); + if (has_shortcut_) { + T *z_ptr = const_cast(z->data()); + T *z_scale_ptr = const_cast(z_scale->data()); + T *z_bias_ptr = const_cast(z_bias->data()); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQSCALE, z_scale_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_Z_EQBIAS, z_bias_ptr); + } else { + if (fuse_add_) { + T *z_ptr = const_cast(z->data()); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ZDATA, z_ptr); + } + } + + fwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &fwd_workspace_byte_); + + // output ptr + T *out_ptr = out->mutable_data(place); + int32_t *bitmask_ptr = bitmask->mutable_data(place); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_YDATA, out_ptr); + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); + + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + fwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // workspace ptr + fwd_op_.Execute(handle); + }, + fwd_workspace_byte_); + } + + void Backward(const platform::CUDADeviceContext &ctx, const Tensor &dy, + const Tensor &x, const Tensor &scale, const Tensor &bias, + const Tensor &saved_mean, const Tensor &saved_invstd, + const Tensor *bitmask, Tensor *dx, Tensor *dz, Tensor *dscale, + Tensor *dbias, double eps) { + BackwardInit(ctx); + auto handle = ctx.cudnn_handle(); + auto place = ctx.GetPlace(); + auto workspace_handle = ctx.cudnn_workspace_handle(); + bwd_workspace_byte_ = bwd_op_.GetWorkspaceSizeInBytes(handle); + // Set variant_param + // input ptr + T *dy_ptr = const_cast(dy.data()); + T *x_ptr = const_cast(x.data()); + float *scale_ptr = const_cast(scale.data()); + float *bias_ptr = const_cast(bias.data()); + float *saved_mean_ptr = const_cast(saved_mean.data()); + float *saved_invstd_ptr = const_cast(saved_invstd.data()); + int32_t *bitmask_ptr = + bitmask ? const_cast(bitmask->data()) : nullptr; + T *dx_ptr = dx->mutable_data(place); + T *dz_ptr = dz ? dz->mutable_data(place) : nullptr; + float *dscale_ptr = dscale ? dscale->mutable_data(place) : nullptr; + float *dbias_ptr = dbias ? 
dbias->mutable_data(place) : nullptr; + + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_XDATA, x_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DYDATA, dy_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SCALE, scale_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_BIAS, bias_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_MEAN, saved_mean_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_SAVED_INVSTD, + saved_invstd_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_ACTIVATION_BITMASK, bitmask_ptr); + + bwd_op_.SetOpVariantParamAttrPtr( + CUDNN_SCALAR_SIZE_T_WORKSPACE_SIZE_IN_BYTES, &bwd_workspace_byte_); + + // output ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DXDATA, dx_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DSCALE, dscale_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_BN_DBIAS, dbias_ptr); + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_SCALAR_DOUBLE_BN_EPSILON, + &eps); + if (has_shortcut_ || fuse_add_) { + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_DZDATA, dz_ptr); + } + + workspace_handle.RunFunc( + [&](void *workspace_ptr) { + // workspace ptr + bwd_op_.SetOpVariantParamAttrPtr(CUDNN_PTR_WORKSPACE, workspace_ptr); + // workspace ptr + bwd_op_.Execute(handle); + }, + bwd_workspace_byte_); + } + + private: + void ForwardInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_BN_EQSCALE_PLACEHOLDER, + CUDNN_PARAM_BN_EQBIAS_PLACEHOLDER, CUDNN_PARAM_YDATA_PLACEHOLDER, + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + if (has_shortcut_) { + fwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_ZDATA_PLACEHOLDER, CUDNN_PARAM_BN_Z_EQSCALE_PLACEHOLDER, + CUDNN_PARAM_BN_Z_EQBIAS_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + } else if (fuse_add_) { + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_ZDATA_PLACEHOLDER, + CUDNN_PTR_16B_ALIGNED); + } + + // input desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + if (has_shortcut_ || fuse_add_) { + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ZDESC, args_.in_desc.desc()); + } + + // equiv scale/bias desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_EQSCALEBIAS_DESC, + args_.equiv_scale_bias_desc.desc()); + if (has_shortcut_) { + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_Z_EQSCALEBIAS_DESC, + args_.equiv_scale_bias_desc.desc()); + } + + // output desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_YDESC, args_.out_desc.desc()); + + // bitmask desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC, + args_.bitmask_desc.desc()); + + // activation desc + fwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC, + args_.activation_desc.desc()); + + // others + fwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + } + + void BackwardInit(const platform::CUDADeviceContext &ctx) { + // Set constant_param + bwd_op_.SetOpConstParamAttr( + {CUDNN_PARAM_XDATA_PLACEHOLDER, CUDNN_PARAM_DYDATA_PLACEHOLDER, + CUDNN_PARAM_DXDATA_PLACEHOLDER, CUDNN_PARAM_BN_SCALE_PLACEHOLDER, + CUDNN_PARAM_BN_BIAS_PLACEHOLDER, CUDNN_PARAM_BN_SAVED_MEAN_PLACEHOLDER, + CUDNN_PARAM_BN_SAVED_INVSTD_PLACEHOLDER, + CUDNN_PARAM_BN_DSCALE_PLACEHOLDER, CUDNN_PARAM_BN_DBIAS_PLACEHOLDER, + CUDNN_PARAM_ACTIVATION_BITMASK_PLACEHOLDER}, + CUDNN_PTR_16B_ALIGNED); + if (has_shortcut_ || fuse_add_) { + bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_DZDATA_PLACEHOLDER, + CUDNN_PTR_16B_ALIGNED); + } + + // input desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_XDESC, args_.in_desc.desc()); + 
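For context on the backward path being configured here: the fused backward op (CUDNN_FUSED_DACTIVATION_FORK_DBATCHNORM) first applies the ReLU gradient using the activation bitmask saved by the forward pass, then forks into the batch-norm gradient that produces dx, dz, dscale and dbias. A minimal scalar sketch of the bitmask step only; this is not the cuDNN kernel, and the helper name is hypothetical.

// Sketch only: zero the incoming gradient wherever the forward ReLU output
// was not positive, as recorded in the activation bitmask.
inline float DReluWithBitmask(float dy, bool mask_bit) {
  return mask_bit ? dy : 0.0f;
}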
bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DXDESC, args_.in_desc.desc()); + if (has_shortcut_ || fuse_add_) { + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DZDESC, args_.in_desc.desc()); + } + + // scale/bias/mean/var desc for backward + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_BN_SCALEBIAS_MEANVAR_DESC, + args_.scale_bias_mean_var_desc.desc()); + + // output desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_DYDESC, args_.out_desc.desc()); + + // bitmask desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_BITMASK_DESC, + args_.bitmask_desc.desc()); + + // activation desc + bwd_op_.SetOpConstParamDesc(CUDNN_PARAM_ACTIVATION_DESC, + args_.activation_desc.desc()); + + // others + bwd_op_.SetOpConstParamAttr(CUDNN_PARAM_BN_MODE, + CUDNN_BATCHNORM_SPATIAL_PERSISTENT); + } + + bool fuse_add_ = false; + bool has_shortcut_ = false; + size_t fwd_workspace_byte_; + size_t bwd_workspace_byte_; + ScaleBiasAddReluArgs args_; + CudnnFusionOp fwd_op_; + CudnnFusionOp bwd_op_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc new file mode 100644 index 00000000000000..6c4ac318264e80 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -0,0 +1,533 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FusedAttentionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasInput("SrcMask"), "Input", "SrcMask", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasInput("QKVW"), "Input", "QKVW", "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasInput("QKVBias"), "Input", "QKVBias", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearW"), "Input", "OutLinearW", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias", + "FusedAttentionOp"); + + OP_INOUT_CHECK(ctx->HasOutput("LnMean"), "Output", "LnMean", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("LnVariance"), "Output", "LnVariance", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("LnOut"), "Output", "LnOut", + "FusedAttentionOp"); + // qkv_out: [batch_size, seq_len, 3, num_head, dim_head] + OP_INOUT_CHECK(ctx->HasOutput("QKVOut"), "Output", "QKVOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("QKVBiasOut"), "Output", "QKVBiasOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("TransposeOut2"), "Output", "TransposeOut2", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("QKOut"), "Output", "QKOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("QKTVOut"), "Output", "QKTVOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("SrcMaskOut"), "Output", "SrcMaskOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("SoftmaxOut"), "Output", "SoftmaxOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("AttnDropoutMaskOut"), "Output", + "AttnDropoutMaskOut", "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("AttnDropoutOut"), "Output", "AttnDropoutOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("FMHAOut"), "Output", "FMHAOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("OutLinearOut"), "Output", "OutLinearOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("Ln2Mean"), "Output", "Ln2Mean", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("Ln2Variance"), "Output", "Ln2Variance", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("BiasDropoutResidualOut"), "Output", + "BiasDropoutResidualOut", "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("DropoutMaskOut"), "Output", "DropoutMaskOut", + "FusedAttentionOp"); + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "FusedAttentionOp"); + + // x: qkv's input [batch_size, seq_len, dim_embed] + // y: qkv's weight: [3, num_head, dim_head, dim_embed] + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("QKVW"); + PADDLE_ENFORCE_EQ(x_dim.size(), 3, platform::errors::InvalidArgument( + "The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); + PADDLE_ENFORCE_EQ(y_dim.size(), 4, + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "but received dimensions of" + "Input is [%d]", + y_dim.size())); + PADDLE_ENFORCE_EQ(x_dim[2], y_dim[3], + platform::errors::InvalidArgument( + "ShapeError: the dimension of x_dim[2] and y_dim[3]" + "must be equal. 
But received: the shape " + "of input x = [%s], and the shape of " + "input qkv_weight = [%s]", + x_dim, y_dim)); + + ctx->SetOutputDim("LnMean", {x_dim[0] * x_dim[1]}); + ctx->SetOutputDim("LnVariance", {x_dim[0] * x_dim[1]}); + ctx->SetOutputDim("LnOut", ctx->GetInputDim("X")); + // [batch_size, seq_len, 3, num_head, head_size] + ctx->SetOutputDim("QKVOut", + {x_dim[0], x_dim[1], y_dim[0], y_dim[1], y_dim[2]}); + ctx->SetOutputDim("QKVBiasOut", + {x_dim[0], x_dim[1], y_dim[0], y_dim[1], y_dim[2]}); + // [3, batch_size, num_head, seq_len, head_size] + ctx->SetOutputDim("TransposeOut2", + {y_dim[0], x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); + // [batch, num_head, seq_len, seq_len] + ctx->SetOutputDim("QKOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + ctx->SetOutputDim("SrcMaskOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + // the same as QKOut's shape. + ctx->SetOutputDim("AttnDropoutOut", + {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + if (ctx->Attrs().Get("attn_dropout_is_test") == false) { + ctx->SetOutputDim("AttnDropoutMaskOut", + {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + } + ctx->SetOutputDim("SoftmaxOut", {x_dim[0], y_dim[1], x_dim[1], x_dim[1]}); + // [batch_size, num_heads, seq_len, head_dim] + ctx->SetOutputDim("QKTVOut", {x_dim[0], y_dim[1], x_dim[1], y_dim[2]}); + // [batch_size, seq_len, number of heads*head size] + ctx->SetOutputDim("FMHAOut", {x_dim[0], x_dim[1], y_dim[1], y_dim[2]}); + ctx->SetOutputDim("OutLinearOut", ctx->GetInputDim("X")); + + ctx->SetOutputDim("Ln2Mean", {x_dim[0] * x_dim[1]}); + ctx->SetOutputDim("Ln2Variance", {x_dim[0] * x_dim[1]}); + if (ctx->Attrs().Get("dropout_is_test") == false) { + ctx->SetOutputDim("DropoutMaskOut", ctx->GetInputDim("X")); + } + ctx->SetOutputDim("BiasDropoutResidualOut", ctx->GetInputDim("X")); + ctx->SetOutputDim("Y", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = input->type(); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class FusedAttentionOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor."); + AddInput("LnScale", + "(optional) Scale is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddInput("LnBias", + "(optional) Bias is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddInput("QKVW", "The qkv weight tensor."); + AddInput("QKVBias", "The qkv bias tensor."); + AddInput("SrcMask", "(optional) The attention mask tensor in fmha.") + .AsDispensable(); + AddInput("OutLinearW", "The out_linear weight tensor."); + AddInput("OutLinearBias", "The out_linear bias tensor."); + AddInput("Ln2Scale", + "(optional) Scale is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddInput("Ln2Bias", + "(optional) Bias is a 1-dimensional tensor of size " + "H. 
Here, H represents the last dimension of its input tensor.") + .AsDispensable(); + AddOutput("LnMean", "Mean of the current mini batch.").AsIntermediate(); + AddOutput("LnVariance", "Variance of the current mini batch.") + .AsIntermediate(); + AddOutput("LnOut", "The output of pre layer_norm.").AsIntermediate(); + AddOutput("QKVOut", "Result after qkv.").AsIntermediate(); + AddOutput("QKVBiasOut", "Result after qkv and bias op.").AsIntermediate(); + AddOutput("TransposeOut2", "Result in fmha.").AsIntermediate(); + AddOutput("QKOut", "Result in fmha.").AsIntermediate(); + AddOutput("QKTVOut", "Result in fmha.").AsIntermediate(); + AddOutput("SoftmaxOut", "Result in fmha.").AsIntermediate(); + AddOutput("AttnDropoutMaskOut", "Result in fmha.").AsIntermediate(); + AddOutput("AttnDropoutOut", "Result in fmha.").AsIntermediate(); + AddOutput("SrcMaskOut", "Result in fmha.").AsIntermediate(); + AddOutput("FMHAOut", "Result after fmha.").AsIntermediate(); + AddOutput("OutLinearOut", "Result after out_linear.").AsIntermediate(); + AddOutput("DropoutMaskOut", "The random sampled dropout mask.") + .AsIntermediate(); + AddOutput("Ln2Mean", "Mean of the current mini batch.").AsIntermediate(); + AddOutput("Ln2Variance", "Variance of the current mini batch.") + .AsIntermediate(); + AddOutput("BiasDropoutResidualOut", + "Result of residual + dropout(src + bias).") + .AsIntermediate(); + AddOutput("Y", "Result after attention."); + + AddAttr("pre_layer_norm", + "if true, the attention op uses the pre_layer_norm architecture, " + "else, uses the post_layer_norm architecture. " + "[default false].") + .SetDefault(false); + AddAttr("epsilon", + "Constant for numerical stability [default 1e-5].") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' in Op(LayerNorm) should be between " + "0.0 and 0.001, but received [%s].", + epsilon)); + }); + + // for dropout in fmha. + AddAttr("attn_dropout_rate", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ( + drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'attn_dropout_rate' must be between 0.0 and 1.0.")); + }); + AddAttr("attn_dropout_is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("attn_dropout_fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. NOTE: DO NOT set this flag to true in " + "training. Setting this flag to true is only useful in " + "unittest or for debug that always the same output units " + "will be dropped.") + .SetDefault(true); + AddAttr("attn_dropout_seed", "Dropout random seed.").SetDefault(0); + AddAttr( + "attn_dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "There are two ways to implement dropout" + "(the mask below is a tensor with the same shape as the input; " + "the value of the mask is 0 or 1, and the ratio of 0 is dropout_rate)" + "1. downgrade_in_infer(default), downgrade the outcome at inference " + "time" + " train: out = input * mask" + " inference: out = input * (1.0 - dropout_rate)" + "2. upscale_in_train, upscale the outcome at training time, do nothing " + "in inference" + " train: out = input * mask / ( 1.0 - dropout_rate )" + " inference: out = input" + " dropout op can be removed from the program. 
the program will be " + "efficient") + .SetDefault("upscale_in_train") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + + AddAttr("dropout_rate", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout_rate' must be between 0.0 and 1.0.")); + }); + + AddAttr("dropout_is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr("dropout_fix_seed", + "A flag indicating whether to use a fixed seed to generate " + "random mask. NOTE: DO NOT set this flag to true in " + "training. Setting this flag to true is only useful in " + "unittest or for debug that always the same output units " + "will be dropped.") + .SetDefault(true); + AddAttr("dropout_seed", "Dropout random seed.").SetDefault(0); + AddAttr( + "dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "The meaning is the same as 'attn_dropout_implementation'.") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("ln_epsilon", + "Constant for numerical stability [default 1e-5].") + .SetDefault(1e-5) + .AddCustomChecker([](const float &ln_epsilon) { + PADDLE_ENFORCE_EQ(ln_epsilon >= 0.0f && ln_epsilon <= 0.001f, true, + platform::errors::InvalidArgument( + "'epsilon' of the second LayerNorm in Fused " + "attention op should be between" + "0.0 and 0.001, But received [%s].", + ln_epsilon)); + }); + + AddComment(R"DOC( + Add fused attention op whose logic is as follows: + // @input: [batch_size, seq_len, 3, num_head, head_dim] + // @final_out: [batch_size, seq_len, num_heads, head_dim] + if (pre_layernorm) + out = layer_norm(input); + out = compute_qkv(out) + bias; + // fmha module + { + out = transpose(out, perm=[2, 0, 3, 1, 4]); + out = q * k^t; + out = attn_mark + out; + out = softmax(out); + out = dropout(out); + out = out * v; + out = transpose(out, perm=[0, 2, 1, 3]); + + } + out = out_linear(out); + final_out = layer_norm(residual + dropout(bias + out)); + )DOC"); + } +}; + +class FusedAttentionGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->Attrs().Get("attn_dropout_is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when attn_dropout_is_test is false")); + + OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln2Variance"), "Input", "Ln2Variance", + "FusedAttentionGrad"); + if (ctx->HasOutput(framework::GradVarName("Ln2Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Ln2Scale"), + ctx->GetInputDim("Ln2Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Ln2Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Ln2Bias"), + ctx->GetInputDim("Ln2Bias")); + } + OP_INOUT_CHECK(ctx->HasInput("X"), 
"Input", "X", "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("LnMean"), "Input", "LnMean", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("LnVariance"), "Input", "LnVariance", + "FusedAttentionGrad"); + if (ctx->Attrs().Get("pre_layer_norm") == true) { + OP_INOUT_CHECK(ctx->HasInput("LnOut"), "Input", "LnOut", + "FusedAttentionGrad"); + } + OP_INOUT_CHECK(ctx->HasInput("QKVW"), "Input", "QKVW", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("QKVBias"), "Input", "QKVBias", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("SrcMask"), "Input", "SrcMask", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearW"), "Input", "OutLinearW", + "FusedAttentionGrad"); + OP_INOUT_CHECK(ctx->HasInput("OutLinearBias"), "Input", "OutLinearBias", + "FusedAttentionGrad"); + + if (ctx->HasOutput(framework::GradVarName("LnScale"))) { + ctx->SetOutputDim(framework::GradVarName("LnScale"), + ctx->GetInputDim("LnScale")); + } + if (ctx->HasOutput(framework::GradVarName("LnBias"))) { + ctx->SetOutputDim(framework::GradVarName("LnBias"), + ctx->GetInputDim("LnBias")); + } + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + ctx->SetOutputDim(framework::GradVarName("OutLinearBias"), + ctx->GetInputDim("OutLinearBias")); + ctx->SetOutputDim(framework::GradVarName("OutLinearW"), + ctx->GetInputDim("OutLinearW")); + ctx->SetOutputDim(framework::GradVarName("QKVW"), ctx->GetInputDim("QKVW")); + ctx->SetOutputDim(framework::GradVarName("QKVBias"), + ctx->GetInputDim("QKVBias")); + + ctx->SetOutputDim(framework::GradVarName("LnOut"), + ctx->GetInputDim("LnOut")); + ctx->SetOutputDim(framework::GradVarName("FMHAOut"), + ctx->GetInputDim("FMHAOut")); + ctx->SetOutputDim(framework::GradVarName("QKTVOut"), + ctx->GetInputDim("QKTVOut")); + ctx->SetOutputDim(framework::GradVarName("TransposeOut2"), + ctx->GetInputDim("TransposeOut2")); + ctx->SetOutputDim(framework::GradVarName("QKOut"), + ctx->GetInputDim("QKOut")); + ctx->SetOutputDim(framework::GradVarName("SoftmaxOut"), + ctx->GetInputDim("SoftmaxOut")); + ctx->SetOutputDim(framework::GradVarName("AttnDropoutOut"), + ctx->GetInputDim("AttnDropoutOut")); + ctx->SetOutputDim(framework::GradVarName("SrcMaskOut"), + ctx->GetInputDim("SrcMaskOut")); + ctx->SetOutputDim(framework::GradVarName("QKVOut"), + ctx->GetInputDim("QKVOut")); + ctx->SetOutputDim(framework::GradVarName("QKVBiasOut"), + ctx->GetInputDim("QKVBiasOut")); + ctx->SetOutputDim(framework::GradVarName("OutLinearOut"), + ctx->GetInputDim("OutLinearOut")); + ctx->SetOutputDim(framework::GradVarName("BiasDropoutResidualOut"), + ctx->GetInputDim("BiasDropoutResidualOut")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = input->type(); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +template +class FusedAttentionGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_attention_grad"); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + + // inputs x, parameters and their grad. 
+ op->SetInput("X", this->Input("X")); + op->SetInput("QKVW", this->Input("QKVW")); + op->SetInput("QKVBias", this->Input("QKVBias")); + op->SetInput("SrcMask", this->Input("SrcMask")); + op->SetInput("OutLinearW", this->Input("OutLinearW")); + op->SetInput("OutLinearBias", this->Input("OutLinearBias")); + if (this->HasInput("LnScale")) { + op->SetInput("LnScale", this->Input("LnScale")); + op->SetOutput(framework::GradVarName("LnScale"), + this->InputGrad("LnScale")); + } + if (this->HasInput("LnBias")) { + op->SetInput("LnBias", this->Input("LnBias")); + op->SetOutput(framework::GradVarName("LnBias"), + this->InputGrad("LnBias")); + } + if (this->HasInput("Ln2Scale")) { + op->SetInput("Ln2Scale", this->Input("Ln2Scale")); + op->SetOutput(framework::GradVarName("Ln2Scale"), + this->InputGrad("Ln2Scale")); + } + if (this->HasInput("Ln2Bias")) { + op->SetInput("Ln2Bias", this->Input("Ln2Bias")); + op->SetOutput(framework::GradVarName("Ln2Bias"), + this->InputGrad("Ln2Bias")); + } + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("QKVW"), this->InputGrad("QKVW")); + op->SetOutput(framework::GradVarName("QKVBias"), + this->InputGrad("QKVBias")); + op->SetOutput(framework::GradVarName("OutLinearBias"), + this->InputGrad("OutLinearBias")); + op->SetOutput(framework::GradVarName("OutLinearW"), + this->InputGrad("OutLinearW")); + + // use forward outputs as backward inputs. + op->SetInput("LnOut", this->Output("LnOut")); + op->SetInput("LnMean", this->Output("LnMean")); + op->SetInput("LnVariance", this->Output("LnVariance")); + op->SetInput("QKVOut", this->Output("QKVOut")); + op->SetInput("QKVBiasOut", this->Output("QKVBiasOut")); + op->SetInput("TransposeOut2", this->Output("TransposeOut2")); + op->SetInput("QKOut", this->Output("QKOut")); + op->SetInput("QKTVOut", this->Output("QKTVOut")); + op->SetInput("SoftmaxOut", this->Output("SoftmaxOut")); + op->SetInput("AttnDropoutMaskOut", this->Output("AttnDropoutMaskOut")); + op->SetInput("AttnDropoutOut", this->Output("AttnDropoutOut")); + op->SetInput("SrcMaskOut", this->Output("SrcMaskOut")); + op->SetInput("FMHAOut", this->Output("FMHAOut")); + op->SetInput("OutLinearOut", this->Output("OutLinearOut")); + + op->SetInput("Ln2Mean", this->Output("Ln2Mean")); + op->SetInput("Ln2Variance", this->Output("Ln2Variance")); + op->SetInput("DropoutMaskOut", this->Output("DropoutMaskOut")); + op->SetInput("BiasDropoutResidualOut", + this->Output("BiasDropoutResidualOut")); + op->SetInput("QKVOut", this->Output("QKVOut")); + + // backward outputs: dinput + op->SetOutput(framework::GradVarName("LnOut"), this->OutputGrad("LnOut")); + op->SetOutput(framework::GradVarName("QKVOut"), this->OutputGrad("QKVOut")); + op->SetOutput(framework::GradVarName("QKVBiasOut"), + this->OutputGrad("QKVBiasOut")); + op->SetOutput(framework::GradVarName("QKTVOut"), + this->OutputGrad("QKTVOut")); + op->SetOutput(framework::GradVarName("TransposeOut2"), + this->OutputGrad("TransposeOut2")); + op->SetOutput(framework::GradVarName("QKOut"), this->OutputGrad("QKOut")); + op->SetOutput(framework::GradVarName("SoftmaxOut"), + this->OutputGrad("SoftmaxOut")); + op->SetOutput(framework::GradVarName("AttnDropoutOut"), + this->OutputGrad("AttnDropoutOut")); + op->SetOutput(framework::GradVarName("SrcMaskOut"), + this->OutputGrad("SrcMaskOut")); + op->SetOutput(framework::GradVarName("FMHAOut"), + this->OutputGrad("FMHAOut")); + op->SetOutput(framework::GradVarName("BiasDropoutResidualOut"), + 
this->OutputGrad("BiasDropoutResidualOut")); + op->SetOutput(framework::GradVarName("OutLinearOut"), + this->OutputGrad("OutLinearOut")); + + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_attention, ops::FusedAttentionOp, + ops::FusedAttentionOpMaker, + ops::FusedAttentionGradOpMaker, + ops::FusedAttentionGradOpMaker); +REGISTER_OPERATOR(fused_attention_grad, ops::FusedAttentionGradOp); diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu new file mode 100644 index 00000000000000..95e690cb17ec14 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -0,0 +1,444 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/math/math_function.h" + +#include "paddle/fluid/operators/fused/attention_layer_norm.h" +#include "paddle/fluid/operators/fused/attn_gemm.h" +#include "paddle/fluid/operators/fused/fmha_ref.h" +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedAttentionOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + auto *input_x = ctx.Input("X"); + + const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); + const float epsilon = ctx.Attr("epsilon"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_bias = ctx.Input("LnBias"); + auto *ln_mean = ctx.Output("LnMean"); + auto *ln_var = ctx.Output("LnVariance"); + auto *ln_out = ctx.Output("LnOut"); + + // x: qkv's input [batch_size, seq_len, dim_embed] + // y: qkv's weight: [3, num_head, dim_head, dim_embed] + auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_bias = ctx.Input("QKVBias"); + auto *qkv_out = ctx.Output("QKVOut"); + auto *qkv_bias_out = ctx.Output("QKVBiasOut"); + + auto *src_mask = ctx.Input("SrcMask"); + auto *transpose_out_2 = ctx.Output("TransposeOut2"); + auto *qk_out = ctx.Output("QKOut"); + auto *qktv_out = ctx.Output("QKTVOut"); + auto *softmax_out = ctx.Output("SoftmaxOut"); + auto *attn_dropout_mask_out = ctx.Output("AttnDropoutMaskOut"); + auto *attn_dropout_out = ctx.Output("AttnDropoutOut"); + auto *src_mask_out = ctx.Output("SrcMaskOut"); + auto *fmha_out = ctx.Output("FMHAOut"); + + auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_bias = ctx.Input("OutLinearBias"); + auto *out_linear_out = ctx.Output("OutLinearOut"); + + auto *ln_scale_2 = ctx.Input("Ln2Scale"); + auto *ln_bias_2 = ctx.Input("Ln2Bias"); + auto *dropout_mask_out 
= ctx.Output("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Output("BiasDropoutResidualOut"); + auto *ln_mean_2 = ctx.Output("Ln2Mean"); + auto *ln_var_2 = ctx.Output("Ln2Variance"); + const float ln_epsilon = ctx.Attr("ln_epsilon"); + + float attn_dropout_rate = ctx.Attr("attn_dropout_rate"); + bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + auto &dropout_implementation_1 = + ctx.Attr("attn_dropout_implementation"); + bool is_upscale_in_train_1 = + (dropout_implementation_1 == "upscale_in_train"); + auto *seed_1 = ctx.HasInput("Seed1") ? ctx.Input("Seed1") : nullptr; + bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); + int seed_val_1 = ctx.Attr("attn_dropout_seed"); + + // final output. + auto *out = ctx.Output("Y"); + + // get data ptr for qkv part. + const auto input_x_dims = input_x->dims(); + const auto qkv_w_dims = qkv_weight->dims(); + + auto *x_data = input_x->data(); + auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); + auto *ln_bias_data = (ln_bias == nullptr ? nullptr : ln_bias->data()); + auto *ln_mean_data = ln_mean->mutable_data(ctx.GetPlace()); + auto *ln_var_data = ln_var->mutable_data(ctx.GetPlace()); + auto *ln_out_data = ln_out->mutable_data(ctx.GetPlace()); + + auto *qkv_weight_data = qkv_weight->data(); + auto *qkv_bias_data = qkv_bias->data(); + auto *qkv_out_data = qkv_out->mutable_data(ctx.GetPlace()); + auto *qkv_bias_out_data = qkv_bias_out->mutable_data(ctx.GetPlace()); + + // get data ptr for FMHA. + auto *transpose_out_2_data = + transpose_out_2->mutable_data(ctx.GetPlace()); + auto *qk_out_data = qk_out->mutable_data(ctx.GetPlace()); + auto *qktv_out_data = qktv_out->mutable_data(ctx.GetPlace()); + auto *src_mask_out_data = src_mask_out->mutable_data(ctx.GetPlace()); + auto *softmax_out_data = softmax_out->mutable_data(ctx.GetPlace()); + auto *attn_dropout_mask_out_data = + attn_dropout_mask_out->mutable_data(ctx.GetPlace()); + auto *attn_dropout_out_data = + attn_dropout_out->mutable_data(ctx.GetPlace()); + auto *fmha_out_data = fmha_out->mutable_data(ctx.GetPlace()); + + // get data ptr for out_linear. + auto *out_linear_weight_data = out_linear_weight->data(); + auto *out_linear_bias_data = out_linear_bias->data(); + auto *out_linear_out_data = out_linear_out->mutable_data(ctx.GetPlace()); + + // get data ptr for bias+dropout+residual+layernorm + auto *ln_scale_2_data = + (ln_scale_2 == nullptr ? nullptr : ln_scale_2->data()); + auto *ln_bias_2_data = + (ln_bias_2 == nullptr ? 
nullptr : ln_bias_2->data()); + auto *dropout_mask_out_data = + dropout_mask_out->mutable_data(ctx.GetPlace()); + auto *bias_dropout_residual_out_data = + bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + auto *ln_mean_2_data = ln_mean_2->mutable_data(ctx.GetPlace()); + auto *ln_var_2_data = ln_var_2->mutable_data(ctx.GetPlace()); + auto *final_out_data = out->mutable_data(ctx.GetPlace()); + + int batch_size = input_x_dims[0]; + int max_seq_len = input_x_dims[1]; + int dim_embed = input_x_dims[2]; + + int num_head = qkv_w_dims[1]; + int dim_head = qkv_w_dims[2]; + + int bsz_seq = batch_size * max_seq_len; + int hidden_size = num_head * dim_head; + int output_size = 3 * hidden_size; + int input_size = dim_embed; + + auto layer_norm_compute = AttnLayerNorm(ctx.cuda_device_context(), + epsilon, bsz_seq, dim_embed); + // (transA, transB, compute_bias) = (false, true, true) + auto qkv_compute = AttnMatMul(ctx.cuda_device_context(), false, true, + bsz_seq, output_size, input_size, true); + + AttnDropoutParam attn_dropout_param( + is_test_1, dropout_implementation_1, attn_dropout_rate, + is_upscale_in_train_1, is_fix_seed_1, seed_val_1, seed_1); + auto fmha_ref_compute = + FMHARef(ctx.cuda_device_context(), batch_size, max_seq_len, num_head, + dim_head, attn_dropout_param); + + output_size = hidden_size; + // (transA, transB, compute_bias) = (false, false, false) + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), false, false, bsz_seq, + output_size, input_size, false); + DropoutParam dropout_param2(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, + ln_epsilon); + + if (pre_layer_norm) { + layer_norm_compute.ComputeForward(x_data, ln_scale_data, ln_bias_data, + ln_out_data, ln_mean_data, ln_var_data); + qkv_compute.ComputeForward(qkv_weight_data, ln_out_data, qkv_bias_data, + qkv_out_data, qkv_bias_out_data); + } else { + qkv_compute.ComputeForward(qkv_weight_data, x_data, qkv_bias_data, + qkv_out_data, qkv_bias_out_data); + } + fmha_ref_compute.ComputeForward(*qkv_bias_out, *src_mask, transpose_out_2, + qk_out, src_mask_out, softmax_out, + attn_dropout_mask_out, attn_dropout_out, + qktv_out, fmha_out); + // fmha_out: [batch_size, seq_len, num_head, head_dim] + // weight: [embed_dim, embed_dim] + // out_linear_out: [batch_size, seq_len, embed_dim] + out_linear_compute.ComputeForward(out_linear_weight_data, fmha_out_data, + nullptr, out_linear_out_data, nullptr); + // output = layernorm(residual + dropout(input + bias)) + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + ctx.cuda_device_context(), out_linear_out_data, x_data, + out_linear_bias_data, ln_scale_2_data, ln_bias_2_data, + bias_dropout_residual_out_data, dropout_mask_out_data, final_out_data, + ln_mean_2_data, ln_var_2_data); + } +}; + +template +class FusedAttentionGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); + const float epsilon = ctx.Attr("epsilon"); + const float ln2epsilon = ctx.Attr("ln_epsilon"); + + float attn_dropout_prob = ctx.Attr("attn_dropout_rate"); + bool is_test_1 = ctx.Attr("attn_dropout_is_test"); + auto &dropout_implementation_1 = + ctx.Attr("attn_dropout_implementation"); + bool is_upscale_in_train_1 = + (dropout_implementation_1 == "upscale_in_train"); + auto *seed_1 = ctx.HasInput("Seed1") ? 
ctx.Input("Seed1") : nullptr; + bool is_fix_seed_1 = ctx.Attr("attn_dropout_fix_seed"); + int seed_val_1 = ctx.Attr("attn_dropout_seed"); + + // get inputs. + auto *d_y = ctx.Input(framework::GradVarName("Y")); + auto *d_y_data = d_y->data(); + + // fw input + auto *input_x = ctx.Input("X"); + auto *ln_scale = ctx.Input("LnScale"); + auto *ln_2_scale = ctx.Input("Ln2Scale"); + auto *x_data = input_x->data(); + auto *ln_scale_data = (ln_scale == nullptr ? nullptr : ln_scale->data()); + auto *ln_2_scale_data = + (ln_2_scale == nullptr ? nullptr : ln_2_scale->data()); + // fw parameters. + auto *src_mask = ctx.Input("SrcMask"); + auto *qkv_weight = ctx.Input("QKVW"); + auto *qkv_bias = ctx.Input("QKVBias"); + auto *out_linear_weight = ctx.Input("OutLinearW"); + auto *out_linear_bias = ctx.Input("OutLinearBias"); + auto *src_mask_data = (src_mask == nullptr ? nullptr : src_mask->data()); + auto *qkv_weight_data = qkv_weight->data(); + auto *qkv_bias_data = qkv_bias->data(); + auto *out_linear_weight_data = out_linear_weight->data(); + auto *out_linear_bias_data = out_linear_bias->data(); + + // fw output + auto *ln_mean = ctx.Input("LnMean"); + auto *ln_var = ctx.Input("LnVariance"); + auto *ln_out = ctx.Input("LnOut"); + auto *fmha_out = ctx.Input("FMHAOut"); + auto *transpose_out_2 = ctx.Input("TransposeOut2"); + auto *qk_out = ctx.Input("QKOut"); + auto *qktv_out = ctx.Input("QKTVOut"); + auto *softmax_out = ctx.Input("SoftmaxOut"); + auto *attn_dropout_mask_out = ctx.Input("AttnDropoutMaskOut"); + auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); + auto *src_mask_out = ctx.Input("SrcMaskOut"); + auto *out_linear_out = ctx.Input("OutLinearOut"); + auto *ln_2_mean = ctx.Input("Ln2Mean"); + auto *ln_2_var = ctx.Input("Ln2Variance"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *bias_dropout_residual_out = + ctx.Input("BiasDropoutResidualOut"); + auto *ln_mean_data = ln_mean->data(); + auto *ln_var_data = ln_var->data(); + auto *ln_out_data = ln_out->data(); + auto *fmha_out_data = fmha_out->data(); + auto *transpose_out_2_data = transpose_out_2->data(); + auto *qk_out_data = qk_out->data(); + auto *qktv_out_data = qktv_out->data(); + auto *softmax_out_data = softmax_out->data(); + auto *src_mask_out_data = src_mask_out->data(); + auto *out_linear_out_data = out_linear_out->data(); + auto *ln_2_mean_data = ln_2_mean->data(); + auto *ln_2_var_data = ln_2_var->data(); + auto *dropout_mask_out_data = dropout_mask_out->data(); + auto *bias_dropout_residual_out_data = bias_dropout_residual_out->data(); + + // output's grad + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_ln_out = ctx.Output(framework::GradVarName("LnOut")); + auto *d_qkv_out = ctx.Output(framework::GradVarName("QKVOut")); + auto *d_qkv_bias_out = + ctx.Output(framework::GradVarName("QKVBiasOut")); + auto *d_qktv_out = ctx.Output(framework::GradVarName("QKTVOut")); + auto *d_transpose_out_2 = + ctx.Output(framework::GradVarName("TransposeOut2")); + auto *d_qk_out = ctx.Output(framework::GradVarName("QKOut")); + auto *d_softmax_out = + ctx.Output(framework::GradVarName("SoftmaxOut")); + auto *d_attn_dropout_out = + ctx.Output(framework::GradVarName("AttnDropoutOut")); + auto *d_src_mask_out = + ctx.Output(framework::GradVarName("SrcMaskOut")); + auto *d_fmha_out = ctx.Output(framework::GradVarName("FMHAOut")); + auto *d_out_linear_out = + ctx.Output(framework::GradVarName("OutLinearOut")); + auto *d_bias_dropout_residual_out = + ctx.Output(framework::GradVarName("BiasDropoutResidualOut")); + auto 
*d_x_data = d_x->mutable_data(ctx.GetPlace()); + auto *d_ln_out_data = d_ln_out->mutable_data(ctx.GetPlace()); + auto *d_qkv_out_data = d_qkv_out->mutable_data(ctx.GetPlace()); + auto *d_qkv_bias_out_data = d_qkv_bias_out->mutable_data(ctx.GetPlace()); + auto *d_qktv_out_data = d_qktv_out->mutable_data(ctx.GetPlace()); + auto *d_transpose_out_2_data = + d_transpose_out_2->mutable_data(ctx.GetPlace()); + auto *d_qk_out_data = d_qk_out->mutable_data(ctx.GetPlace()); + auto *d_softmax_out_data = d_softmax_out->mutable_data(ctx.GetPlace()); + auto *d_attn_dropout_out_data = + d_attn_dropout_out->mutable_data(ctx.GetPlace()); + auto *d_src_mask_out_data = d_src_mask_out->mutable_data(ctx.GetPlace()); + auto *d_fmha_out_data = d_fmha_out->mutable_data(ctx.GetPlace()); + auto *d_out_linear_out_data = + d_out_linear_out->mutable_data(ctx.GetPlace()); + auto *d_bias_dropout_residual_out_data = + d_bias_dropout_residual_out->mutable_data(ctx.GetPlace()); + + // parameter grad + auto *d_ln_scale = ctx.Output(framework::GradVarName("LnScale")); + auto *d_ln_bias = ctx.Output(framework::GradVarName("LnBias")); + auto *d_qkv_weight = ctx.Output(framework::GradVarName("QKVW")); + auto *d_qkv_bias = ctx.Output(framework::GradVarName("QKVBias")); + auto *d_out_linear_weight = + ctx.Output(framework::GradVarName("OutLinearW")); + auto *d_out_linear_bias = + ctx.Output(framework::GradVarName("OutLinearBias")); + auto *d_ln_2_scale = ctx.Output(framework::GradVarName("Ln2Scale")); + auto *d_ln_2_bias = ctx.Output(framework::GradVarName("Ln2Bias")); + auto *d_ln_scale_data = + (d_ln_scale == nullptr ? nullptr + : d_ln_scale->mutable_data(ctx.GetPlace())); + auto *d_ln_bias_data = + (d_ln_bias == nullptr ? nullptr + : d_ln_bias->mutable_data(ctx.GetPlace())); + auto *d_qkv_weight_data = d_qkv_weight->mutable_data(ctx.GetPlace()); + auto *d_qkv_bias_data = d_qkv_bias->mutable_data(ctx.GetPlace()); + auto *d_out_linear_weight_data = + d_out_linear_weight->mutable_data(ctx.GetPlace()); + auto *d_out_linear_bias_data = + d_out_linear_bias->mutable_data(ctx.GetPlace()); + auto *d_ln_2_scale_data = + (d_ln_2_scale == nullptr ? nullptr : d_ln_2_scale->mutable_data( + ctx.GetPlace())); + auto *d_ln_2_bias_data = + (d_ln_2_bias == nullptr ? 
nullptr + : d_ln_2_bias->mutable_data(ctx.GetPlace())); + + const auto input_x_dims = input_x->dims(); + const auto qkv_w_dims = qkv_weight->dims(); + + int batch_size = input_x_dims[0]; + int max_seq_len = input_x_dims[1]; + int dim_embed = input_x_dims[2]; + int num_head = qkv_w_dims[1]; + int dim_head = qkv_w_dims[2]; + + int bsz_seq = batch_size * max_seq_len; + int hidden_size = num_head * dim_head; + int output_size = 3 * hidden_size; + int input_size = dim_embed; + + Tensor d_residual; + d_residual.Resize(input_x_dims); + T *d_residual_data = d_residual.mutable_data(ctx.GetPlace()); + + bool transA = false; + bool transB = true; + bool compute_bias = true; + auto layer_norm_compute = AttnLayerNorm(ctx.cuda_device_context(), + epsilon, bsz_seq, dim_embed); + auto qkv_compute = + AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, + output_size, input_size, compute_bias); + AttnDropoutParam attn_dropout_param( + is_test_1, dropout_implementation_1, attn_dropout_prob, + is_upscale_in_train_1, is_fix_seed_1, seed_val_1, seed_1); + auto fmha_ref_compute = + FMHARef(ctx.cuda_device_context(), batch_size, max_seq_len, num_head, + dim_head, attn_dropout_param); + output_size = hidden_size; + transA = false; + transB = false; + compute_bias = false; + auto out_linear_compute = + AttnMatMul(ctx.cuda_device_context(), transA, transB, bsz_seq, + output_size, input_size, compute_bias); + DropoutParam dropout_param2(ctx, 0); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx.cuda_device_context(), bsz_seq, dim_embed, dropout_param2, + ln2epsilon); + + fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( + ctx.cuda_device_context(), d_y_data, bias_dropout_residual_out_data, + dropout_mask_out_data, ln_2_scale_data, ln_2_mean_data, ln_2_var_data, + d_bias_dropout_residual_out_data, d_ln_2_scale_data, d_ln_2_bias_data, + d_out_linear_out_data, d_out_linear_bias_data, d_residual_data); + + out_linear_compute.ComputeBackward(fmha_out_data, out_linear_weight_data, + d_out_linear_out_data, d_fmha_out_data, + d_out_linear_weight_data, nullptr); + fmha_ref_compute.ComputeBackward( + *transpose_out_2, *src_mask, *softmax_out, *attn_dropout_mask_out, + *attn_dropout_out, *qk_out, *src_mask_out, *d_fmha_out, d_qktv_out, + d_attn_dropout_out, d_softmax_out, d_src_mask_out, d_qk_out, + d_transpose_out_2, nullptr, d_qkv_bias_out); + cudaMemcpyAsync(d_qkv_out_data, d_qkv_bias_out_data, + bsz_seq * 3 * num_head * dim_head * sizeof(T), + cudaMemcpyDeviceToDevice); + + if (pre_layer_norm) { + qkv_compute.ComputeBackward(ln_out_data, qkv_weight_data, + d_qkv_bias_out_data, d_ln_out_data, + d_qkv_weight_data, d_qkv_bias_data); + layer_norm_compute.ComputeBackward(x_data, d_ln_out_data, ln_scale_data, + ln_mean_data, ln_var_data, d_x_data, + d_ln_scale_data, d_ln_bias_data); + } else { + qkv_compute.ComputeBackward(x_data, qkv_weight_data, d_qkv_bias_out_data, + d_x_data, d_qkv_weight_data, d_qkv_bias_data); + } + // gradient accumulation + std::vector ins; + std::vector outs; + ins.emplace_back(&d_residual); + ins.emplace_back(d_x); + outs.emplace_back(d_x); + int elewise_add_axis = -1; + LaunchElementwiseCudaKernel( + ctx.cuda_device_context(), ins, &outs, elewise_add_axis, + AddFunctor()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(fused_attention, ops::FusedAttentionOpKernel, + ops::FusedAttentionOpKernel, + ops::FusedAttentionOpKernel); 
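The "downgrade_in_infer" and "upscale_in_train" modes accepted by the dropout_implementation attributes above differ only in where the 1/(1 - dropout_rate) scaling is applied. The short standalone C++ sketch below is an illustration of those two scaling rules only (it is not part of this patch); the input values, mask, and dropout_rate are arbitrary example numbers.

    #include <cstdio>
    #include <vector>

    int main() {
      // Arbitrary example values: a 0/1 keep mask and dropout_rate = 0.5.
      const float dropout_rate = 0.5f;
      const std::vector<float> input = {1.0f, 2.0f, 3.0f, 4.0f};
      const std::vector<float> mask = {1.0f, 0.0f, 1.0f, 1.0f};

      for (size_t i = 0; i < input.size(); ++i) {
        // downgrade_in_infer: apply the mask in training, downscale at inference.
        const float train_downgrade = input[i] * mask[i];
        const float infer_downgrade = input[i] * (1.0f - dropout_rate);
        // upscale_in_train: upscale kept units in training, identity at inference.
        const float train_upscale = input[i] * mask[i] / (1.0f - dropout_rate);
        const float infer_upscale = input[i];
        std::printf("i=%zu downgrade(train=%.2f, infer=%.2f) upscale(train=%.2f, infer=%.2f)\n",
                    i, train_downgrade, infer_downgrade, train_upscale,
                    infer_upscale);
      }
      return 0;
    }

In expectation both modes produce the same result; upscale_in_train moves the scaling into training so that dropout becomes a no-op at inference and can be removed from the inference program.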
+REGISTER_OP_CUDA_KERNEL(fused_attention_grad, + ops::FusedAttentionGradKernel, + ops::FusedAttentionGradKernel, + ops::FusedAttentionGradKernel); diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 3fb58eab077bca..049c37f1ea0c44 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -110,27 +110,34 @@ inline __device__ void CalculateDBias(const T *tmp_sum, T *dbias, } __syncthreads(); // reduce sum - T sum = static_cast(0); + T sum[2] = {static_cast(0)}; int tid = threadIdx.y * blockDim.x + threadIdx.x; int x = tid >> 5; // warp id int y = tid & 31; // thread id on warp 0~31 // need BlockSizeX * VecSize warps - if (x < BlockSizeX * VecSize) { + for (int j = x; j < BlockSizeX * VecSize; j += 32) { // reduce 128 to 32 #pragma unroll for (int i = 0; i < (BlockSizeY >> 5); i++) { - sum += cache[x][y + i * 32]; + sum[(j >> 5)] += cache[j][y + i * 32]; } } + int reduce_num_pre_thread = (BlockSizeX * VecSize + 31) / 32; // reduce 32 to 1 - sum = WarpReduceSum(sum); + for (int i = 0; i < reduce_num_pre_thread; i++) { + sum[i] = WarpReduceSum(sum[i]); + } // save sum to dbias - int bias_id = blockIdx.x * blockDim.x * VecSize + x; - if (y == 0 && x < VecSize * BlockSizeX && bias_id < cols) { - dbias[bias_id] = sum; + if (y == 0 && x < BlockSizeX * VecSize) { + for (int i = 0; i < reduce_num_pre_thread; i++) { + int bias_id = blockIdx.x * BlockSizeX * VecSize + x + i * 32; + if (bias_id < cols) { + dbias[bias_id] = sum[i]; + } + } } } diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h new file mode 100644 index 00000000000000..33fde64164d129 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -0,0 +1,282 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/operators/dropout_impl_util.h" +#include "paddle/fluid/operators/fused/fused_dropout_act_bias.h" +#include "paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h" +#include "paddle/fluid/operators/fused/fused_residual_dropout_bias.h" +#include "paddle/fluid/operators/math/functors.h" + +namespace paddle { +namespace operators { + +/** + * Support two Dropouts in the use senarieo. + * This warpper can be used in FFN op. + * The DropoutParam will be used in the fused_dropout_act_bias, + * fused_residual_dropout_bias(pre_layer_norm=ture) or + * fused_layernorm_residual_dropout_bias(pre_layer_norm=false). 
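+ *
+ * For example, a DropoutParam constructed with dropout_index == 1 reads the
+ * attributes "dropout1_rate", "dropout1_implementation", "dropout1_is_test",
+ * "dropout1_fix_seed" and "dropout1_seed", plus the optional seed input
+ * "Dropout1Seed"; with dropout_index == 0 the attribute prefix is simply
+ * "dropout_" and the seed input is "DropoutSeed" (see the constructor below).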
+*/ +struct DropoutParam { + uint64_t seed; + float dropout_prob; + bool is_upscale_in_train; + bool is_test; + bool fix_seed; + int increment; + const framework::Tensor* tensor_seed; + int seed_val; + + DropoutParam() { + fix_seed = false; + seed = 0; + is_test = false; + is_upscale_in_train = false; + dropout_prob = 0.5; + tensor_seed = nullptr; + seed_val = 0; + } + + /** + * dropout_index: can be 0, 1, 2. 0 means there is only one dropout, + * 1 and 2 represent two dropout, the parameter name of dropout + * will be "dropout" + dropout_index + param name, such as dropout1_seed, + * dropout1_is_test. + */ + DropoutParam(const framework::ExecutionContext& context, + const int dropout_index) { + std::string pre_fix = "dropout"; + std::string str_index = std::to_string(dropout_index); + if (dropout_index > 0) { + pre_fix = pre_fix + str_index + "_"; + } else { + pre_fix = pre_fix + "_"; + } + dropout_prob = context.Attr(pre_fix + "rate"); + auto& dropout_implementation = + context.Attr(pre_fix + "implementation"); + is_upscale_in_train = (dropout_implementation == "upscale_in_train"); + is_test = context.Attr(pre_fix + "is_test"); + fix_seed = context.Attr(pre_fix + "fix_seed"); + + std::string str_seed = "Dropout"; + if (dropout_index > 0) { + str_seed = str_seed + str_index + "Seed"; + } else { + str_seed = str_seed + "Seed"; + } + tensor_seed = + context.HasInput(str_seed) ? context.Input(str_seed) : nullptr; + seed_val = context.Attr(pre_fix + "seed"); + } + + int UpdateSeedAndIncrement(const platform::CUDADeviceContext& ctx, + const int offset) { + uint64_t tmp_increment; + GetSeedDataAndIncrement(ctx, tensor_seed, fix_seed, seed_val, offset, &seed, + &tmp_increment); + increment = static_cast(tmp_increment); + return increment; + } +}; + +template +class FusedDropoutHelper { + private: + int GetIncrement(const platform::CUDADeviceContext& ctx) { + const int VecSize = MAX_CACHE_BYTES / sizeof(T); + const int real_vec_size = cols_ % VecSize == 0 ? 
VecSize : 1; + auto config = + Get1DBlocksAnd2DGrids(ctx, static_cast(rows_), + static_cast(cols_), real_vec_size); + int increment = ((cols_ - 1) / (config.thread_per_block.x * + config.block_per_grid.x * real_vec_size) + + 1) * + real_vec_size; + increment = dropout_param_.UpdateSeedAndIncrement(ctx, increment); + return increment; + } + + public: + FusedDropoutHelper() {} + FusedDropoutHelper(const platform::CUDADeviceContext& ctx, const int rows, + const int cols, const DropoutParam& dropout_param) { + rows_ = rows; + cols_ = cols; + dropout_param_ = dropout_param; + } + + // out = residual + dropout( src + bias ) + void ResidualDropoutBias(const platform::CUDADeviceContext& ctx, const T* src, + const T* residual, const T* bias, T* out, + MaskType* mask) { + auto increment = GetIncrement(ctx); + LaunchResidualDropoutBias( + rows_, cols_, increment, dropout_param_.seed, + dropout_param_.dropout_prob, dropout_param_.is_test, + dropout_param_.is_upscale_in_train, src, residual, bias, mask, out, + ctx); + } + + void ResidualDropoutBiasGrad(const platform::CUDADeviceContext& ctx, + const T* d_out, const MaskType* mask, T* d_src, + T* d_residual, T* d_bias) { + LaunchResidualDropoutBiasGrad( + d_out, mask, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + auto cuda_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + memory::Copy(cuda_place, d_residual, cuda_place, d_out, + rows_ * cols_ * sizeof(T), ctx.stream()); + } + + // out = dropout(activation(src + bias)) + void DropoutActBias(const platform::CUDADeviceContext& ctx, const T* src, + const T* bias, const std::string& act_method, T* out, + MaskType* mask) { + auto increment = GetIncrement(ctx); + if (act_method == "gelu") { + GeluFunctor gelu; + LaunchDropoutActBias>( + gelu, dropout_param_.seed, rows_, cols_, dropout_param_.increment, + dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, + dropout_param_.is_test, src, bias, out, mask, ctx); + } else if (act_method == "relu") { + math::ReluFunctor relu; + LaunchDropoutActBias>( + relu, dropout_param_.seed, rows_, cols_, increment, + dropout_param_.dropout_prob, dropout_param_.is_upscale_in_train, + dropout_param_.is_test, src, bias, out, mask, ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently only supports gelu or relu activation functions!")); + } + } + + void DropoutActBiasGrad(const platform::CUDADeviceContext& ctx, const T* dout, + const T* src, const T* bias, const MaskType* mask, + T* d_src, T* d_bias, const std::string& act_method) { + if (act_method == "gelu") { + GeluGradFunctor gelu_grad; + LaunchDropoutActBiasGrad>( + gelu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + } else if (act_method == "relu") { + math::ReluGradFunctor relu_grad; + LaunchDropoutActBiasGrad>( + relu_grad, dout, mask, src, bias, dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, rows_, cols_, d_src, d_bias, ctx); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Currently only supports gelu or relu activation functions!")); + } + } + + protected: + int rows_; + int cols_; + DropoutParam dropout_param_; +}; + +template +class FusedDropoutLayerNormHelper : public FusedDropoutHelper { + public: + FusedDropoutLayerNormHelper() {} + FusedDropoutLayerNormHelper(const int rows, const int cols, + const float epsilon) { + using U = LayerNormParamType; + this->rows_ = rows; + this->cols_ = 
cols; + epsilon_ = epsilon; + } + + FusedDropoutLayerNormHelper(const platform::CUDADeviceContext& ctx, + const int rows, const int cols, + const DropoutParam& dropout_param, + const float epsilon) + : FusedDropoutHelper(ctx, rows, cols, dropout_param) { + using U = LayerNormParamType; + epsilon_ = epsilon; + } + + // call layer_norm + void LayerNorm(const platform::CUDADeviceContext& ctx, const T* src, + const LayerNormParamType* gamma, + const LayerNormParamType* beta, T* out, + LayerNormParamType* mean, LayerNormParamType* variance) { + using U = LayerNormParamType; + switch (GetDesiredBlockDim(this->cols_)) { + FIXED_BLOCK_DIM_CASE( + LayerNormForward< + T, U, kBlockDim><<rows_, kBlockDim, 0, ctx.stream()>>>( + src, gamma, beta, out, mean, variance, epsilon_, this->cols_)); + } + } + + void LayerNormGrad(const platform::CUDADeviceContext& ctx, const T* dout, + const T* src, const LayerNormParamType* gamma, + const LayerNormParamType* mean, + const LayerNormParamType* variance, T* d_src, + LayerNormParamType* d_scale, + LayerNormParamType* d_bias) { + using U = LayerNormParamType; + LayerNormBackward(src, dout, gamma, mean, variance, d_src, d_scale, + d_bias, epsilon_, this->rows_, this->cols_, ctx); + } + + // out = layernorm(residual + dropout(src + bias)) + void LayernormResidualDropoutBias( + const platform::CUDADeviceContext& ctx, const T* src, const T* residual, + const T* bias, const LayerNormParamType* gamma, + const LayerNormParamType* beta, T* dropout_out, MaskType* mask, T* out, + LayerNormParamType* mean, LayerNormParamType* variance) { + using U = LayerNormParamType; + int vec_size = MAX_CACHE_BYTES / sizeof(T); + if (this->cols_ % vec_size != 0) { + vec_size = 1; + } + int threads = GetDesiredBlockDim(this->cols_ / vec_size); + int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size; + increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment); + LaunchLayernormResidualDropoutBias( + this->rows_, this->cols_, increment, this->dropout_param_.seed, + this->dropout_param_.dropout_prob, epsilon_, + this->dropout_param_.is_upscale_in_train, this->dropout_param_.is_test, + src, residual, bias, gamma, beta, mask, dropout_out, out, mean, + variance, ctx); + } + + void LayernormResidualDropoutBiasGrad( + const platform::CUDADeviceContext& ctx, const T* d_out, + const T* layernorm_src, const MaskType* mask, + const LayerNormParamType* gamma, const LayerNormParamType* mean, + const LayerNormParamType* variance, T* d_layernorm_src, + LayerNormParamType* d_scale, LayerNormParamType* d_layernorm_bias, + T* d_dropout_src, T* d_bias, T* d_residual) { + using U = LayerNormParamType; + LayerNormBackward(layernorm_src, d_out, gamma, mean, variance, + d_layernorm_src, d_scale, d_layernorm_bias, + epsilon_, this->rows_, this->cols_, ctx); + this->ResidualDropoutBiasGrad(ctx, d_layernorm_src, mask, d_dropout_src, + d_residual, d_bias); + } + + protected: + float epsilon_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc new file mode 100644 index 00000000000000..4e03c7369d10e8 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -0,0 +1,359 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/matmul_v2_op.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +class FusedFeedForwardOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + OP_INOUT_CHECK(context->HasInput("X"), "Input", "X", "fused_feedforward"); + OP_INOUT_CHECK(context->HasInput("Linear1Weight"), "Input", "Linear1Weight", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasInput("Linear2Weight"), "Input", "Linear2Weight", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Out"), "Output", "Out", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Dropout1Mask"), "Output", "Dropout1Mask", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Dropout2Mask"), "Output", "Dropout2Mask", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln1Mean"), "Output", "Ln1Mean", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln1Variance"), "Output", "Ln1Variance", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln2Mean"), "Output", "Ln2Mean", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln2Variance"), "Output", "Ln2Variance", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Linear1Out"), "Output", "Linear1Out", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Ln1Out"), "Output", "Ln1Out", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Dropout1Out"), "Output", "Dropout1Out", + "fused_feedforward"); + OP_INOUT_CHECK(context->HasOutput("Dropout2Out"), "Output", "Dropout2Out", + "fused_feedforward"); + + auto dim_x = context->GetInputDim("X"); + auto mat_dim_x = + math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0, false); + // verify for the pre layer_norm, the feature size must be larger than 1 + PADDLE_ENFORCE_GT( + mat_dim_x.width_, static_cast(1), + platform::errors::InvalidArgument("Product from the X shape[1] to " + "shape[n-1] must be larger than 1!")); + auto dim_Linear1Weight = context->GetInputDim("Linear1Weight"); + auto tmp_dim_x = dim_x; + tmp_dim_x[dim_x.size() - 1] = + dim_Linear1Weight[dim_Linear1Weight.size() - 1]; + context->SetOutputDim("Out", dim_x); + if (context->Attrs().Get("dropout1_is_test") == false) { + context->SetOutputDim("Dropout1Mask", tmp_dim_x); + } + context->SetOutputDim("Dropout1Out", tmp_dim_x); + context->SetOutputDim("Linear1Out", tmp_dim_x); + context->SetOutputDim("Ln1Out", dim_x); + context->SetOutputDim("Dropout2Out", dim_x); + + if (context->Attrs().Get("dropout2_is_test") == false) { + context->SetOutputDim("Dropout2Mask", dim_x); + } + framework::DDim mean_dim = + framework::make_ddim({mat_dim_x.batch_size_ * mat_dim_x.height_}); + context->SetOutputDim("Ln1Mean", mean_dim); + context->SetOutputDim("Ln1Variance", mean_dim); + context->SetOutputDim("Ln2Mean", mean_dim); + 
context->SetOutputDim("Ln2Variance", mean_dim); + context->ShareLoD("X", "Out"); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class FusedFeedForwardOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input of FusedFeedForward op"); + AddInput( + "Dropout1Seed", + "The seed of first dropout op, it has higher priority than the attr " + "fix_seed and seed") + .AsDispensable(); + AddInput( + "Dropout2Seed", + "The seed of second dropout op, it has higher priority than the attr " + "fix_seed and seed") + .AsDispensable(); + + AddInput("Linear1Weight", "The linear1 weight of FusedFeedForward op"); + AddInput("Linear1Bias", "The linear1 bias of FusedFeedForward op") + .AsDispensable(); + AddInput("Linear2Weight", "The linear2 weight of FusedFeedForward op"); + AddInput("Linear2Bias", "The linear2 bias input of FusedFeedForward op") + .AsDispensable(); + AddInput("Ln1Scale", "The layer_norm1 scale of FusedFeedForward op") + .AsDispensable(); + AddInput("Ln1Bias", "The layer_norm1 bias of FusedFeedForward op") + .AsDispensable(); + AddInput("Ln2Scale", "The layer_norm2 scale of FusedFeedForward op") + .AsDispensable(); + AddInput("Ln2Bias", "The layer_norm2 bias of FusedFeedForward op") + .AsDispensable(); + AddOutput("Out", "The output of FusedFeedForward op"); + AddOutput("Dropout1Mask", "The mask of dropout1").AsIntermediate(); + AddOutput("Dropout2Mask", "The mask of dropout2").AsIntermediate(); + AddOutput("Ln1Mean", "The mean of layer_norm1").AsIntermediate(); + AddOutput("Ln1Variance", "The variance of layer_norm1").AsIntermediate(); + AddOutput("Ln2Mean", "The mean of layer_nomr2").AsIntermediate(); + AddOutput("Ln2Variance", "The variance of layer_norm2").AsIntermediate(); + AddOutput("Linear1Out", "The output of linear1").AsIntermediate(); + AddOutput("Ln1Out", "The output of layer_norm1").AsIntermediate(); + AddOutput("Dropout1Out", "The output of dropout1").AsIntermediate(); + AddOutput("Dropout2Out", "The output of dropout2").AsIntermediate(); + + AddAttr("pre_layer_norm", "true is pre layernorm").SetDefault(false); + AddAttr("ln1_epsilon", "epsilon of pre layer_norm") + .SetDefault(1e-5f); + AddAttr("ln2_epsilon", "epsilon of post layer_norm") + .SetDefault(1e-5f); + AddAttr("act_method", "act_method").SetDefault("gelu"); + AddAttr("dropout1_rate", "the dropout rate of first dropout") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ( + drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout1_rate' must be between 0.0 and 1.0.")); + }); + AddAttr("dropout2_rate", "the dropout rate of second dropout") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ( + drop_p >= 0.0f && drop_p <= 1.0f, true, + platform::errors::InvalidArgument( + "'dropout2_rate' must be between 0.0 and 1.0.")); + }); + AddAttr("dropout1_implementation", + "the dropout implementation of first dropout") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout1_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("dropout2_implementation", + "the dropout implementation of second 
dropout") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", true, + platform::errors::InvalidArgument( + "dropout2_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("dropout1_is_test", "the is_test of first dropout") + .SetDefault(false); + AddAttr("dropout2_is_test", "the is_test of second dropout") + .SetDefault(false); + AddAttr("dropout1_fix_seed", "the is_test of first dropout") + .SetDefault(false); + AddAttr("dropout2_fix_seed", "the is_test of second dropout") + .SetDefault(false); + AddAttr("dropout1_seed", "Dropout1 random seed.").SetDefault(0); + AddAttr("dropout2_seed", "Dropout2 random seed.").SetDefault(0); + AddComment(R"DOC( + the function of fused_feedforward operator is the same as the following pseudo code: + residual = src; + ln1_out = src; + if(pre_layer_norm){ + ln1_out = layer_norm(src); + } + out = linear(dropout(activation(dropout(linear(ln1_out))))); + if(!pre_layer_norm) { + out = layer_norm(out); + } + )DOC"); + } +}; + +class FusedFeedForwardOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout1_is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + PADDLE_ENFORCE_EQ(ctx->Attrs().Get("dropout2_is_test"), false, + platform::errors::InvalidArgument( + "GradOp is only callable when is_test is false")); + OP_INOUT_CHECK(ctx->HasInput("Dropout1Mask"), "Input", "Dropout1Mask", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Dropout2Mask"), "Input", "Dropout1Mask", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Linear1Out"), "Input", "Linear1Out", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln1Out"), "Input", "Ln1Out", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Dropout1Out"), "Input", "Dropout1Out", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Dropout2Out"), "Input", "Dropout2Out", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Linear1Weight"), "Input", "Linear1Weight", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Linear2Weight"), "Input", "Linear2Weight", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln1Mean"), "Input", "Ln1Mean", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln1Variance"), "Input", "Ln1Variance", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln2Mean"), "Input", "Ln2Mean", + "FusedFeedForwardGrad"); + OP_INOUT_CHECK(ctx->HasInput("Ln2Variance"), "Input", "Ln2Variance", + "FusedFeedForwardGrad"); + + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + framework::GradVarName("Out"), "FusedFeedForwardGrad"); + + auto d_out_dim = ctx->GetInputDim(framework::GradVarName("Out")); + ctx->SetOutputDim(framework::GradVarName("X"), d_out_dim); + if (ctx->HasOutput(framework::GradVarName("Ln1Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Ln1Scale"), + ctx->GetInputDim("Ln1Scale")); + } + if (ctx->HasOutput(framework::GradVarName("Ln1Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Ln1Bias"), + ctx->GetInputDim("Ln1Bias")); + } + if (ctx->HasOutput(framework::GradVarName("Ln2Scale"))) { + ctx->SetOutputDim(framework::GradVarName("Ln2Scale"), + ctx->GetInputDim("Ln2Scale")); + } + if 
(ctx->HasOutput(framework::GradVarName("Ln2Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Ln2Bias"), + ctx->GetInputDim("Ln2Bias")); + } + ctx->SetOutputDim(framework::GradVarName("Linear1Weight"), + ctx->GetInputDim("Linear1Weight")); + if (ctx->HasOutput(framework::GradVarName("Linear1Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Linear1Bias"), + ctx->GetInputDim("Linear1Bias")); + } + ctx->SetOutputDim(framework::GradVarName("Linear2Weight"), + ctx->GetInputDim("Linear2Weight")); + if (ctx->HasOutput(framework::GradVarName("Linear2Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Linear2Bias"), + ctx->GetInputDim("Linear2Bias")); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto input = ctx.Input("X"); + auto input_data_type = input->type(); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +template +class FusedFeedForwardOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("fused_feedforward_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("X", this->Input("X")); + op->SetInput("Linear1Weight", this->Input("Linear1Weight")); + op->SetInput("Linear1Bias", this->Input("Linear1Bias")); + op->SetInput("Linear2Weight", this->Input("Linear2Weight")); + op->SetInput("Ln1Scale", this->Input("Ln1Scale")); + op->SetInput("Ln1Bias", this->Input("Ln1Bias")); + op->SetInput("Ln2Scale", this->Input("Ln2Scale")); + op->SetInput("Ln2Bias", this->Input("Ln2Bias")); + op->SetInput("Dropout1Mask", this->Output("Dropout1Mask")); + op->SetInput("Dropout2Mask", this->Output("Dropout2Mask")); + op->SetInput("Linear1Out", this->Output("Linear1Out")); + op->SetInput("Ln1Out", this->Output("Ln1Out")); + op->SetInput("Ln1Mean", this->Output("Ln1Mean")); + op->SetInput("Ln1Variance", this->Output("Ln1Variance")); + op->SetInput("Ln2Mean", this->Output("Ln2Mean")); + op->SetInput("Ln2Variance", this->Output("Ln2Variance")); + op->SetInput("Dropout1Out", this->Output("Dropout1Out")); + op->SetInput("Dropout2Out", this->Output("Dropout2Out")); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("Ln1Scale"), + this->InputGrad("Ln1Scale")); + op->SetOutput(framework::GradVarName("Ln1Bias"), + this->InputGrad("Ln1Bias")); + op->SetOutput(framework::GradVarName("Ln2Scale"), + this->InputGrad("Ln2Scale")); + op->SetOutput(framework::GradVarName("Ln2Bias"), + this->InputGrad("Ln2Bias")); + op->SetOutput(framework::GradVarName("Linear1Weight"), + this->InputGrad("Linear1Weight")); + op->SetOutput(framework::GradVarName("Linear1Bias"), + this->InputGrad("Linear1Bias")); + op->SetOutput(framework::GradVarName("Linear2Weight"), + this->InputGrad("Linear2Weight")); + if (this->HasInput("Linear2Bias")) { + op->SetInput("Linear2Bias", this->Input("Linear2Bias")); + op->SetOutput(framework::GradVarName("Linear2Bias"), + this->InputGrad("Linear2Bias")); + } + + op->SetAttrMap(this->Attrs()); + } +}; + +template +class FusedFeedForwardOpDoubleGradMaker + : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr grad_op) const override {} +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(fused_feedforward, 
ops::FusedFeedForwardOp, + ops::FusedFeedForwardOpMaker, + ops::FusedFeedForwardOpGradMaker, + ops::FusedFeedForwardOpGradMaker); +REGISTER_OPERATOR(fused_feedforward_grad, ops::FusedFeedForwardOpGrad); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu new file mode 100644 index 00000000000000..61a8a9a82f2e0d --- /dev/null +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -0,0 +1,394 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/matmul_v2_op.h" + +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/operators/layer_norm_kernel.cu.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedFeedForwardKernel : public framework::OpKernel { + public: + void MatMul(const platform::CUDADeviceContext& ctx, + const framework::Tensor& a, const framework::Tensor& b, + framework::Tensor* c) const { + auto blas = math::GetBlas(ctx); + auto a_2d = FoldInitDims(a); + auto b_2d = FoldInitDims(b); + auto mat_dim_a = math::CreateMatrixDescriptor(a_2d.dims(), 0, false); + auto mat_dim_b = math::CreateMatrixDescriptor(b_2d.dims(), 0, false); + T alpha = static_cast(1.0); + blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); + } + + void FFN(const framework::Tensor& x, const framework::Tensor& linear1_weight, + const framework::Tensor* linear1_bias, + const framework::Tensor& linear2_weight, + const framework::Tensor* linear2_bias, + const framework::Tensor* ln1_scale, + const framework::Tensor* ln1_bias, + const framework::Tensor* ln2_scale, + const framework::Tensor* ln2_bias, framework::Tensor* out, + framework::Tensor* dropout1_mask, framework::Tensor* dropout2_mask, + framework::Tensor* ln1_mean, framework::Tensor* ln1_variance, + framework::Tensor* ln2_mean, framework::Tensor* ln2_variance, + framework::Tensor* linear1_out, framework::Tensor* ln1_out, + framework::Tensor* dropout1_out, framework::Tensor* dropout2_out, + const int bsz_seq, const int d_model, const int dim_feedforward, + const std::string& act_method, const bool pre_layer_norm, + const float epsilon1, const float epsilon2, + const DropoutParam& dropout_param1, + const DropoutParam& dropout_param2, + const platform::CUDADeviceContext& ctx) const { + FusedDropoutLayerNormHelper pre_layernorm_helper( + bsz_seq, d_model, epsilon1); + FusedDropoutHelper fused_act_dropout_helper( + ctx, bsz_seq, dim_feedforward, dropout_param1); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx, bsz_seq, d_model, dropout_param2, epsilon2); + + auto place = ctx.GetPlace(); + using U = LayerNormParamType; + const framework::Tensor* in = &x; + + const U* ln1_scale_ptr = + ln1_scale == nullptr ? 
nullptr : ln1_scale->data(); + const U* ln1_bias_ptr = ln1_bias == nullptr ? nullptr : ln1_bias->data(); + const U* ln2_scale_ptr = + ln2_scale == nullptr ? nullptr : ln2_scale->data(); + const U* ln2_bias_ptr = ln2_bias == nullptr ? nullptr : ln2_bias->data(); + const T* linear1_bias_ptr = + linear1_bias == nullptr ? nullptr : linear1_bias->data(); + const T* linear2_bias_ptr = + linear2_bias == nullptr ? nullptr : linear2_bias->data(); + + if (pre_layer_norm) { + pre_layernorm_helper.LayerNorm( + ctx, x.data(), ln1_scale_ptr, ln1_bias_ptr, ln1_out->data(), + ln1_mean->data(), ln1_variance->data()); + in = ln1_out; + } + MatMul(ctx, *in, linear1_weight, linear1_out); + fused_act_dropout_helper.DropoutActBias( + ctx, linear1_out->data(), linear1_bias_ptr, act_method, + dropout1_out->data(), dropout1_mask->data()); + framework::Tensor linear2_out; + linear2_out.mutable_data({bsz_seq, d_model}, place); + MatMul(ctx, *dropout1_out, linear2_weight, &linear2_out); + if (!pre_layer_norm) { + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + ctx, linear2_out.data(), x.data(), linear2_bias_ptr, + ln2_scale_ptr, ln2_bias_ptr, dropout2_out->data(), + dropout2_mask->data(), out->data(), ln2_mean->data(), + ln2_variance->data()); + } else { + fused_dropout_layernorm_helper.ResidualDropoutBias( + ctx, linear2_out.data(), x.data(), linear2_bias_ptr, + out->data(), dropout2_mask->data()); + } + } + + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* linear1_weight = context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto* linear2_weight = context.Input("Linear2Weight"); + auto* linear2_bias = context.Input("Linear2Bias"); + auto* ln1_scale = context.Input("Ln1Scale"); + auto* ln1_bias = context.Input("Ln1Bias"); + auto* ln2_scale = context.Input("Ln2Scale"); + auto* ln2_bias = context.Input("Ln2Bias"); + + auto* ln1_mean = context.Output("Ln1Mean"); + auto* ln1_variance = context.Output("Ln1Variance"); + auto* ln2_mean = context.Output("Ln2Mean"); + auto* ln2_variance = context.Output("Ln2Variance"); + auto* out = context.Output("Out"); + auto* dropout1_mask = context.Output("Dropout1Mask"); + auto* dropout2_mask = context.Output("Dropout2Mask"); + auto* linear1_out = context.Output("Linear1Out"); + auto* ln1_out = context.Output("Ln1Out"); + auto* dropout1_out = context.Output("Dropout1Out"); + auto* dropout2_out = context.Output("Dropout2Out"); + + const std::string act_method = context.Attr("act_method"); + + const bool pre_layer_norm = context.Attr("pre_layer_norm"); + const float epsilon1 = context.Attr("ln1_epsilon"); + const float epsilon2 = context.Attr("ln2_epsilon"); + + DropoutParam dropout_param1(context, 1); + DropoutParam dropout_param2(context, 2); + + using U = LayerNormParamType; + auto place = context.GetPlace(); + out->mutable_data(place); + dropout1_mask->mutable_data(place); + dropout2_mask->mutable_data(place); + ln1_mean->mutable_data(place); + ln1_variance->mutable_data(place); + ln2_mean->mutable_data(place); + ln2_variance->mutable_data(place); + linear1_out->mutable_data(place); + ln1_out->mutable_data(place); + dropout1_out->mutable_data(place); + dropout2_out->mutable_data(place); + + auto x_dim = x->dims(); + auto mat_dim_x = + math::CreateMatrixDescriptor(RowMatrixFromVector(x_dim), 0, false); + + auto dim = linear1_weight->dims(); + int d_model = dim[0]; + int dim_feedforward = dim[dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + 
FFN(*x, *linear1_weight, linear1_bias, *linear2_weight, linear2_bias, + ln1_scale, ln1_bias, ln2_scale, ln2_bias, out, dropout1_mask, + dropout2_mask, ln1_mean, ln1_variance, ln2_mean, ln2_variance, + linear1_out, ln1_out, dropout1_out, dropout2_out, bsz_seq, d_model, + dim_feedforward, act_method, pre_layer_norm, epsilon1, epsilon2, + dropout_param1, dropout_param2, context.cuda_device_context()); + } +}; + +template +class FusedFeedForwardGradKernel : public framework::OpKernel { + public: + void MatMulGrad(const platform::CUDADeviceContext& ctx, + const framework::Tensor& d_out, const framework::Tensor& a, + const framework::Tensor& b, framework::Tensor* d_a, + framework::Tensor* d_b) const { + auto blas = math::GetBlas(ctx); + auto a_2d = FoldInitDims(a); + auto b_2d = FoldInitDims(b); + auto mat_dim_a = math::CreateMatrixDescriptor(a_2d.dims(), 0, true); + auto mat_dim_b = math::CreateMatrixDescriptor(b_2d.dims(), 0, true); + auto mat_dim_dout = math::CreateMatrixDescriptor(d_out.dims(), 0, false); + T alpha = static_cast(1.0); + blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0)); + blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); + } + + void FFNGrad( + const framework::Tensor& d_out, const framework::Tensor& x, + const framework::Tensor& dropout1_mask, + const framework::Tensor& dropout2_mask, + const framework::Tensor& linear1_out, const framework::Tensor& ln1_out, + const framework::Tensor& dropout1_out, + const framework::Tensor& dropout2_out, + const framework::Tensor& linear1_weight, + const framework::Tensor* linear1_bias, + const framework::Tensor& linear2_weight, + const framework::Tensor* ln1_gamma, const framework::Tensor* ln1_beta, + const framework::Tensor& ln1_mean, const framework::Tensor& ln1_variance, + const framework::Tensor* ln2_gamma, const framework::Tensor* ln2_beta, + const framework::Tensor& ln2_mean, const framework::Tensor& ln2_variance, + framework::Tensor* d_x, framework::Tensor* d_linear1_weight, + framework::Tensor* d_linear1_bias, framework::Tensor* d_linear2_weight, + framework::Tensor* d_linear2_bias, framework::Tensor* d_ln1_gamma, + framework::Tensor* d_ln1_beta, framework::Tensor* d_ln2_gamma, + framework::Tensor* d_ln2_beta, const int bsz_seq, const int d_model, + const int dim_feedforward, const DropoutParam& dropout_param1, + const DropoutParam& dropout_param2, const std::string& act_method, + const bool pre_layer_norm, const float epsilon1, const float epsilon2, + const platform::CUDADeviceContext& ctx) const { + FusedDropoutLayerNormHelper pre_layernorm_helper( + bsz_seq, d_model, epsilon1); + FusedDropoutHelper fused_act_dropout_helper( + ctx, bsz_seq, dim_feedforward, dropout_param1); + FusedDropoutLayerNormHelper fused_dropout_layernorm_helper( + ctx, bsz_seq, d_model, dropout_param2, epsilon2); + + auto place = ctx.GetPlace(); + using U = LayerNormParamType; + const U* ln1_gamma_ptr = + ln1_gamma == nullptr ? nullptr : ln1_gamma->data(); + const U* ln1_beta_ptr = ln1_beta == nullptr ? nullptr : ln1_beta->data(); + const U* ln2_gamma_ptr = + ln2_gamma == nullptr ? nullptr : ln2_gamma->data(); + const U* ln2_beta_ptr = ln2_beta == nullptr ? nullptr : ln2_beta->data(); + const T* linear1_bias_ptr = + linear1_bias == nullptr ? nullptr : linear1_bias->data(); + T* d_linear1_bias_ptr = + d_linear1_bias == nullptr ? nullptr : d_linear1_bias->data(); + T* d_linear2_bias_ptr = + d_linear2_bias == nullptr ? nullptr : d_linear2_bias->data(); + U* d_ln1_gamma_ptr = + d_ln1_gamma == nullptr ? 
nullptr : d_ln1_gamma->data(); + U* d_ln1_beta_ptr = d_ln1_beta == nullptr ? nullptr : d_ln1_beta->data(); + U* d_ln2_gamma_ptr = + d_ln2_gamma == nullptr ? nullptr : d_ln2_gamma->data(); + U* d_ln2_beta_ptr = d_ln2_beta == nullptr ? nullptr : d_ln2_beta->data(); + + framework::Tensor d_linear2_out, d_dropout2_out, d_residual; + d_linear2_out.mutable_data({bsz_seq, d_model}, place); + d_dropout2_out.mutable_data({bsz_seq, d_model}, place); + d_residual.mutable_data({bsz_seq, d_model}, place); + + if (pre_layer_norm) { + fused_dropout_layernorm_helper.ResidualDropoutBiasGrad( + ctx, d_out.data(), dropout2_mask.data(), + d_linear2_out.data(), d_residual.data(), d_linear2_bias_ptr); + } else { + fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( + ctx, d_out.data(), dropout2_out.data(), + dropout2_mask.data(), ln2_gamma_ptr, ln2_mean.data(), + ln2_variance.data(), d_dropout2_out.data(), d_ln2_gamma_ptr, + d_ln2_beta_ptr, d_linear2_out.data(), d_linear2_bias_ptr, + d_residual.data()); + } + + framework::Tensor d_dropout1_out; + d_dropout1_out.mutable_data({bsz_seq, dim_feedforward}, place); + MatMulGrad(ctx, d_linear2_out, dropout1_out, linear2_weight, + &d_dropout1_out, d_linear2_weight); + + framework::Tensor d_linear1_out; + d_linear1_out.mutable_data({bsz_seq, dim_feedforward}, place); + fused_act_dropout_helper.DropoutActBiasGrad( + ctx, d_dropout1_out.data(), linear1_out.data(), linear1_bias_ptr, + dropout1_mask.data(), d_linear1_out.data(), + d_linear1_bias_ptr, act_method); + + if (pre_layer_norm) { + framework::Tensor d_ln1_out; + d_ln1_out.mutable_data({bsz_seq, d_model}, place); + MatMulGrad(ctx, d_linear1_out, ln1_out, linear1_weight, &d_ln1_out, + d_linear1_weight); + + pre_layernorm_helper.LayerNormGrad(ctx, d_ln1_out.data(), x.data(), + ln1_gamma_ptr, ln1_mean.data(), + ln1_variance.data(), d_x->data(), + d_ln1_gamma_ptr, d_ln1_beta_ptr); + } else { + MatMulGrad(ctx, d_linear1_out, x, linear1_weight, d_x, d_linear1_weight); + } + } + + void Compute(const framework::ExecutionContext& context) const override { + using U = LayerNormParamType; + auto d_out = + *context.Input(framework::GradVarName("Out")); + auto x = *context.Input("X"); + auto dropout1_mask = *context.Input("Dropout1Mask"); + auto dropout2_mask = *context.Input("Dropout2Mask"); + auto linear1_out = *context.Input("Linear1Out"); + auto ln1_out = *context.Input("Ln1Out"); + auto dropout1_out = *context.Input("Dropout1Out"); + auto dropout2_out = *context.Input("Dropout2Out"); + auto linear1_weight = *context.Input("Linear1Weight"); + auto* linear1_bias = context.Input("Linear1Bias"); + auto linear2_weight = *context.Input("Linear2Weight"); + auto ln1_mean = *context.Input("Ln1Mean"); + auto ln1_variance = *context.Input("Ln1Variance"); + auto* ln1_scale = context.Input("Ln1Scale"); + auto* ln1_bias = context.Input("Ln1Bias"); + auto ln2_mean = *context.Input("Ln2Mean"); + auto ln2_variance = *context.Input("Ln2Variance"); + auto* ln2_scale = context.Input("Ln2Scale"); + auto* ln2_bias = context.Input("Ln2Bias"); + + auto* d_x = context.Output(framework::GradVarName("X")); + auto* d_ln1_scale = + context.Output(framework::GradVarName("Ln1Scale")); + auto* d_ln1_bias = + context.Output(framework::GradVarName("Ln1Bias")); + auto* d_ln2_scale = + context.Output(framework::GradVarName("Ln2Scale")); + auto* d_ln2_bias = + context.Output(framework::GradVarName("Ln2Bias")); + auto* d_linear1_weight = context.Output( + framework::GradVarName("Linear1Weight")); + auto* d_linear1_bias = context.Output( + 
framework::GradVarName("Linear1Bias")); + auto* d_linear2_weight = context.Output( + framework::GradVarName("Linear2Weight")); + auto* d_linear2_bias = context.Output( + framework::GradVarName("Linear2Bias")); + + const float epsilon1 = context.Attr("ln1_epsilon"); + const float epsilon2 = context.Attr("ln2_epsilon"); + const bool pre_layer_norm = context.Attr("pre_layer_norm"); + const std::string act_method = context.Attr("act_method"); + DropoutParam dropout_param1(context, 1); + DropoutParam dropout_param2(context, 2); + + auto place = context.GetPlace(); + d_x->mutable_data(place); + if (d_ln1_scale) { + d_ln1_scale->mutable_data(place); + } + if (d_ln1_bias) { + d_ln1_bias->mutable_data(place); + } + if (d_ln2_scale) { + d_ln2_scale->mutable_data(place); + } + if (d_ln2_bias) { + d_ln2_bias->mutable_data(place); + } + if (d_linear1_bias) { + d_linear1_bias->mutable_data(place); + } + if (d_linear2_bias) { + d_linear2_bias->mutable_data(place); + } + d_linear1_weight->mutable_data(place); + d_linear2_weight->mutable_data(place); + + auto x_dim = x.dims(); + auto mat_dim_x = + math::CreateMatrixDescriptor(RowMatrixFromVector(x_dim), 0, false); + + auto linear1_weight_dim = linear1_weight.dims(); + int d_model = linear1_weight_dim[0]; + int dim_feedforward = linear1_weight_dim[linear1_weight_dim.size() - 1]; + int bsz_seq = mat_dim_x.batch_size_ * mat_dim_x.height_; + + FFNGrad(d_out, x, dropout1_mask, dropout2_mask, linear1_out, ln1_out, + dropout1_out, dropout2_out, linear1_weight, linear1_bias, + linear2_weight, ln1_scale, ln1_bias, ln1_mean, ln1_variance, + ln2_scale, ln2_bias, ln2_mean, ln2_variance, d_x, d_linear1_weight, + d_linear1_bias, d_linear2_weight, d_linear2_bias, d_ln1_scale, + d_ln1_bias, d_ln2_scale, d_ln2_bias, bsz_seq, d_model, + dim_feedforward, dropout_param1, dropout_param2, act_method, + pre_layer_norm, epsilon1, epsilon2, context.cuda_device_context()); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + fused_feedforward, + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel, + ops::FusedFeedForwardKernel); +REGISTER_OP_CUDA_KERNEL( + fused_feedforward_grad, + ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel, + ops::FusedFeedForwardGradKernel); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cc b/paddle/fluid/operators/fused/resnet_unit_op.cc new file mode 100644 index 00000000000000..d2ac089d4d1d21 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_unit_op.cc @@ -0,0 +1,411 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// Shape of bitmask +static framework::DDim GetBitmaskDims(std::vector out_shape) { + int c = out_shape.back(); + int64_t nhw = std::accumulate(out_shape.begin(), out_shape.end(), 1, + std::multiplies()) / + c; + int32_t c_int32_elems = ((c + 63) & ~63) / 32; + int32_t nhw_int32_elems = ((nhw + 31) & ~31); + std::vector bitmask_shape = {nhw_int32_elems, c_int32_elems, 1}; + return framework::make_ddim(bitmask_shape); +} + +class ResNetUnitOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + // Check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("FilterX"), "Input", "FilterX", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("ScaleX"), "Input", "ScaleX", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("BiasX"), "Input", "BiasX", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("MeanX"), "Input", "MeanX", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("VarX"), "Input", "VarX", "ResNetUnitOp"); + + bool fuse_add = ctx->Attrs().Get("fuse_add"); + bool has_shortcut = ctx->Attrs().Get("has_shortcut"); + if (fuse_add || has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "ResNetUnitOp"); + } + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("FilterZ"), "Input", "FilterZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("ScaleZ"), "Input", "ScaleZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("BiasZ"), "Input", "BiasZ", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("MeanZ"), "Input", "MeanZ", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasInput("VarZ"), "Input", "VarZ", "ResNetUnitOp"); + } + + // Check output + OP_INOUT_CHECK(ctx->HasOutput("Y"), "Output", "Y", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("BitMask"), "Output", "BitMask", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("ConvX"), "Output", "ConvX", "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMeanX"), "Output", "SavedMeanX", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstdX"), "Output", "SavedInvstdX", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("RunningMeanX"), "Output", "RunningMeanX", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("RunningVarX"), "Output", "RunningVarX", + "ResNetUnitOp"); + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasOutput("ConvZ"), "Output", "ConvZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedMeanZ"), "Output", "SavedMeanZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("SavedInvstdZ"), "Output", "SavedInvstdZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("RunningMeanZ"), "Output", "RunningMeanZ", + "ResNetUnitOp"); + OP_INOUT_CHECK(ctx->HasOutput("RunningVarZ"), "Output", "RunningVarZ", + "ResNetUnitOp"); + } + + // make sure Mean/RunningMean and Var/RunningVar share memory + PADDLE_ENFORCE_EQ( + ctx->Inputs("MeanX")[0], ctx->Outputs("RunningMeanX")[0], + platform::errors::InvalidArgument( + "MeanX and RunningMeanX should share the same memory")); + PADDLE_ENFORCE_EQ(ctx->Inputs("VarX")[0], ctx->Outputs("RunningVarX")[0], + platform::errors::InvalidArgument( + "VarX and RunningVarX should share the same memory")); + if (has_shortcut) { + PADDLE_ENFORCE_EQ( + ctx->Inputs("MeanZ")[0], ctx->Outputs("RunningMeanZ")[0], + platform::errors::InvalidArgument( + 
"MeanZ and RunningMeanZ should share the same memory")); + PADDLE_ENFORCE_EQ( + ctx->Inputs("VarZ")[0], ctx->Outputs("RunningVarZ")[0], + platform::errors::InvalidArgument( + "VarZ and RunningVarZ should share the same memory")); + } + + // Check dims of inputs + const auto x_dims = ctx->GetInputDim("X"); + const auto w_dims = ctx->GetInputDim("FilterX"); + const auto bn_param_dims = ctx->GetInputDim("ScaleX"); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, platform::errors::InvalidArgument( + "The dimensions of input " + "must equal to 4." + "But received: the shape of input " + "= [%s], the dimension of input = " + "[%d]", + x_dims, x_dims.size())); + PADDLE_ENFORCE_EQ(w_dims.size(), 4, + platform::errors::InvalidArgument( + "The dimensions of filter " + "must equal to 4." + "But received: the shape of filter " + "= [%s], the dimension of filter = [%d] ", + w_dims, w_dims.size())); + PADDLE_ENFORCE_EQ(bn_param_dims.size(), 4, + platform::errors::InvalidArgument( + "The dimensions of bn param " + "must equal to 4." + "But received: the shape of bn param " + "= [%s], the dimension of bn param = [%d] ", + bn_param_dims, bn_param_dims.size())); + auto data_format = ctx->Attrs().Get("data_format"); + PADDLE_ENFORCE_EQ( + data_format, "NHWC", + platform::errors::InvalidArgument("The data format must equal to NHWC. " + "But received: the data format " + "= [%s]", + data_format)); + // Calculate the dims of outputs + int batch = x_dims[0]; + int output_channel = w_dims[0]; + int filter_size = w_dims[2]; + int stride = ctx->Attrs().Get("stride"); + int padding = ctx->Attrs().Get("padding"); + int out_h = (x_dims[1] + padding * 2 - filter_size) / stride + 1; + int out_w = (x_dims[2] + padding * 2 - filter_size) / stride + 1; + std::vector out_shape = {batch, out_h, out_w, output_channel}; + + auto y_dims = framework::make_ddim(out_shape); + auto bitmask_dims = GetBitmaskDims(out_shape); + // Set dims of outputs + ctx->SetOutputDim("Y", y_dims); + ctx->SetOutputDim("BitMask", bitmask_dims); + ctx->SetOutputDim("ConvX", y_dims); + ctx->SetOutputDim("SavedMeanX", bn_param_dims); + ctx->SetOutputDim("SavedInvstdX", bn_param_dims); + ctx->SetOutputDim("RunningMeanX", bn_param_dims); + ctx->SetOutputDim("RunningVarX", bn_param_dims); + if (has_shortcut) { + ctx->SetOutputDim("ConvZ", y_dims); + ctx->SetOutputDim("SavedMeanZ", bn_param_dims); + ctx->SetOutputDim("SavedInvstdZ", bn_param_dims); + ctx->SetOutputDim("RunningMeanZ", bn_param_dims); + ctx->SetOutputDim("RunningVarZ", bn_param_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + // By default, the type of the scale, bias, mean, + // and var tensors should be float when input tensor's dtype is float16. 
+    auto bn_param_type = framework::proto::VarType::FP32;
+
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("ScaleX")->type(),
+                      platform::errors::InvalidArgument(
+                          "Scale input should be of float type"));
+    PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input<Tensor>("BiasX")->type(),
+                      platform::errors::InvalidArgument(
+                          "Bias input should be of float type"));
+    framework::LibraryType library = framework::LibraryType::kPlain;
+    framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                   library);
+  }
+};
+
+class ResNetUnitOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "The input 1 tensor");
+    AddInput("FilterX", "Filter tensor of input 1");
+    AddInput("ScaleX", "Scale tensor of input 1 used in batchnorm");
+    AddInput("BiasX", "Bias tensor of input 1 used in batchnorm");
+    AddInput("MeanX", "Mean tensor of input 1 used in batchnorm");
+    AddInput("VarX", "Variance tensor of input 1 used in batchnorm");
+    AddInput("Z", "The input 2 tensor").AsDispensable();
+    AddInput("FilterZ", "Filter tensor of input 2").AsDispensable();
+    AddInput("ScaleZ", "Scale tensor of input 2").AsDispensable();
+    AddInput("BiasZ", "Bias tensor of input 2").AsDispensable();
+    AddInput("MeanZ", "Mean tensor of input 2").AsDispensable();
+    AddInput("VarZ", "Variance tensor of input 2").AsDispensable();
+    AddOutput("Y", "The result of the resnet unit");
+    AddOutput("BitMask", "The bitmask generated after relu");
+    AddOutput("ConvX", "The output of input 1 after conv");
+    AddOutput("SavedMeanX", "Mean of input 1 in the current batch");
+    AddOutput("SavedInvstdX", "Invstd of input 1 in the current batch");
+    AddOutput("RunningMeanX", "Shared memory with MeanX");
+    AddOutput("RunningVarX", "Shared memory with VarX");
+    AddOutput("ConvZ", "The output of input 2 after conv").AsDispensable();
+    AddOutput("SavedMeanZ", "Mean of input 2 in the current batch")
+        .AsDispensable();
+    AddOutput("SavedInvstdZ", "Invstd of input 2 in the current batch")
+        .AsDispensable();
+    AddOutput("RunningMeanZ", "Shared memory with MeanZ").AsDispensable();
+    AddOutput("RunningVarZ", "Shared memory with VarZ").AsDispensable();
+    AddAttr<int>("stride", "").SetDefault(1);
+    AddAttr<int>("stride_z", "").SetDefault(1);
+    AddAttr<int>("padding", "").SetDefault(0);
+    AddAttr<int>("dilation", "").SetDefault(1);
+    AddAttr<int>("group", "").SetDefault(1);
+    AddAttr<float>("momentum", "").SetDefault(0.9);
+    AddAttr<float>("epsilon", "").SetDefault(1e-5);
+    AddAttr<std::string>("data_format", "").SetDefault("NHWC");
+    AddAttr<bool>("fuse_add", "").SetDefault(false);
+    AddAttr<bool>("has_shortcut", "").SetDefault(false);
+    AddAttr<bool>("use_global_stats", "").SetDefault(false);
+    AddAttr<bool>("is_test",
+                  "(bool, default false) Set to true for inference only, false "
+                  "for training. Some layers may run faster when this is true.")
+        .SetDefault(false);
+    AddAttr<bool>("use_addto", "").SetDefault(false);
+    AddAttr<std::string>("act_type", "The activation type to be fused.")
+        .SetDefault("relu");
+    AddComment(R"DOC(
+Fusion op of the basic unit of a ResNet block.
+
+The implementation is based on the latest fusion op interface in cuDNN v8.0.
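+
+Roughly (a sketch, not the exact kernel code), the forward computation is
+    Y = Activation(BN(Conv(X)) + shortcut)
+where the shortcut branch is BN(Conv(Z)) when has_shortcut is true, Z itself
+when fuse_add is true, and absent otherwise; Activation is given by act_type
+(ReLU by default).
+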
+For more details: +https://docs.nvidia.com/deeplearning/cudnn/api/index.html#cudnnFusedOps_t + +)DOC"); + } +}; + +class ResNetUnitGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const { + // check input + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("FilterX"), "Input", "FilterX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("ConvX"), "Input", "ConvX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("ScaleX"), "Input", "ScaleX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("BiasX"), "Input", "BiasX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMeanX"), "Input", "SavedMeanX", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedInvstdX"), "Input", "SavedInvstdX", + "ResNetUnitGradOp"); + + bool fuse_add = ctx->Attrs().Get("fuse_add"); + bool has_shortcut = ctx->Attrs().Get("has_shortcut"); + if (fuse_add || has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", "ResNetUnitGradOp"); + } + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasInput("FilterZ"), "Input", "FilterZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("ConvZ"), "Input", "ConvZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("ScaleZ"), "Input", "ScaleZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("BiasZ"), "Input", "BiasZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedMeanZ"), "Input", "SavedMeanZ", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("SavedInvstdZ"), "Input", "SavedInvstdZ", + "ResNetUnitGradOp"); + } + OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput("BitMask"), "Input", "BitMask", + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), "Input", + framework::GradVarName("Y"), "ResNetUnitGradOp"); + + // check output + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + framework::GradVarName("X"), "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("FilterX")), "Output", + framework::GradVarName("FilterX"), "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("ScaleX")), "Output", + framework::GradVarName("ScaleX"), "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("BiasX")), "Output", + framework::GradVarName("BiasX"), "ResNetUnitGradOp"); + if (fuse_add) { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("Z")), "Output", + framework::GradVarName("Z"), "ResNetUnitGradOp"); + } + if (has_shortcut) { + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("FilterZ")), + "Output", framework::GradVarName("FilterZ"), + "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("ScaleZ")), "Output", + framework::GradVarName("ScaleZ"), "ResNetUnitGradOp"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("BiasZ")), "Output", + framework::GradVarName("BiasZ"), "ResNetUnitGradOp"); + } + const auto x_dims = ctx->GetInputDim("X"); + const auto filter_x_dims = ctx->GetInputDim("FilterX"); + const auto param_dims = ctx->GetInputDim("ScaleX"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->SetOutputDim(framework::GradVarName("FilterX"), filter_x_dims); + ctx->SetOutputDim(framework::GradVarName("ScaleX"), param_dims); + ctx->SetOutputDim(framework::GradVarName("BiasX"), param_dims); + if (fuse_add || has_shortcut) { + const auto 
z_dims = ctx->GetInputDim("Z"); + ctx->SetOutputDim(framework::GradVarName("Z"), z_dims); + } + if (has_shortcut) { + const auto filter_z_dims = ctx->GetInputDim("FilterZ"); + ctx->SetOutputDim(framework::GradVarName("FilterZ"), filter_z_dims); + ctx->SetOutputDim(framework::GradVarName("ScaleZ"), param_dims); + ctx->SetOutputDim(framework::GradVarName("BiasZ"), param_dims); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + PADDLE_ENFORCE_NOT_NULL( + ctx.InputVar(framework::GradVarName("Y")), + platform::errors::NotFound( + "Can not find Y@GRAD in the execution context.")); + + framework::LibraryType library = framework::LibraryType::kPlain; + framework::DataLayout layout = framework::DataLayout::kAnyLayout; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), + layout, library); + } +}; + +template +class ResNetUnitGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("resnet_unit_grad"); + op->SetInput("X", this->Input("X")); + op->SetInput("FilterX", this->Input("FilterX")); + op->SetInput("ConvX", this->Output("ConvX")); + op->SetInput("ScaleX", this->Input("ScaleX")); + op->SetInput("BiasX", this->Input("BiasX")); + op->SetInput("SavedMeanX", this->Output("SavedMeanX")); + op->SetInput("SavedInvstdX", this->Output("SavedInvstdX")); + op->SetInput("Z", this->Input("Z")); + op->SetInput("FilterZ", this->Input("FilterZ")); + op->SetInput("ConvZ", this->Output("ConvZ")); + op->SetInput("ScaleZ", this->Input("ScaleZ")); + op->SetInput("BiasZ", this->Input("BiasZ")); + op->SetInput("SavedMeanZ", this->Output("SavedMeanZ")); + op->SetInput("SavedInvstdZ", this->Output("SavedInvstdZ")); + op->SetInput("Y", this->Output("Y")); + op->SetInput("BitMask", this->Output("BitMask")); + op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y")); + + op->SetAttrMap(this->Attrs()); + + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetOutput(framework::GradVarName("FilterX"), + this->InputGrad("FilterX")); + op->SetOutput(framework::GradVarName("ScaleX"), this->InputGrad("ScaleX")); + op->SetOutput(framework::GradVarName("BiasX"), this->InputGrad("BiasX")); + op->SetOutput(framework::GradVarName("Z"), this->InputGrad("Z")); + op->SetOutput(framework::GradVarName("FilterZ"), + this->InputGrad("FilterZ")); + op->SetOutput(framework::GradVarName("ScaleZ"), this->InputGrad("ScaleZ")); + op->SetOutput(framework::GradVarName("BiasZ"), this->InputGrad("BiasZ")); + } +}; + +class ResNetUnitOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map& GetInputOutputWithSameType() + const override { + static std::unordered_map m{{"X", /*->*/ "Y"}}; + return m; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(resnet_unit, ops::ResNetUnitOp, ops::ResNetUnitOpMaker, + ops::ResNetUnitOpInferVarType, + ops::ResNetUnitGradOpMaker, + ops::ResNetUnitGradOpMaker); +REGISTER_OPERATOR(resnet_unit_grad, ops::ResNetUnitGradOp); diff --git a/paddle/fluid/operators/fused/resnet_unit_op.cu b/paddle/fluid/operators/fused/resnet_unit_op.cu new file mode 100644 index 00000000000000..b121864f80e4d9 --- /dev/null +++ b/paddle/fluid/operators/fused/resnet_unit_op.cu @@ -0,0 +1,299 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" +#include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" +#include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class ResNetUnitKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + PADDLE_ENFORCE_EQ(platform::CudnnDataType::type, CUDNN_DATA_HALF, + platform::errors::Unavailable( + "ResNetUnitOp only supports float16 for now.")); + + // input x + const Tensor *input_x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *bias_x = ctx.Input("BiasX"); + // norm conv + Tensor *conv_out_x = ctx.Output("ConvX"); + // bn finalize + Tensor *saved_mean_x = ctx.Output("SavedMeanX"); + Tensor *saved_invstd_x = ctx.Output("SavedInvstdX"); + Tensor *running_mean_x = ctx.Output("RunningMeanX"); + Tensor *running_var_x = ctx.Output("RunningVarX"); + // sbar + Tensor *output = ctx.Output("Y"); + Tensor *bitmask = ctx.Output("BitMask"); + // attrs + int padding = ctx.Attr("padding"); + int stride = ctx.Attr("stride"); + int stride_z = ctx.Attr("stride_z"); + int dilation = ctx.Attr("dilation"); + int group = ctx.Attr("group"); + double eps = static_cast(ctx.Attr("epsilon")); + double momentum = static_cast(ctx.Attr("momentum")); + bool has_shortcut = ctx.Attr("has_shortcut"); + bool fuse_add = ctx.Attr("fuse_add"); + bool use_global_stats = ctx.Attr("use_global_stats"); + bool is_test = ctx.Attr("is_test"); + bool is_train = !is_test && !use_global_stats; + std::string act_type = ctx.Attr("act_type"); + + auto input_x_shape = framework::vectorize(input_x->dims()); + auto filter_x_shape = framework::vectorize(filter_x->dims()); + auto param_dims = scale_x->dims(); + auto param_shape = framework::vectorize(scale_x->dims()); + auto output_shape = framework::vectorize(output->dims()); + auto bitmask_shape = framework::vectorize(bitmask->dims()); + int output_channel = filter_x_shape[0]; + int64_t ele_count = + std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()) / + output_channel; + + auto place = ctx.GetPlace(); + auto &dev_ctx = ctx.template device_context(); + + // 1. Conv + Tensor sum_x; + Tensor sum_of_squares_x; + sum_x.Resize(param_dims); + sum_of_squares_x.Resize(param_dims); + CudnnNormConvolution conv_x_op(dev_ctx, input_x_shape, filter_x_shape, + output_shape, padding, stride, dilation, + group); + conv_x_op.Forward(dev_ctx, *input_x, *filter_x, conv_out_x, &sum_x, + &sum_of_squares_x); + + // 2. 
BN + Tensor equiv_scale_x; + Tensor equiv_bias_x; + equiv_scale_x.Resize(param_dims); + equiv_bias_x.Resize(param_dims); + CudnnBNStatsFinalize bn_x_op(dev_ctx, param_shape); + bn_x_op.Forward(dev_ctx, sum_x, sum_of_squares_x, *scale_x, *bias_x, + saved_mean_x, saved_invstd_x, running_mean_x, running_var_x, + &equiv_scale_x, &equiv_bias_x, eps, momentum, ele_count, + is_train); + + // 3. scale + bias + add + relu + CudnnScaleBiasAddRelu sbar_op(dev_ctx, act_type, fuse_add, has_shortcut, + output_shape, param_shape, bitmask_shape); + if (has_shortcut) { + // input z + const Tensor *input_z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *bias_z = ctx.Input("BiasZ"); + // norm conv + Tensor *conv_out_z = ctx.Output("ConvZ"); + // bn finalize + Tensor *saved_mean_z = ctx.Output("SavedMeanZ"); + Tensor *saved_invstd_z = ctx.Output("SavedInvstdZ"); + Tensor *running_mean_z = ctx.Output("RunningMeanZ"); + Tensor *running_var_z = ctx.Output("RunningVarZ"); + + auto input_z_shape = framework::vectorize(input_z->dims()); + auto filter_z_shape = framework::vectorize(filter_z->dims()); + + // 3.1 Conv for second input + Tensor sum_z; + Tensor sum_of_squares_z; + sum_z.Resize(param_dims); + sum_of_squares_z.Resize(param_dims); + CudnnNormConvolution conv_z_op(dev_ctx, input_z_shape, filter_z_shape, + output_shape, padding, stride_z, + dilation, group); + conv_z_op.Forward(dev_ctx, *input_z, *filter_z, conv_out_z, &sum_z, + &sum_of_squares_z); + + // 3.2 BN for second input + Tensor equiv_scale_z; + Tensor equiv_bias_z; + equiv_scale_z.Resize(param_dims); + equiv_bias_z.Resize(param_dims); + CudnnBNStatsFinalize bn_z_op(dev_ctx, param_shape); + bn_z_op.Forward(dev_ctx, sum_z, sum_of_squares_z, *scale_z, *bias_z, + saved_mean_z, saved_invstd_z, running_mean_z, + running_var_z, &equiv_scale_z, &equiv_bias_z, eps, + momentum, ele_count, is_train); + // 3.3 sbar + sbar_op.Forward(dev_ctx, *conv_out_x, equiv_scale_x, equiv_bias_x, + conv_out_z, &equiv_scale_z, &equiv_bias_z, output, + bitmask); + } else { + const Tensor *input_z = fuse_add ? 
ctx.Input("Z") : nullptr; + sbar_op.Forward(dev_ctx, *conv_out_x, equiv_scale_x, equiv_bias_x, + input_z, nullptr, nullptr, output, bitmask); + } + } +}; + +template +class ResNetUnitGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("It must use CUDAPlace.")); + PADDLE_ENFORCE_EQ(platform::CudnnDataType::type, CUDNN_DATA_HALF, + platform::errors::Unavailable( + "ResNetUnitOp only supports float16 for now.")); + + const Tensor *y_grad = ctx.Input(framework::GradVarName("Y")); + + const Tensor *x = ctx.Input("X"); + const Tensor *filter_x = ctx.Input("FilterX"); + const Tensor *scale_x = ctx.Input("ScaleX"); + const Tensor *bias_x = ctx.Input("BiasX"); + const Tensor *saved_mean_x = ctx.Input("SavedMeanX"); + const Tensor *saved_invstd_x = ctx.Input("SavedInvstdX"); + + const Tensor *conv_out_x = ctx.Input("ConvX"); + const Tensor *output = ctx.Input("Y"); + const Tensor *bitmask = ctx.Input("BitMask"); + + Tensor *x_grad = ctx.Output(framework::GradVarName("X")); + Tensor *filter_x_grad = + ctx.Output(framework::GradVarName("FilterX")); + Tensor *scale_x_grad = ctx.Output(framework::GradVarName("ScaleX")); + Tensor *bias_x_grad = ctx.Output(framework::GradVarName("BiasX")); + + int padding = ctx.Attr("padding"); + int stride = ctx.Attr("stride"); + int stride_z = ctx.Attr("stride_z"); + int dilation = ctx.Attr("dilation"); + int group = ctx.Attr("group"); + double eps = static_cast(ctx.Attr("epsilon")); + double momentum = static_cast(ctx.Attr("momentum")); + bool has_shortcut = ctx.Attr("has_shortcut"); + bool fuse_add = ctx.Attr("fuse_add"); + bool use_global_stats = ctx.Attr("use_global_stats"); + std::string act_type = ctx.Attr("act_type"); + + auto x_shape = framework::vectorize(x->dims()); + auto filter_x_shape = framework::vectorize(filter_x->dims()); + auto param_shape = framework::vectorize(scale_x->dims()); + auto output_shape = framework::vectorize(output->dims()); + auto bitmask_shape = framework::vectorize(bitmask->dims()); + + auto place = ctx.GetPlace(); + auto &dev_ctx = ctx.template device_context(); + + // 1. 
Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, + // scale_x_grad, bias_x_grad + Tensor conv_out_x_grad; + conv_out_x_grad.Resize(conv_out_x->dims()); + CudnnScaleBiasAddRelu sbar_x_op(dev_ctx, act_type, fuse_add, + has_shortcut, output_shape, param_shape, + bitmask_shape); + if (has_shortcut) { + // X Z + // | | + // NormConv NormConv + // | | + // BNStatsFinalize BNStatsFinalize + // \ / + // ScaleBiasAddRelu + // | + // Y + const Tensor *z = ctx.Input("Z"); + const Tensor *filter_z = ctx.Input("FilterZ"); + const Tensor *scale_z = ctx.Input("ScaleZ"); + const Tensor *bias_z = ctx.Input("BiasZ"); + const Tensor *saved_mean_z = ctx.Input("SavedMeanZ"); + const Tensor *saved_invstd_z = ctx.Input("SavedInvstdZ"); + const Tensor *conv_out_z = ctx.Input("ConvZ"); + + Tensor *z_grad = ctx.Output(framework::GradVarName("Z")); + Tensor *filter_z_grad = + ctx.Output(framework::GradVarName("FilterZ")); + Tensor *scale_z_grad = + ctx.Output(framework::GradVarName("ScaleZ")); + Tensor *bias_z_grad = ctx.Output(framework::GradVarName("BiasZ")); + + // 1.1 Backward of BN + Add (+ Relu) for x, get conv_out_x_grad, + // scale_x_grad, bias_x_grad and z_grad_temp + Tensor z_grad_temp; + z_grad_temp.Resize(conv_out_z->dims()); + sbar_x_op.Backward(dev_ctx, *y_grad, *conv_out_x, *scale_x, *bias_x, + *saved_mean_x, *saved_invstd_x, bitmask, + &conv_out_x_grad, &z_grad_temp, scale_x_grad, + bias_x_grad, eps); + + // 1.2 bn backward for z, get conv_out_z_grad, dscale_z, dbias_z + Tensor conv_out_z_grad; + conv_out_z_grad.Resize(conv_out_z->dims()); + CudnnScaleBiasAddRelu sbar_z_op( + dev_ctx, "", false, false, output_shape, param_shape, bitmask_shape); + sbar_z_op.Backward(dev_ctx, z_grad_temp, *conv_out_z, *scale_z, *bias_z, + *saved_mean_z, *saved_invstd_z, nullptr, + &conv_out_z_grad, nullptr, scale_z_grad, bias_z_grad, + eps); + + // 1.3 Backward of Conv for z, get z_grad and filter_z_grad + auto z_shape = framework::vectorize(z->dims()); + auto filter_z_shape = framework::vectorize(filter_z->dims()); + CudnnNormConvolutionGrad conv_z_op(dev_ctx, z_shape, filter_z_shape, + output_shape, padding, stride_z, + dilation, group); + conv_z_op.Backward(dev_ctx, *z, *filter_z, conv_out_z_grad, z_grad, + filter_z_grad); + } else { + // 1.1 Backward of BN (+ Add + Relu) for x, get conv_out_x_grad, + // scale_x_grad, bias_x_grad (and z_grad) + Tensor *z_grad = + fuse_add ? ctx.Output(framework::GradVarName("Z")) : nullptr; + sbar_x_op.Backward(dev_ctx, *y_grad, *conv_out_x, *scale_x, *bias_x, + *saved_mean_x, *saved_invstd_x, bitmask, + &conv_out_x_grad, z_grad, scale_x_grad, bias_x_grad, + eps); + } + + // 2. Backward of Conv for x, get x_grad and filter_x_grad + bool use_addto = ctx.Attr("use_addto"); + CudnnNormConvolutionGrad conv_x_op(dev_ctx, x_shape, filter_x_shape, + output_shape, padding, stride, + dilation, group); + conv_x_op.Backward(dev_ctx, *x, *filter_x, conv_out_x_grad, x_grad, + filter_x_grad, use_addto); + } +}; + +} // namespace operators +} // namespace paddle + +#if CUDNN_VERSION >= 8000 +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(resnet_unit, ops::ResNetUnitKernel); +REGISTER_OP_CUDA_KERNEL(resnet_unit_grad, + ops::ResNetUnitGradKernel); +#endif diff --git a/paddle/fluid/operators/gather_nd_op_npu.cc b/paddle/fluid/operators/gather_nd_op_npu.cc index d04e0bce36fab2..8102322bd3b0ce 100644 --- a/paddle/fluid/operators/gather_nd_op_npu.cc +++ b/paddle/fluid/operators/gather_nd_op_npu.cc @@ -18,7 +18,10 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template class GatherNdNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -49,14 +52,12 @@ class GatherNdNPUKernel : public framework::OpKernel { framework::proto::VarType::INT64))); const auto &runner = NpuOpRunner("GatherNd", {*x, *index}, {*out}, {}); - auto stream = - ctx.template device_context() - .stream(); + auto stream = ctx.template device_context().stream(); runner.Run(stream); } }; -template +template class GatherNdGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { @@ -91,10 +92,7 @@ class GatherNdGradNPUKernel : public framework::OpKernel { dout = &tmp_tensor2; } - auto stream = - ctx.template device_context() - .stream(); - + auto stream = ctx.template device_context().stream(); platform::NPUMemsetAsync(static_cast(p), 0, dx->numel() * sizeof(T), stream); @@ -108,13 +106,13 @@ class GatherNdGradNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - gather_nd, ops::GatherNdNPUKernel, - ops::GatherNdNPUKernel); - -REGISTER_OP_NPU_KERNEL( - gather_nd_grad, - ops::GatherNdGradNPUKernel, - ops::GatherNdGradNPUKernel); +REGISTER_OP_NPU_KERNEL(gather_nd, + ops::GatherNdNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::GatherNdNPUKernel, +#endif + ops::GatherNdNPUKernel); + +REGISTER_OP_NPU_KERNEL(gather_nd_grad, + ops::GatherNdGradNPUKernel, + ops::GatherNdGradNPUKernel); diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index b1857b49eede0d..da386052c7dc01 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -82,6 +82,9 @@ static inline void clip(const platform::CPUDeviceContext& ctx, auto grid_abs = grid_slice_t.abs(); auto extra = grid_abs - (grid_abs / double_range).floor() * double_range; grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } } else { auto double_range = static_cast((max_val + 1) * 2); auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); @@ -128,6 +131,9 @@ static inline void clipWithMask(const platform::CPUDeviceContext& ctx, grid_scale_t * ((is_neg == one_more_flip).template cast() - (is_neg != one_more_flip).template cast()); grid_slice_t.device(place) = extra.cwiseMin(double_range - extra); + if (max_val == 0) { + grid_slice_t.device(place) = grid_slice_t.constant(static_cast(0)); + } } else { auto double_range = static_cast((max_val + 1) * 2); auto grid_abs = (grid_slice_t + static_cast(0.5)).abs(); diff --git a/paddle/fluid/operators/group_norm_op_npu.cc b/paddle/fluid/operators/group_norm_op_npu.cc new file mode 100644 index 00000000000000..4ef8320cbdecd6 --- /dev/null +++ b/paddle/fluid/operators/group_norm_op_npu.cc @@ -0,0 +1,306 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/group_norm_op.h"
+#include <vector>
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+
+template <typename T>
+struct GroupNormFunction {
+ public:
+  explicit GroupNormFunction(const framework::ExecutionContext& ctx)
+      : ctx(ctx) {
+    place = ctx.GetPlace();
+    stream = ctx.template device_context<paddle::platform::NPUDeviceContext>()
+                 .stream();
+  }
+  void ReduceMean(const Tensor* x, Tensor* y, const std::vector<int>& dim,
+                  bool keep_dims = true) {
+    // y should be init first
+    const auto& runner = NpuOpRunner("ReduceMeanD", {*x}, {*y},
+                                     {{"axes", dim}, {"keep_dims", keep_dims}});
+    runner.Run(stream);
+  }
+  void ReduceSum(const Tensor* x, Tensor* y, const std::vector<int>& dim,
+                 bool keep_dims = true) {
+    // y should be init first
+    const auto& runner = NpuOpRunner("ReduceSumD", {*x}, {*y},
+                                     {{"axes", dim}, {"keep_dims", keep_dims}});
+    runner.Run(stream);
+  }
+  void Add(const Tensor* x, const Tensor* y, Tensor* z) {
+    // z should be init first
+    const auto& runner = NpuOpRunner("AddV2", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Sub(const Tensor* x, const Tensor* y, Tensor* z) {
+    // z should be init first
+    const auto& runner = NpuOpRunner("Sub", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Mul(const Tensor* x, const Tensor* y, Tensor* z) {
+    // z should be init first
+    const auto& runner = NpuOpRunner("Mul", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Div(const Tensor* x, const Tensor* y, Tensor* z) {
+    // z should be init first
+    const auto& runner = NpuOpRunner("Div", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void DivNoNan(const Tensor* x, const Tensor* y, Tensor* z) {
+    // z should be init first
+    const auto& runner = NpuOpRunner("DivNoNan", {*x, *y}, {*z}, {});
+    runner.Run(stream);
+  }
+  void Transpose(const Tensor* x, Tensor* y, const std::vector<int>& axis) {
+    // y should be init first
+    const auto& runner =
+        NpuOpRunner("TransposeD", {*x}, {*y}, {{"perm", axis}});
+    runner.Run(stream);
+  }
+  void Sqrt(const Tensor* x, Tensor* y) {
+    // y should be init first
+    const auto& runner = NpuOpRunner("Sqrt", {*x}, {*y}, {});
+    runner.Run(stream);
+  }
+  void Adds(const Tensor* x, float scalar, Tensor* y) {
+    // y should be init first
+    const auto& runner = NpuOpRunner("Adds", {*x}, {*y}, {{"value", scalar}});
+    runner.Run(stream);
+  }
+  Tensor ReduceMeanToNG(const Tensor* x, const DataLayout& data_layout,
+                        const int64_t N, const int64_t C, const int64_t H,
+                        const int64_t W, const int G) {
+    Tensor y(x->type());
+    // y.mutable_data<T>({N, G, 1}, place);
+    if (data_layout == DataLayout::kNCHW) {
+      y.mutable_data<T>({N, G, 1}, place);
+      // shape of x is [N, G, C*H*W/G]
+      this->ReduceMean(x, &y, std::vector<int>{2});
+    } else {
+      y.mutable_data<T>({N, 1, G}, place);
+      // shape of x is [N, C*H*W/G, G]
+      Tensor x_trans(x->type());
+      x_trans.mutable_data<T>({N, G, C * H * W / G}, place);
+      this->Transpose(x, &x_trans, std::vector<int>{0, 2, 1});
+      this->ReduceMean(&x_trans, &y, std::vector<int>{2});
+    }
+    return y;
+  }
+
+ private:
+  platform::Place place;
+  aclrtStream stream;
+  const framework::ExecutionContext& ctx;
+};
+
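+// Summary of how the kernels below use these helpers (an illustrative
+// sketch, not additional executable code): x is viewed as
+// [N * groups, C * H * W / groups], mean and variance are reduced over the
+// grouped axis, and the normalized output is
+//   y = (x - mean) / sqrt(var + epsilon) * scale + bias
+// with scale and bias broadcast per channel.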
+template +class GroupNormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const float epsilon = ctx.Attr("epsilon"); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* x = ctx.Input("X"); + + auto* y = ctx.Output("Y"); + auto* mean = ctx.Output("Mean"); + auto* var = ctx.Output("Variance"); + const auto groups = ctx.Attr("groups"); + + auto place = ctx.GetPlace(); + Tensor xnorm(x->type()); + xnorm.mutable_data(x->dims(), place); + GroupNormFunction F(ctx); + if (data_layout != DataLayout::kNCHW) { + xnorm.Resize({x->dims()[0], x->dims()[3], x->dims()[1], x->dims()[2]}); + F.Transpose(x, &xnorm, std::vector{0, 3, 1, 2}); + } else { + TensorCopy(*x, platform::NPUPlace(), &xnorm); + } + auto N = xnorm.dims()[0]; + auto C = xnorm.dims()[1]; + auto H = xnorm.dims()[2]; + auto W = xnorm.dims()[3]; + xnorm.Resize({N * groups, C * H * W / groups}); + std::vector axis = {1}; + auto reduce_dim = mean->dims(); + + mean->mutable_data({N * groups, 1}, place); + var->mutable_data({N * groups, 1}, place); + y->mutable_data(place); + F.ReduceMean(&xnorm, mean, axis); + + F.Sub(&xnorm, mean, &xnorm); + Tensor sqr(x->type()); + sqr.mutable_data(xnorm.dims(), place); + + F.Mul(&xnorm, &xnorm, &sqr); + F.ReduceMean(&sqr, var, axis); + Tensor std(x->type()); + std.mutable_data(var->dims(), place); + F.Adds(var, epsilon, &std); + F.Sqrt(&std, &std); + y->Resize(xnorm.dims()); + F.Div(&xnorm, &std, y); + y->Resize({N, C, H, W}); + if (scale) { + Tensor scale_t(scale->type()); + scale_t.ShareDataWith(*scale); + scale_t.Resize({C, 1, 1}); + F.Mul(y, &scale_t, y); + } + if (bias) { + Tensor bias_t(bias->type()); + bias_t.ShareDataWith(*bias); + bias_t.Resize({C, 1, 1}); + F.Add(y, &bias_t, y); + } + if (data_layout != DataLayout::kNCHW) { + F.Transpose(y, y, std::vector{0, 2, 3, 1}); + y->Resize({x->dims()}); + } + mean->Resize(reduce_dim); + var->Resize(reduce_dim); + } +}; + +template +class GroupNormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const std::string data_layout_str = ctx.Attr("data_layout"); + const DataLayout data_layout = + framework::StringToDataLayout(data_layout_str); + const float epsilon = ctx.Attr("epsilon"); + auto* y = ctx.Input("Y"); + auto* var = ctx.Input("Variance"); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* d_y = ctx.Input(framework::GradVarName("Y")); + const auto G = ctx.Attr("groups"); + + // init output + auto* d_x = ctx.Output(framework::GradVarName("X")); + auto* d_scale = ctx.Output(framework::GradVarName("Scale")); + auto* d_bias = ctx.Output(framework::GradVarName("Bias")); + + GroupNormFunction F(ctx); + auto place = ctx.GetPlace(); + auto _type = y->type(); + + Tensor xnorm(_type); + xnorm.mutable_data(y->dims(), place); + Tensor scale_share(_type); + scale_share.ShareDataWith(*scale); + Tensor bias_share(_type); + bias_share.ShareDataWith(*bias); + + int64_t N = y->dims()[0]; + int64_t C, H, W; + framework::DDim scale_bias_dim; + if (data_layout == DataLayout::kNCHW) { + C = y->dims()[1]; + H = y->dims()[2]; + W = y->dims()[3]; + scale_bias_dim = framework::make_ddim({C, 1, 1}); + } else { + C = y->dims()[3]; + H = y->dims()[1]; + W = y->dims()[2]; + scale_bias_dim = framework::make_ddim({1, 1, C}); + } + 
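+    // Shape sketch (hypothetical values): with N = 2, C = 6, H = W = 4 and
+    // G = 3 groups, C * H * W / G = 32, so xnorm and d_xnorm_std are later
+    // reshaped to [2, 3, 32] (NCHW) or [2, 32, 3] (NHWC), and std to
+    // [2, 3, 1] or [2, 1, 3] respectively.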
scale_share.Resize(scale_bias_dim); + bias_share.Resize(scale_bias_dim); + F.Sub(y, &bias_share, &xnorm); + F.DivNoNan(&xnorm, &scale_share, &xnorm); + + if (d_bias) { + d_bias->mutable_data(place); + if (data_layout == DataLayout::kNCHW) { + F.ReduceSum(d_y, d_bias, std::vector{0, 2, 3}, false); + } else { + F.ReduceSum(d_y, d_bias, std::vector{0, 1, 2}, false); + } + } + if (d_scale) { + d_scale->mutable_data(place); + Tensor dy_xnorm(_type); + dy_xnorm.mutable_data(d_y->dims(), place); + F.Mul(d_y, &xnorm, &dy_xnorm); + if (data_layout == DataLayout::kNCHW) { + F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 2, 3}); + } else { + F.ReduceSum(&dy_xnorm, d_scale, std::vector{0, 1, 2}); + } + } + + // std = Sqrt(var+epsilon), init shape = [ N, G ] + Tensor std(_type); + std.mutable_data(var->dims(), place); + F.Adds(var, epsilon, &std); + F.Sqrt(&std, &std); + // d_xnorm_std = dy_proc * scale / std + Tensor d_xnorm_std(_type); + d_xnorm_std.mutable_data(y->dims(), place); + F.Mul(d_y, &scale_share, &d_xnorm_std); + if (data_layout == DataLayout::kNCHW) { + xnorm.Resize({N, G, C * H * W / G}); + d_xnorm_std.Resize({N, G, C * H * W / G}); + std.Resize({N, G, 1}); + } else { + xnorm.Resize({N, C * H * W / G, G}); + d_xnorm_std.Resize({N, C * H * W / G, G}); + std.Resize({N, 1, G}); + } + F.Div(&d_xnorm_std, &std, &d_xnorm_std); + + // d_x = d_xnorm_std + // - Mean ( d_xnorm_std * x_norm, axis=1, keepdim=True ) * x_norm + // - Mean ( d_xnorm_std, axis=1, keepdim=True ) + d_x->mutable_data(place); + d_x->Resize(xnorm.dims()); + F.Mul(&d_xnorm_std, &xnorm, d_x); + Tensor dx1 = F.ReduceMeanToNG(d_x, data_layout, N, C, H, W, G); + F.Mul(&dx1, &xnorm, d_x); + + Tensor dx2 = F.ReduceMeanToNG(&d_xnorm_std, data_layout, N, C, H, W, G); + + F.Sub(&d_xnorm_std, d_x, d_x); + F.Sub(d_x, &dx2, d_x); + + d_x->Resize(y->dims()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(group_norm, ops::GroupNormNPUKernel, + ops::GroupNormNPUKernel); +REGISTER_OP_NPU_KERNEL(group_norm_grad, ops::GroupNormGradNPUKernel, + ops::GroupNormGradNPUKernel); diff --git a/paddle/fluid/operators/huber_loss_op_npu.cc b/paddle/fluid/operators/huber_loss_op_npu.cc index a9426155941544..33cbaec4dfc462 100644 --- a/paddle/fluid/operators/huber_loss_op_npu.cc +++ b/paddle/fluid/operators/huber_loss_op_npu.cc @@ -1,13 +1,16 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/ #include "paddle/fluid/operators/huber_loss_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc index b624d03cc85559..825229282f3dac 100644 --- a/paddle/fluid/operators/index_select_op_npu.cc +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -99,10 +99,11 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { transed_out_dims[i] = out_dims[in_trans_perm[i]]; } transed_out_grad.mutable_data(transed_out_dims, ctx.GetPlace()); - framework::NPUAttributeMap in_trans_attr = {{"perm", in_trans_perm}}; - - const auto& in_trans_runner = NpuOpRunner( - "TransposeD", {*out_grad}, {transed_out_grad}, in_trans_attr); + NpuOpRunner in_trans_runner; + in_trans_runner.SetType("Transpose") + .AddInput(*out_grad) + .AddInput(std::move(in_trans_perm)) + .AddOutput(transed_out_grad); in_trans_runner.Run(stream); Tensor sum_out; @@ -133,10 +134,12 @@ class IndexSelectGradNPUKernel : public framework::OpKernel { for (int i = 1 + dim; i < x_dims.size(); ++i) { out_trans_perm.push_back(i); } - framework::NPUAttributeMap out_trans_attr = {{"perm", out_trans_perm}}; x_grad->mutable_data(ctx.GetPlace()); - const auto& out_trans_runner = - NpuOpRunner("TransposeD", {sum_out}, {*x_grad}, out_trans_attr); + NpuOpRunner out_trans_runner; + out_trans_runner.SetType("Transpose") + .AddInput(sum_out) + .AddInput(std::move(out_trans_perm)) + .AddOutput(*x_grad); out_trans_runner.Run(stream); } } diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 6f8b89ce64523d..fe9228135606dc 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -1198,7 +1198,12 @@ static void Interpolate2DCUDAFwd(const framework::ExecutionContext& ctx, input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, align_mode, data_layout); } else if ("bicubic" == interp_method) { - KeBicubicInterpFw<<<<>>( input_data, in_h, in_w, n, in_chw, output_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); @@ -1606,9 +1611,11 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, const T align_type_value = (align_mode == 0 && !align_corners) ? 0.5f : 0; bool is_nchw = (data_layout == DataLayout::kNCHW) ? true : false; bool optimize_flag = false; +#ifndef __HIPCC__ optimize_flag = (in_h < (out_h >> 6) && in_w < (out_w >> 6)) ? true : ((in_h == 1 && in_w == 1) ? true : false); +#endif if (optimize_flag & is_nchw) { KeBilinearInterpBwShareMemory< @@ -1623,7 +1630,12 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, ratio_h, ratio_w, align_type_value, is_nchw); } } else if ("bicubic" == interp_method) { - KeBicubicInterpBw<<<<>>( input_grad_data, in_h, in_w, n, in_chw, output_grad_data, out_h, out_w, n, out_chw, c, ratio_h, ratio_w, align_corners, data_layout); diff --git a/paddle/fluid/operators/interpolate_v2_op_npu.cc b/paddle/fluid/operators/interpolate_v2_op_npu.cc index d893fbd0196289..b30c7ac810c011 100644 --- a/paddle/fluid/operators/interpolate_v2_op_npu.cc +++ b/paddle/fluid/operators/interpolate_v2_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/interpolate_v2_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/is_empty_op_npu.cc b/paddle/fluid/operators/is_empty_op_npu.cc index 9155afecd021b7..01579abd74d234 100644 --- a/paddle/fluid/operators/is_empty_op_npu.cc +++ b/paddle/fluid/operators/is_empty_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/is_empty_op.h" diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index a36c76d7881737..73316d66b6cf26 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -135,17 +135,16 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { } // namespace details /** - * @brief Perform unary calculation according to OpFunc. Size of input and + * @brief Perform unary calculation according to OpFunc. Shape of input and * output are the same. * * @template paraments - * InT: Data type of in. - * OutT: Data type of out. + * InT: The data type of in. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following: * template * struct XxxFunctor { @@ -170,21 +169,20 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, } /** - * @brief Binary calculation according to OpFunc. Size of The input and output + * @brief Binary calculation according to OpFunc. Shape of The input and output * are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. - * NX: The number of data columns loaded by each thread. - * NY: The number of data rows loaded by each thread. + * InT: The data type of in1 and in2. + * OutT: The data type of out. + * NX: The number of data columns computed by each thread. + * NY: The number of data rows computed by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following: - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT& a, const InT& b) const { + * HOSTDEVICE InT operator()(const InT& a, const InT& b) const { * return ...; * } * }; @@ -193,7 +191,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * out: The register pointer of out, the size is NX * NY. 
* in1: The register pointer of fist input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY. - * compute: Compute function which was declared like OpFunc(). + * compute: Compute function which was declared like OpFunc(). */ template @@ -207,21 +205,20 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, } /** - * @brief Ternary calculation according to OpFunc. Size of input and output + * @brief Ternary calculation according to OpFunc. Shape of input and output * are the same. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1 and in2. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT& a, const InT& b, const InT& c) + * HOSTDEVICE InT operator()(const InT& a, const InT& b, const InT& c) * const { * return ...; * } @@ -232,7 +229,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in1: The register pointer of fist input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY. * in3: The register pointer of third input, size is NX * NY. - * compute: Compute function which was declared like OpFunc(). + * compute: Compute function which was declared like OpFunc(). */ template @@ -247,30 +244,29 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, } /** - * @brief Multivariate calculation according to OpFunc. Size of input and output - * are the same. + * @brief Multivariate calculation according to OpFunc. Shape of inputs and + * output are the same. * * @template paraments - * InT: Data type of in1, in2 and in3. - * OutT: Data type of out. + * InT: The data type of in1, in2 and in3. + * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. - * Arity: The size of ins + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * Arity: The size of ins. * OpFunc: Compute functor which has an operator() as following: - * template + * template * struct XxxFunctor { - * HOSTDEVICE OutT operator()(const InT* args) const { + * HOSTDEVICE InT operator()(const InT* args) const { * return ...; * } * }; * * @param * out: The register pointer of out, the size is NX * NY. - * ins: An array of pointers consisting of multiple inputs. - * compute: Compute function which was declared like OpFunc(). + * ins: A pointers of array consisting of multiple inputs. + * compute: Compute function which was declared like OpFunc(). */ template @@ -293,13 +289,12 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * shape is [NY, NX]. * * @template paraments - * InT: Data type of in1 and in2. - * OutT: Data type of out. + * InT: The data type of in1 and in2. 
+ * OutT: The data type of out. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * OpFunc: Compute functor which has an operator() as following * template * struct XxxFunctor { @@ -339,8 +334,7 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * NX: The number of data continuously loaded by each thread. * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * ReduceFunctor: Compute functor which has an operator() as following * template * struct ReduceFunctor { diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index c720bedf0a3afc..860072bd0c52ec 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -118,8 +118,8 @@ struct BroadcastConfig { } // namespace details /** - * @brief Read 2D data from global memory to registers according to Tx type, and - * store it as Ty type. + * @brief Read 2D data from global memory to register according to Tx type, and + * store it as Ty type into register. * * @template paraments * Tx: The type of data stored in the global memory. @@ -127,8 +127,7 @@ struct BroadcastConfig { * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than * NX x NY x blockDim, boundary judgment is required to avoid memory access @@ -136,20 +135,20 @@ struct BroadcastConfig { * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Data pointer of the current block. - * size_nx: The current block needs to load size_nx columns of data, this - * parameter will be used when IsBoundary = true. - * size_ny: The current block needs to load size_ny rows of data. This parameter - * will be used when IsBoundary = true. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * src: The data pointer of the current block. + * size_nx: The maximum offset of the current block is size_nx elements in the + * lowest dimension. The parameters are only calculated when isboundary = true. + * size_ny: The maximum offset of the current block is size_ny elements in the + * first dimension. The parameters are only calculated when isboundary = true. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. 
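The stride wording above is easier to follow with numbers. The sketch below is a plain host-side replay of the thread_offset / stride_nx / stride_ny arithmetic; the index expression matches the one spelled out in the new 2D WriteData added later in this header, and NX, NY, the strides and the offset are all invented example values:

    // Host-side illustration of the 2D register/global indexing used by the
    // 2D ReadData/WriteData primitives; every constant here is an example.
    #include <cstdio>

    int main() {
      const int NX = 2, NY = 2;
      const int stride_nx = 1;      // step between consecutive columns (last dim)
      const int stride_ny = 8;      // step between consecutive rows (first dim)
      const int thread_offset = 3;  // threadIdx.x of this thread
      for (int idy = 0; idy < NY; ++idy) {
        for (int idx = 0; idx < NX; ++idx) {
          int gmem = thread_offset + idx * stride_nx + idy * stride_ny;
          std::printf("register[%d] <-> global[%d]\n", idy * NX + idx, gmem);
        }
      }
      return 0;  // pairs: 0<->3, 1<->4, 2<->11, 3<->12
    }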
*/ template __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, int size_nx, int size_ny, int stride_nx, int stride_ny) { - int thread_offset = threadIdx.x * NX; + int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; // Each branch is added for better performance @@ -165,7 +164,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idy = 0; idy < NY; ++idy) { if (IsBoundary) { - if (idy >= size_ny) { + if (idy * stride_ny >= size_ny) { break; } } @@ -175,7 +174,7 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idx = 0; idx < NX; ++idx) { if (IsBoundary) { - if (idx >= left_size_nx) { + if (idx * stride_nx >= left_size_nx) { break; } } @@ -185,14 +184,14 @@ __device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, #pragma unroll for (int idx = 0; idx < NX; ++idx) { if (IsBoundary) { - if (idx >= left_size_nx) { + if (idx * stride_nx >= left_size_nx) { break; } } #pragma unroll for (int idy = 0; idy < NY; ++idy) { if (IsBoundary) { - if (idy >= size_ny) { + if (idy * stride_ny >= size_ny) { break; } } @@ -223,25 +222,24 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { } /** - * @brief Read 2D data from global memory to registers. When IsBoundary = true + * @brief Read 1D data from global memory to register. When IsBoundary = true * and (NX % 4 == 0 or Nx % 2 == 0), vectorized load data will be used to * improve memory access efficiency. * * @template paraments - * T: Data type of src and dst. - * NX: The number of data continuously loaded by each thread. - * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * T: The type of data. + * NX: Each thread load NX data from global memory continuously. + * NY: Each thread need to load NY rows, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Whether to make an out-of-bounds judgment on access to memory. * When the number of data processed by this block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Data pointer of the current block. + * src: The data pointer of the current block. * size: The current block needs to load size data continuously. */ template @@ -276,31 +274,29 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, } /** - * @brief Read 2D data from global memory to registers for broadcast. + * @brief Read 2D data from global memory to registers with broadcast form. * * @template paraments * T: The type of data stored in the global memory. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. 
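The in[1, 35] / out[32, 35] example above works out as follows: the first input dimension is broadcast, so output linear index k always reads input element k % 35. The sketch below reproduces that coordinate mapping with plain division and modulo; BroadcastConfig itself stores precomputed fast divmods and strides, and the shapes are the ones from the comment, everything else is illustrative:

    // Map a linear index of out[32, 35] back onto in[1, 35] under broadcasting.
    #include <cstdio>

    int main() {
      const int out_dims[2] = {32, 35};
      const int in_dims[2] = {1, 35};  // the first dim is broadcast
      // Row-major input strides, with broadcast dims forced to stride 0.
      const int in_strides[2] = {in_dims[0] == 1 ? 0 : in_dims[1], 1};
      for (int k = 0; k < out_dims[0] * out_dims[1]; ++k) {
        int row = k / out_dims[1];  // divmod over the output shape
        int col = k % out_dims[1];
        int index_src = row * in_strides[0] + col * in_strides[1];
        if (k < 3 || k == 35) {
          std::printf("out[%d] -> in[%d]\n", k, index_src);  // 0->0, 1->1, 2->2, 35->0
        }
      }
      return 0;
    }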
* IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Raw input data pointer of kernel. - * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * src: The original input data pointer of this kernel. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX. * config: Calculation configuration of broadcast. It is used to calculate the - * coordinate mapping relationship between output data and input data. Please - * refer to the sample code for specific usage. + * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. */ template @@ -308,7 +304,7 @@ __device__ __forceinline__ void ReadDataBc( T* dst, const T* __restrict__ src, uint32_t block_offset, details::BroadcastConfig config, int total_num_output, int stride_nx, int stride_ny) { - uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t thread_offset = block_offset + threadIdx.x; uint32_t index_src = 0; #pragma unroll @@ -334,37 +330,33 @@ __device__ __forceinline__ void ReadDataBc( } /** - * @brief Read 2D data from global memory to registers for reduce. + * @brief Read 2D data from global memory to register with reduce form. * * @template paraments - * T: The type of data stored in the global memory. + * T: The type of data. * NX: The number of data columns loaded by each thread. * NY: The number of data rows loaded by each thread. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: * dst: The register pointer of the thread, the size is NX * NY. - * src: Raw input data pointer of kernel. - * block_offset: Data offset of this block, blockDim.x * blockIdx.x * NX; + * src: The input data pointer of this block. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX. * index_cal: Calculation configuration of Reduce. It is used to calculate the - * coordinate mapping relationship between output data and input data. Please - * refer to the sample code for specific usage. - * block_offset: data offset of this block, blockDim.x * blockIdx.x * NX; - * index_cal: get the global index in src, attention config was declared in - * host; + * coordinate mapping relationship between output data and input data. 
* size_nx: The current block needs to load size_nx columns of data, this - * parameter will be used when IsBoundary = true. - * size_ny: The current block needs to load size_ny rows of data. This parameter + * parameter will participate in the calculation when isboundary = true. + * size_ny: The current block needs to load size_ny rows of data, this parameter + * will participate in the calculation when isboundary = true. * will be used when IsBoundary = true. - * stride_nx: The stride of cols. - * stride_ny: The stride of rows. + * stride_nx: Each read one element stride stride_nx columns. + * stride_ny: Each read one element stride stride_ny raws. * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. */ @@ -375,10 +367,13 @@ __device__ __forceinline__ void ReadDataReduce( const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, int stride_ny, bool reduce_last_dim) { int thread_offset = 0; + int left_idx = 0; if (reduce_last_dim) { - thread_offset = block_offset + threadIdx.x; + thread_offset = threadIdx.x; + left_idx = threadIdx.y; } else { - thread_offset = block_offset + threadIdx.y; + thread_offset = threadIdx.y; + left_idx = threadIdx.x; } if (NX == 1) { @@ -389,30 +384,25 @@ __device__ __forceinline__ void ReadDataReduce( break; } } - uint32_t index_src = index_cal(thread_offset); + uint32_t index_src = index_cal(thread_offset + block_offset); dst[ny] = src[index_src]; thread_offset += stride_ny; } } else { #pragma unroll for (int nx = 0; nx < NX; ++nx) { - if (IsBoundary) { - if (nx * stride_nx >= size_nx) { - break; - } - } #pragma unroll for (int ny = 0; ny < NY; ++ny) { if (IsBoundary) { - if (nx * stride_nx >= size_nx) { + if ((thread_offset >= size_ny) || + (left_idx + nx * stride_nx >= size_nx)) { break; } } - uint32_t index_src = index_cal(thread_offset); + uint32_t index_src = index_cal(thread_offset + block_offset); dst[nx + ny * NX] = src[index_src]; thread_offset += stride_ny; } - thread_offset += stride_nx; } } } @@ -424,20 +414,19 @@ __device__ __forceinline__ void ReadDataReduce( * * @template paraments * T: The type of data. - * NX: The number of data continuously loaded by each thread. + * NX: The number of data continuously writed by each thread. * NY: The number of data rows loaded by each thread, only NY = 1 was supported. * BlockSize: Identifies the current device thread index method. For GPU, - * threadIdx.x is used as the thread index, and for xpu, core_id() is used as - * the index. Currently only GPU was supported. + * threadIdx.x is used as the thread index. Currently only GPU was supported. * IsBoundary: Indicates whether to perform block access storage out-of-bounds * judgment. When the number of data processed by the block is less than - * NX x NY x blockDim, boundary judgment is required to avoid memory access + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access * crossing the boundary. * * @param: - * dst: Data pointer of the current block. - * src: The register pointer of the thread, the size is NX * NY. - * size: The current block needs to load size data continuously. + * dst: The data pointer of the current block. + * src: The register pointer, the size is NX * NY. + * size: The current block needs to load size elements continuously. 
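The IsBoundary condition above ("less than NX x NY x blockDim.x") is, in practice, tail handling: with contiguous 1D processing only the last block of a launch can see fewer than a full tile of elements, so only it needs the guarded path. A small host-side sketch of that bookkeeping, with the tile and problem sizes invented:

    // Decide per block whether the boundary-checked (IsBoundary = true)
    // ReadData/WriteData variant is needed; all numbers are examples.
    #include <cstdio>

    int main() {
      const int NX = 4, NY = 1, block_dim_x = 128;
      const int tile = NX * NY * block_dim_x;      // 512 elements per block
      const int numel = 1000;                      // total elements to process
      const int grid = (numel + tile - 1) / tile;  // 2 blocks
      for (int b = 0; b < grid; ++b) {
        int block_offset = b * tile;
        int remaining = numel - block_offset;  // the "size" seen by this block
        bool boundary = remaining < tile;      // true only for the tail block
        std::printf("block %d: remaining=%d boundary=%d\n", b, remaining, boundary);
      }
      return 0;  // block 0: remaining=1000 boundary=0, block 1: remaining=488 boundary=1
    }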
*/ template __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, @@ -467,6 +456,165 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, } } +/** + * @brief Write 2D data from register to global memory according to Tx type, and + * store it as Ty type. + * + * @template paraments + * Tx: The type of data that needs to be stored in registers. + * Ty: The type of data that stored in the global memory. + * NX: The number of data columns loaded by each thread. + * NY: The number of data rows loaded by each thread. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The data pointer of the current block. + * src: The register pointer of the thread, the size is NX * NY. + * size_nx: The maximum offset of the current block is size_nx elements in the + * lowest dimension. The parameters are only calculated when isboundary = true. + * size_ny: The maximum offset of the current block is size_ny elements in the + * first dimension. The parameters are only calculated when isboundary = true. + * stride_nx: Each read one element stride stride_nx elements in the last dim. + * stride_ny: Each read one element stride stride_ny elements in the first dim. + */ +template +__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src, + int size_nx, int size_ny, + int stride_nx, int stride_ny) { + int thread_offset = threadIdx.x; + int left_size_nx = size_nx - thread_offset; + + // Each branch is added for better performance + if (NX == 1 && NY == 1) { // for NX == 1 and NY == 1 + if (IsBoundary) { + if (left_size_nx > 0) { + dst[thread_offset] = static_cast(src[0]); + } + } else { + dst[thread_offset] = static_cast(src[0]); + } + } else if (NX == 1) { // for NX == 1 and NY != 1 +#pragma unroll + for (int idy = 0; idy < NY; ++idy) { + if (IsBoundary) { + if (idy * stride_ny >= size_ny) { + break; + } + } + dst[thread_offset + idy * stride_ny] = static_cast(src[idy]); + } + } else if (NY == 1) { // for NY == 1 and NX != 1 +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (IsBoundary) { + if (idx * stride_nx >= left_size_nx) { + break; + } + } + dst[thread_offset + idx * stride_nx] = static_cast(src[idx]); + } + } else { // for NX != 1 and NY != 1 +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if (IsBoundary) { + if (idx * stride_nx >= left_size_nx) { + break; + } + } +#pragma unroll + for (int idy = 0; idy < NY; ++idy) { + if (IsBoundary) { + if (idy * stride_ny >= size_ny) { + break; + } + } + dst[thread_offset + idx * stride_nx + idy * stride_ny] = + static_cast(src[idy * NX + idx]); + } + } + } +} + +/** + * @brief Initialize register with init_data. + * + * @template paraments + * T: Data type of register. + * NX: Number of data to initialize. + * + * @param: + * dst: The register pointer of the thread, the size is NX. + * init_data: The register pointer of init data, the size is NX. 
+ */ +template +__device__ __forceinline__ void Init(T* dst, T* init_data, int num) { +#pragma unroll + for (int i = 0; i < NX; i++) { + if (IsBoundary) { + if (i >= num) { + break; + } + } + dst[i] = init_data[i]; + } +} + +/** + * @brief Read 1D data from global memory to register with broadcast form. + * + * @template paraments + * T: The type of data stored in the global memory. + * NX: The number of data continuously loaded by each thread. + * NY: The number of data rows loaded by each thread, only NY = 1 was supported. + * BlockSize: Identifies the current device thread index method. For GPU, + * threadIdx.x is used as the thread index. Currently only GPU was supported. + * Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2. + * IsBoundary: Indicates whether to perform block access storage out-of-bounds + * judgment. When the number of data processed by the block is less than + * NX x NY x blockDim.x, boundary judgment is required to avoid memory access + * crossing the boundary. + * + * @param: + * dst: The register pointer of the thread, the size is NX * NY. + * src: The original input data pointer of kernel. + * block_offset: The data offset of this block, blockDim.x * blockIdx.x * NX; + * config: Calculation configuration of broadcast. It is used to calculate the + * coordinate mapping relationship between output data and input data. + * total_num_output: Total number of original output. + */ +template +__device__ __forceinline__ void ReadDataBc( + T* dst, const T* __restrict__ src, uint32_t block_offset, + details::BroadcastConfig config, int total_num_output) { + uint32_t thread_offset = block_offset + threadIdx.x * NX; + uint32_t index_src = 0; + +#pragma unroll + for (uint32_t nx = 0; nx < NX; ++nx) { + uint32_t index_output = thread_offset + nx; + index_src = 0; + if (IsBoundary) { + if (index_output >= total_num_output) { + break; + } + } +#pragma unroll + for (int i = 0; i < Rank; ++i) { + auto fast_divmoder = config.divmoders[i].Divmod(index_output); + index_output = fast_divmoder.val[0]; + index_src += fast_divmoder.val[1] * config.strides[i]; + } + dst[nx] = src[index_src]; + } +} + } // namespace kernel_primitives } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h new file mode 100644 index 00000000000000..fcfcdc28b1f009 --- /dev/null +++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h @@ -0,0 +1,230 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +namespace paddle { +namespace operators { +namespace kernel_primitives { +namespace details { + +static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) { + return ::Eigen::numext::exp(x); +} + +static __device__ __forceinline__ float Exp(float x) { return expf(x); } + +static __device__ __forceinline__ double Exp(double x) { return exp(x); } + +static __device__ __forceinline__ platform::float16 Log(platform::float16 x) { + return ::Eigen::numext::log(x); +} + +static __device__ __forceinline__ float Log(float x) { return logf(x); } + +static __device__ __forceinline__ double Log(double x) { return log(x); } + +} // namespace details + +/******************************** Unary Functor *******************************/ + +/** + * @brief Default unary exp functor + */ +template +struct ExpFunctor { + HOSTDEVICE inline ExpFunctor() {} + + HOSTDEVICE explicit inline ExpFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(details::Exp(x)); + } +}; + +/** + * @brief Default unary identity functor + */ +template +struct IdentityFunctor { + HOSTDEVICE inline IdentityFunctor() {} + + HOSTDEVICE explicit inline IdentityFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x); + } +}; + +/** + * @brief Default unary div functor. Divide by a constant + */ +template +struct DivideFunctor { + HOSTDEVICE inline DivideFunctor() { n_inv = static_cast(1.0f); } + + HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((Tx)(1.0 / n)) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x * n_inv); + } + + private: + Tx n_inv; +}; + +/** + * @brief Default unary square functor + */ +template +struct SquareFunctor { + HOSTDEVICE inline SquareFunctor() {} + + HOSTDEVICE explicit inline SquareFunctor(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(x) * static_cast(x); + } +}; + +/****************************** Binary Functor ********************************/ + +/** + * @brief Default binary min functor + */ +template +struct MinFunctor { + inline T initial() { return static_cast(std::numeric_limits::max()); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (b < a) ? b : a; + } +}; + +/** + * @brief Default binary max functor + */ +template +struct MaxFunctor { + inline T initial() { + return static_cast(std::numeric_limits::lowest()); + } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return (b > a) ? 
b : a; + } +}; + +/** + * @brief Default binary add functor + */ +template +struct AddFunctor { + inline T initial() { return static_cast(0.0f); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b + a; + } +}; + +/** + * @brief Default binary add functor + */ +template +struct MulFunctor { + inline T initial() { return static_cast(1.0f); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b * a; + } +}; + +/** + * @brief Default binary logic or functor + */ +template +struct LogicalOrFunctor { + inline T initial() { return static_cast(false); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b || a; + } +}; + +/** + * @brief Default binary logic and functor + */ +template +struct LogicalAndFunctor { + inline T initial() { return static_cast(true); } + + __device__ __forceinline__ T operator()(const T& a, const T& b) const { + return b && a; + } +}; + +/** + * @brief Default binary sub functor + */ +template +struct SubFunctor { + inline T initial() { return static_cast(0.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a - b; } +}; + +/** + * @brief Default binary div functor + */ +template +struct DivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; } +}; + +template +struct DivFunctor::value>::type> { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + // For int32/int64, need to check whether the divison is zero. + PADDLE_ENFORCE_NE(b, 0, + platform::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. Please check the input value.")); + return a / b; + } +}; + +/** + * @brief Default binary floor divide functor + */ +template +struct FloorDivFunctor { + inline T initial() { return static_cast(1.0f); } + + inline HOSTDEVICE T operator()(const T& a, const T& b) const { + PADDLE_ENFORCE_NE(b, 0, + platform::errors::InvalidArgument( + "Integer division by zero encountered " + "in (floor) divide. Please check the input value.")); + return static_cast(std::trunc(a / b)); + } +}; + +} // namespace kernel_primitives +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h index 45ee4fd738174b..9a4f8bb026b9da 100644 --- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h @@ -16,6 +16,7 @@ #include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" #include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h" +#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" #include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" namespace paddle { diff --git a/paddle/fluid/operators/kldiv_loss_op_npu.cc b/paddle/fluid/operators/kldiv_loss_op_npu.cc new file mode 100644 index 00000000000000..7d7cdd4c786712 --- /dev/null +++ b/paddle/fluid/operators/kldiv_loss_op_npu.cc @@ -0,0 +1,163 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. */ + +#include "paddle/fluid/operators/kldiv_loss_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class KLDivLossNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* target = ctx.Input("Target"); + auto* loss = ctx.Output("Loss"); + auto reduction = ctx.Attr("reduction"); + loss->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); + + if ("none" == reduction) { + // log(label) + auto ones_tensor = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& ones_runner = + NpuOpRunner("OnesLike", {*target}, {ones_tensor}, {}); + ones_runner.Run(stream); + + auto sub_tensor = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& sub_runner = + NpuOpRunner("Sub", {*target, ones_tensor}, {sub_tensor}, {}); + sub_runner.Run(stream); + + auto log_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& log_runner = + NpuOpRunner("Log1p", {sub_tensor}, {log_target}, {}); + log_runner.Run(stream); + + // log(label) - input + const auto& sub_runner2 = + NpuOpRunner("Sub", {log_target, *input}, {*loss}, {}); + sub_runner2.Run(stream); + + // label * (log(label) - input) + auto min_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + auto max_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + FillNpuTensorWithConstant(&min_value, static_cast(0)); + FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); + + auto cliped_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& clip_runner = NpuOpRunner( + "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); + clip_runner.Run(stream); + + const auto& mul_runner = + NpuOpRunner("Mul", {*loss, cliped_target}, {*loss}, {}); + mul_runner.Run(stream); + } else if ("batchmean" == reduction || "sum" == reduction) { + const auto& runner = NpuOpRunner("KLDiv", {*input, *target}, {*loss}, + {{"reduction", reduction}}); + runner.Run(stream); + } else if ("mean" == reduction) { + const auto& runner = NpuOpRunner("KLDiv", {*input, *target}, {*loss}, + {{"reduction", std::string("sum")}}); + runner.Run(stream); + + const int numel = input->numel(); + const auto& muls_runner = + NpuOpRunner("Muls", {*loss}, {*loss}, + {{"value", static_cast(1.0 / numel)}}); + muls_runner.Run(stream); + } + } +}; + +template +class KLDivLossGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* target = ctx.Input("Target"); + auto* loss_grad = ctx.Input(framework::GradVarName("Loss")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto reduction = ctx.Attr("reduction"); + input_grad->mutable_data(ctx.GetPlace()); + + auto& dev_ctx = ctx.template device_context(); + auto stream = dev_ctx.stream(); + + Tensor loss_grad_transformed; + if ("none" == reduction) { + loss_grad_transformed.ShareDataWith(*loss_grad); + } else { + 
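      // Reductions other than "none" shrink Loss, so its incoming gradient has
      // fewer elements than X; broadcast it back to X's shape first so the
      // elementwise chain-rule multiply with the clipped target (and the final
      // scaling by k below) can run per element.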
loss_grad_transformed.mutable_data(input_grad->dims(), ctx.GetPlace()); + + NpuOpRunner broadcast_runner; + broadcast_runner.SetType("BroadcastTo"); + broadcast_runner.AddInput(*loss_grad); + broadcast_runner.AddInput(framework::vectorize(input_grad->dims())); + broadcast_runner.AddOutput(loss_grad_transformed); + broadcast_runner.Run(stream); + } + auto min_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + auto max_value = + ctx.AllocateTmpTensor({1}, dev_ctx); + FillNpuTensorWithConstant(&min_value, static_cast(0)); + FillNpuTensorWithConstant(&max_value, std::numeric_limits::max()); + + auto cliped_target = ctx.AllocateTmpTensor( + target->dims(), dev_ctx); + const auto& clip_runner = NpuOpRunner( + "ClipByValue", {*target, min_value, max_value}, {cliped_target}, {}); + clip_runner.Run(stream); + + const auto& mul_runner = NpuOpRunner( + "Mul", {cliped_target, loss_grad_transformed}, {*input_grad}, {}); + mul_runner.Run(stream); + + float k = -1.0f; + + if ("mean" == reduction) { + k = static_cast(-1.0 / input_grad->numel()); + } else if ("batchmean" == reduction) { + k = static_cast(-1.0 / input_grad->dims()[0]); + } + + const auto& muls_runner = + NpuOpRunner("Muls", {*input_grad}, {*input_grad}, {{"value", k}}); + muls_runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(kldiv_loss, ops::KLDivLossNPUKernel, + ops::KLDivLossNPUKernel); + +REGISTER_OP_NPU_KERNEL(kldiv_loss_grad, ops::KLDivLossGradNPUKernel, + ops::KLDivLossGradNPUKernel); diff --git a/paddle/fluid/operators/lite/lite_engine_op_test.cc b/paddle/fluid/operators/lite/lite_engine_op_test.cc index 8b7f1268081343..053ba322d8f4de 100644 --- a/paddle/fluid/operators/lite/lite_engine_op_test.cc +++ b/paddle/fluid/operators/lite/lite_engine_op_test.cc @@ -105,15 +105,16 @@ TEST(LiteEngineOp, engine_op) { engine_op_desc.SetAttr("use_gpu", true); engine_op_desc.SetAttr("zero_copy", true); engine_op_desc.SetBlockAttr("sub_block", &block_desc); - inference::Singleton::Global().Create( - engine_key, config); - LOG(INFO) << "create engine op"; - auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); - LOG(INFO) << "engine_op " << engine_op.get(); - // Execute them. - LOG(INFO) << "engine_op run"; - engine_op->Run(scope, place); - LOG(INFO) << "done"; + // TODO(wilber): The ut is out of date, we need to a new lite subgraph test. + // inference::Singleton::Global().Create( + // engine_key, config); + // LOG(INFO) << "create engine op"; + // auto engine_op = framework::OpRegistry::CreateOp(engine_op_desc); + // LOG(INFO) << "engine_op " << engine_op.get(); + // // Execute them. + // LOG(INFO) << "engine_op run"; + // engine_op->Run(scope, place); + // LOG(INFO) << "done"; } #endif diff --git a/paddle/fluid/operators/load_combine_op.h b/paddle/fluid/operators/load_combine_op.h index 589df8821b3e7f..a02b0e61d9278e 100644 --- a/paddle/fluid/operators/load_combine_op.h +++ b/paddle/fluid/operators/load_combine_op.h @@ -21,6 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" +#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/platform/device_context.h" namespace paddle { @@ -75,38 +77,57 @@ class LoadCombineOpKernel : public framework::OpKernel { out_vars[i], platform::errors::InvalidArgument( "The variable %s to be loaded cannot be found.", out_var_names[i])); - - auto *tensor = out_vars[i]->GetMutable(); - // Error checking PADDLE_ENFORCE_EQ( static_cast(*buffer), true, platform::errors::Unavailable( "An error occurred while loading model parameters. " "Please check whether the model file is complete or damaged.")); - - // Get data from fin to tensor - DeserializeFromStream(*buffer, tensor, dev_ctx); - - auto in_dtype = tensor->type(); - auto out_dtype = - load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; - - if (in_dtype != out_dtype) { - // convert to float16 tensor - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor fp16_tensor; - // copy LoD info to the new tensor - fp16_tensor.set_lod(tensor->lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, - &fp16_tensor); - - // reset output tensor - out_vars[i]->Clear(); - tensor = out_vars[i]->GetMutable(); - tensor->set_lod(fp16_tensor.lod()); - tensor->ShareDataWith(fp16_tensor); + if (out_vars[i]->IsType()) { + auto *tensor = out_vars[i]->GetMutable(); + tensor->clear(); + std::unordered_map data; + framework::StringMapFromStream(*buffer, &data); + for (auto it = data.begin(); it != data.end(); ++it) { + std::string tmp; + framework::NFD(it->first, &tmp); + if (tmp.empty()) { + VLOG(0) << "The string " << it->first + << " was converted to unicode failedly! " + << "Then dropped to load it."; + continue; + } + std::wstring token; + bool status = framework::ConvertStrToWstr(tmp, &token); + if (!status) continue; + tensor->emplace(token, it->second); + } + } else { + auto *tensor = out_vars[i]->GetMutable(); + + // Get data from fin to tensor + DeserializeFromStream(*buffer, tensor, dev_ctx); + + auto in_dtype = tensor->type(); + auto out_dtype = + load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + + if (in_dtype != out_dtype) { + // convert to float16 tensor + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor fp16_tensor; + // copy LoD info to the new tensor + fp16_tensor.set_lod(tensor->lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, *tensor, + &fp16_tensor); + + // reset output tensor + out_vars[i]->Clear(); + tensor = out_vars[i]->GetMutable(); + tensor->set_lod(fp16_tensor.lod()); + tensor->ShareDataWith(fp16_tensor); + } } } buffer->peek(); diff --git a/paddle/fluid/operators/log_loss_op_npu.cc b/paddle/fluid/operators/log_loss_op_npu.cc index a8d906d4b5cad8..74b44165dcc4c1 100644 --- a/paddle/fluid/operators/log_loss_op_npu.cc +++ b/paddle/fluid/operators/log_loss_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. 
*/ +limitations under the License. */ #include "paddle/fluid/operators/log_loss_op.h" #include diff --git a/paddle/fluid/operators/lookup_table_v2_op_npu.cc b/paddle/fluid/operators/lookup_table_v2_op_npu.cc index 387cd92b69f923..3cb91c712335d6 100644 --- a/paddle/fluid/operators/lookup_table_v2_op_npu.cc +++ b/paddle/fluid/operators/lookup_table_v2_op_npu.cc @@ -21,6 +21,9 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +constexpr int64_t kNoPadding = -1; + template class LookupTableV2NPUKernel : public framework::OpKernel { public: @@ -35,13 +38,52 @@ class LookupTableV2NPUKernel : public framework::OpKernel { platform::errors::InvalidArgument("npu only accept LoDTensor")); output_t->mutable_data(ctx.GetPlace()); - NpuOpRunner runner; - runner.SetType("GatherV2") - .AddInput(*table_t) - .AddInput(*ids_t) - .AddInput(std::vector{0}) - .AddOutput(*output_t); - runner.Run(); + int64_t padding_idx = ctx.Attr("padding_idx"); + if (padding_idx == kNoPadding) { + NpuOpRunner runner; + runner.SetType("GatherV2") + .AddInput(*table_t) + .AddInput(*ids_t) + .AddInput(std::vector{0}) +#if (CANN_VERSION_CODE >= 503003) + .AddAttrs({{"batch_dims", 0}}) +#endif + .AddOutput(*output_t); + runner.Run(); + } else { + Tensor tmp_table_t(table_t->type()); + tmp_table_t.mutable_data(table_t->dims(), ctx.GetPlace()); + + Tensor index; + index.mutable_data({1, 1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&index, + static_cast(padding_idx)); + + auto updata_dim = framework::make_ddim({1, table_t->dims()[1]}); + Tensor update; + update.mutable_data(updata_dim, ctx.GetPlace()); + FillNpuTensorWithConstant(&update, static_cast(0)); + update.Resize(updata_dim); + + NpuOpRunner update_runner; + update_runner.SetType("TensorScatterUpdate") + .AddInput(*table_t) + .AddInput(index) + .AddInput(update) + .AddOutput(tmp_table_t); + update_runner.Run(); + + NpuOpRunner runner; + runner.SetType("GatherV2") + .AddInput(tmp_table_t) + .AddInput(*ids_t) + .AddInput(std::vector{0}) +#if (CANN_VERSION_CODE >= 503003) + .AddAttrs({{"batch_dims", 0}}) +#endif + .AddOutput(*output_t); + runner.Run(); + } } }; diff --git a/paddle/fluid/operators/math/matrix_solve.h b/paddle/fluid/operators/math/matrix_solve.h index 93c37ae425640f..415d0c6dd8e0cf 100644 --- a/paddle/fluid/operators/math/matrix_solve.h +++ b/paddle/fluid/operators/math/matrix_solve.h @@ -70,6 +70,46 @@ void compute_solve_eigen(const DeviceContext& context, } } +// only used for complex input +template +void SolveLinearSystem(T* matrix_data, T* rhs_data, T* out_data, int order, + int rhs_cols, int batch) { + using Treal = typename Eigen::NumTraits::Real; + + // cast paddle::complex into std::complex + std::complex* matrix_data_ = + reinterpret_cast*>(matrix_data); + std::complex* rhs_data_ = + reinterpret_cast*>(rhs_data); + std::complex* out_data_ = + reinterpret_cast*>(out_data); + + using Matrix = Eigen::Matrix, Eigen::Dynamic, + Eigen::Dynamic, Eigen::RowMajor>; + using InputMatrixMap = Eigen::Map; + using OutputMatrixMap = Eigen::Map; + + for (int i = 0; i < batch; ++i) { + auto input_matrix = + InputMatrixMap(matrix_data_ + i * order * order, order, order); + auto input_rhs = + InputMatrixMap(rhs_data_ + i * order * rhs_cols, order, rhs_cols); + auto output = + OutputMatrixMap(out_data_ + i * order * rhs_cols, order, rhs_cols); + + Eigen::PartialPivLU lu_decomposition(order); + lu_decomposition.compute(input_matrix); + + const Treal min_abs_piv = + 
lu_decomposition.matrixLU().diagonal().cwiseAbs().minCoeff(); + PADDLE_ENFORCE_GT(min_abs_piv, Treal(0), + platform::errors::InvalidArgument( + "Something's wrong with SolveLinearSystem. ")); + + output = lu_decomposition.solve(input_rhs); + } +} + template class MatrixSolveFunctor { public: diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 48b0d2ab460571..84a970a9a26067 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -979,6 +979,49 @@ __global__ void KernelMaxPool3DGrad( } } +template +void Pool3dDirectCUDAFunctor::operator()( + const T* input, const std::vector& input_shape, + const std::vector& output_shape, const std::vector& ksize, + const std::vector& strides, const std::vector& paddings, + bool exclusive, bool adaptive, T* output, gpuStream_t stream, + PoolProcess pool_compute) { + const int batch_size = input_shape[0]; + const int input_channels = input_shape[1]; + const int input_depth = input_shape[2]; + const int input_height = input_shape[3]; + const int input_width = input_shape[4]; + const int output_channels = output_shape[1]; + const int output_depth = output_shape[2]; + const int output_height = output_shape[3]; + const int output_width = output_shape[4]; + const int ksize_depth = ksize[0]; + const int ksize_height = ksize[1]; + const int ksize_width = ksize[2]; + const int stride_depth = strides[0]; + const int stride_height = strides[1]; + const int stride_width = strides[2]; + const int padding_depth = paddings[0]; + const int padding_height = paddings[1]; + const int padding_width = paddings[2]; + + int nthreads = batch_size * output_channels * output_depth * output_height * + output_width; + int thread_num = 1024; +#ifdef WITH_NV_JETSON + thread_num = 512; +#endif + int blocks = (nthreads + thread_num - 1) / thread_num; + dim3 threads(thread_num, 1); + dim3 grid(blocks, 1); + + KernelPool3D<<>>( + nthreads, input, input_channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, ksize_depth, ksize_height, + ksize_width, stride_depth, stride_height, stride_width, padding_depth, + padding_height, padding_width, pool_compute, exclusive, adaptive, output); +} + /* * Tensors are in NCDHW or NDHWC format. * Ksize, strides, paddings are three elements. These three elements represent @@ -1315,6 +1358,11 @@ class MaxPool3dGradFunctor { } }; +template class Pool3dDirectCUDAFunctor, + float>; +template class Pool3dDirectCUDAFunctor, + float>; + template class MaxPool3dGradFunctor; template class MaxPool3dGradFunctor; template class MaxPool3dGradFunctor +class Pool3dDirectCUDAFunctor { + public: + void operator()(const T* input, const std::vector& input_shape, + const std::vector& output_shape, + const std::vector& ksize, + const std::vector& strides, + const std::vector& paddings, bool exclusive, + bool adaptive, T* output, gpuStream_t stream, + PoolProcess pool_compute); +}; +#endif + template class Pool3dFunctor { public: diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 4e435660ff6dc4..051f97ad4ec8de 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -336,6 +336,8 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, "The Input(%s) has not been initialized properly. 
The " "shape of Input(%s) = [%s].", dim)); + + // if mkldnn reshape+transpose+matmul fuse activated if (!shape.empty() && !axis.empty()) { PADDLE_ENFORCE_GE( shape.size(), 2, @@ -355,6 +357,43 @@ framework::DDim GetDimForInput(const framework::InferShapeContext &ctx, "Ranks of shape_%s and axis_%s attributes of MatMulOp " "must be equal.", input_name, input_name)); + + int num_negative = std::count(shape.begin(), shape.end(), -1); + PADDLE_ENFORCE_LE(num_negative, 1, + platform::errors::InvalidArgument( + "The max number of -1 in fused_reshape_%s is 1 " + "but received %d.", + input_name, num_negative)); + + auto it_zero = std::find(shape.begin(), shape.end(), 0); + if (it_zero != shape.end()) { + for (uint64_t i = 0; i < shape.size(); i++) { + if (shape[i] == 0) { + PADDLE_ENFORCE_LT(i, dim.size(), + platform::errors::InvalidArgument( + "The index of 0 in fused_reshape_%s ", + "should be less than output dim size, ", + "but the index is %d and output dim size is %d", + input_name, i, dim.size())); + shape[i] = dim.at(i); + } + } + } + + // if "-1" is present then one of reshape dims must be infered + auto it_negative = std::find(shape.begin(), shape.end(), -1); + if (it_negative != shape.end()) { + int64_t dim_product = 1; + for (int i = 0; i < dim.size(); i++) { + dim_product *= dim.at(i); + } + + int64_t shape_product = std::accumulate(shape.begin(), shape.end(), -1, + std::multiplies()); + int index = std::distance(shape.begin(), it_negative); + shape[index] = dim_product / shape_product; + } + dim = dim.reshape(shape).transpose(axis); } return dim; diff --git a/paddle/fluid/operators/matmul_op_npu.cc b/paddle/fluid/operators/matmul_op_npu.cc index d5606177a55926..df811abc1de98b 100644 --- a/paddle/fluid/operators/matmul_op_npu.cc +++ b/paddle/fluid/operators/matmul_op_npu.cc @@ -12,8 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/npu_op_runner.h" @@ -21,40 +19,253 @@ limitations under the License. 
*/ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void Mul(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, const Tensor& Y, + Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {*Out}, {}); + runner_dx.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {X, Y}, {Out_temp}, {}); + runner_dx.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void Dot(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, const Tensor& Y, + Tensor* Out, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = NpuOpRunner("Dot", {X, Y}, {*Out}); + runner.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = NpuOpRunner("Dot", {X, Y}, {Out_temp}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void MatMul2D(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = + NpuOpRunner("MatMul", {X, Y}, {*Out}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + runner.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = + NpuOpRunner("MatMul", {X, Y}, {Out_temp}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y, const float alpha) { + Out->mutable_data(ctx.GetPlace()); + + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + const auto& runner = + NpuOpRunner("BatchMatMul", {X, Y}, {*Out}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); + } else { + Tensor Out_temp(Out->type()); + Out_temp.mutable_data(Out->dims(), ctx.GetPlace()); + const auto& out_temp_runner = + NpuOpRunner("BatchMatMul", {X, Y}, {Out_temp}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + out_temp_runner.Run(stream); + + const auto& runner = + NpuOpRunner("Muls", {Out_temp}, {*Out}, {{"value", alpha}}); + runner.Run(stream); + } +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const std::vector& dims, + const std::vector& brd_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = brd_dims.size(); + int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (brd_dims[i] > dims[i - 
diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + template class MatMulNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); + float alpha = static_cast(ctx.Attr("alpha")); + + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - if (x->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); + auto stream = ctx.template device_context().stream(); - const auto& runner = NpuOpRunner( - "MatMul", {*x, *y}, {*out}, - {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ( + X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers must be equal to Y's numbers," + "when X/Y's dims =1. But received X has [%d] elements," + "received Y has [%d] elements", + X->numel(), Y->numel())); + Out->Resize({1}); + Dot(ctx, stream, *X, *Y, Out, alpha); + return; + } - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + y_ndim = 2; + out_ndim += 1; + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (transpose_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." 
+ "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (x_ndim == 2 && y_ndim == 2) { + MatMul2D(ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, + alpha); + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (transpose_x == false && y_ndim == 2) { + std::vector vec_dim = {x_temp.numel() / K, K}; + x_temp.Resize(framework::make_ddim(vec_dim)); + MatMul2D(ctx, stream, x_temp, y_temp, Out, transpose_x, transpose_y, + alpha); + return; + } - } else if (x->dims().size() > 2) { - out->mutable_data(ctx.GetPlace()); + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); - const auto& runner = - NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, - {{"adj_x1", transpose_x}, {"adj_x2", transpose_y}}); + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); } + MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, transpose_x, + transpose_y, alpha); } }; @@ -62,109 +273,200 @@ template class MatMulGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + bool transpose_x = ctx.Attr("transpose_X"); bool transpose_y = ctx.Attr("transpose_Y"); - auto stream = - ctx.template device_context() - .stream(); - - if (x->dims().size() == 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*dout, *x}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + 
float alpha = static_cast(ctx.Attr("alpha")); - runner_dy.Run(stream); - } + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); + auto stream = ctx.template device_context().stream(); - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + Tensor dout_temp(dOut->type()); + dout_temp.Resize(X->dims()); + dout_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("BroadcastTo") + .AddInput(*dOut) + .AddInput(std::move(x_dims)) + .AddOutput(dout_temp) + .Run(stream); + + if (dX) { + Mul(ctx, stream, dout_temp, *Y, dX, alpha); + } + if (dY) { + Mul(ctx, stream, dout_temp, *X, dY, alpha); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } - runner_dy.Run(stream); + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(framework::make_ddim(x_dims)); + if (transpose_x) { + MatMul2D(ctx, stream, y_temp, dout_temp, dX, transpose_y, true, + alpha); + } else { + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, + alpha); } + dX->Resize(X->dims()); } - } else if (x->dims().size() > 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); - - runner_dx.Run(stream); + if (dY) { + dY->Resize(framework::make_ddim(y_dims)); + if (transpose_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, transpose_x, + alpha); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, !transpose_x, false, + alpha); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + dY->Resize(Y->dims()); + } + return; + } + + const int K = transpose_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + const int N = transpose_y ? 
y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - runner_dy.Run(stream); + // Case 3: [B, M, K] x [K, N] = [B, M, N], when transpose_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (transpose_x == false && y_ndim == 2) { + std::vector x_vec_dim = {x_temp.numel() / K, K}; + dout_temp.Resize( + framework::make_ddim(std::vector{dout_temp.numel() / N, N})); + if (dX) { + dX->Resize(framework::make_ddim(x_vec_dim)); + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !transpose_y, + alpha); + dX->Resize(X->dims()); + } + if (dY) { + x_temp.Resize(framework::make_ddim(x_vec_dim)); + if (transpose_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false, alpha); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false, alpha); } - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + } + return; + } - runner_dx.Run(stream); + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } + + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + + if (dX) { + if (x_dims == x_broadcast_dims) { + if (transpose_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, transpose_y, true, + alpha); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, + !transpose_y, alpha); + } + } else { + Tensor dx_temp(X->type()); + dx_temp.Resize(framework::make_ddim(x_broadcast_dims)); + if (transpose_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, &dx_temp, transpose_y, + true, alpha); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, + !transpose_y, alpha); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if ((x->dims().size() == 3) && (dout->dims().size() == 3) && - (dy->dims().size() == 2)) { - framework::Tensor dout_tmp; - dout_tmp.ShareDataWith(*dout); - std::vector vec_dim = - framework::vectorize(dout_tmp.dims()); - std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - - framework::Tensor x_tmp; - x_tmp.ShareDataWith(*x); - std::vector vec_dim_x = - framework::vectorize(x_tmp.dims()); - std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], - vec_dim_x[2]}; - x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); - const auto& runner_dy = - NpuOpRunner("MatMul", 
{x_tmp, dout_tmp}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } else { - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); - } + ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); + } + } + if (dY) { + if (y_dims == y_broadcast_dims) { + if (transpose_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, transpose_x, + alpha); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !transpose_x, + false, alpha); + } + } else { + Tensor dy_temp(Y->type()); + dy_temp.Resize(framework::make_ddim(y_broadcast_dims)); + if (transpose_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, + transpose_x, alpha); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, &dy_temp, + !transpose_x, false, alpha); } + ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); } } } diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 953c3a555fa4b7..1b609b15d6e569 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -90,8 +90,62 @@ class MatMulV2Op : public framework::OperatorWithKernel { new_dims.push_back(1); } - auto out_dims = framework::make_ddim(new_dims); - ctx->SetOutputDim("Out", out_dims); + auto ddim_out = framework::make_ddim(new_dims); + +#ifdef PADDLE_WITH_MKLDNN + // if mkldnn matmul_v2+transpose+reshape fuse activated + auto reshape_out = ctx->Attrs().Get>("fused_reshape_Out"); + auto transpose_out = + ctx->Attrs().Get>("fused_transpose_Out"); + + if (!reshape_out.empty() && !transpose_out.empty()) { + auto reshape_out_size = reshape_out.size(); + auto transpose_out_size = transpose_out.size(); + PADDLE_ENFORCE_EQ(transpose_out_size, 4, + platform::errors::InvalidArgument( + "transpose_out supported rank is 4, " + "received %d", + transpose_out_size)); + const std::vector supported_axis{0, 2, 1, 3}; + const bool supported_transpose_axis = std::equal( + transpose_out.begin(), transpose_out.end(), supported_axis.begin()); + PADDLE_ENFORCE_EQ( + supported_transpose_axis, true, + platform::errors::InvalidArgument( + "supported transpose axis for the fuse are {0, 2, 1, 3}")); + PADDLE_ENFORCE_EQ( + reshape_out_size, 3, + platform::errors::InvalidArgument("reshape_out supported rank is 3, " + "received %d", + reshape_out_size)); + + auto it = std::find(reshape_out.begin(), reshape_out.end(), -1); + + // if "-1" is present then one of reshape dims must be infered + if (it != reshape_out.end()) { + int index = std::distance(reshape_out.begin(), it); + + auto ddim_out_vec = framework::vectorize(ddim_out); + + int ddim_out_product = + std::accumulate(ddim_out_vec.begin(), ddim_out_vec.end(), 1, + std::multiplies()); + int reshape_out_product = std::accumulate( + reshape_out.begin(), reshape_out.end(), -1, std::multiplies()); + + reshape_out[index] = ddim_out_product / reshape_out_product; + } + + framework::DDim shape_out = + ddim_out.transpose(transpose_out).reshape(reshape_out); + ctx->SetOutputDim("Out", shape_out); + } else { + ctx->SetOutputDim("Out", ddim_out); + } +#else + ctx->SetOutputDim("Out", ddim_out); +#endif + ctx->ShareLoD("X", /* --> */ "Out"); } @@ -139,6 +193,18 @@ class MatMulV2OpMaker : public framework::OpProtoAndCheckerMaker { "Set true to transpose the last two dimensions of Y before " "doing multiplication") .SetDefault(false); + AddAttr>( + "fused_reshape_Out", + R"DOC(When MKLDNN 
matmul_v2_transpose_reshape fuse activated, " + "it's a shape atribute of fused reshape for `Out` output.)DOC") + .SetDefault({}) + .AsExtra(); + AddAttr>( + "fused_transpose_Out", + R"DOC(When MKLDNN matmul_v2_transpose_reshape fuse activated, " + "it's a axis atribute of fused transpose for `Out` output.)DOC") + .SetDefault({}) + .AsExtra(); AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false) diff --git a/paddle/fluid/operators/matmul_v2_op_npu.cc b/paddle/fluid/operators/matmul_v2_op_npu.cc index b23b408e9c59a7..6d7e8f3478c848 100644 --- a/paddle/fluid/operators/matmul_v2_op_npu.cc +++ b/paddle/fluid/operators/matmul_v2_op_npu.cc @@ -21,166 +21,387 @@ limitations under the License. */ namespace paddle { namespace operators { -template +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template +static void MatMul2D(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = + NpuOpRunner("MatMul", {X, Y}, {*Out}, + {{"transpose_x1", trans_x}, {"transpose_x2", trans_y}}); + runner.Run(stream); +} + +template +static void MatMulND(const framework::ExecutionContext& ctx, + const aclrtStream& stream, const Tensor& X, + const Tensor& Y, Tensor* Out, const bool trans_x, + const bool trans_y) { + Out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("BatchMatMul", {X, Y}, {*Out}, + {{"adj_x1", trans_x}, {"adj_x2", trans_y}}); + runner.Run(stream); +} + +template +static void ReduceDims(const framework::ExecutionContext& ctx, + const aclrtStream& stream, + const std::vector& dims, + const std::vector& brd_dims, const Tensor& in, + Tensor* out) { + std::vector axes; + int64_t size = brd_dims.size(); + int64_t diff = brd_dims.size() - dims.size(); + for (int64_t i = 0; i < size; ++i) { + if (i < diff) { + axes.push_back(i); + continue; + } + if (brd_dims[i] > dims[i - diff]) { + axes.push_back(i); + } + } + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("ReduceSumD", {in}, {*out}, + {{"axes", axes}, {"keep_dims", false}}); + runner.Run(stream); +} + +template class MatMulV2NPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - bool transpose_x = ctx.Attr("trans_x"); - bool transpose_y = ctx.Attr("trans_y"); - - if (x->dims().size() == 2) { - out->mutable_data(ctx.GetPlace()); - - const auto& runner = NpuOpRunner( - "MatMul", {*x, *y}, {*out}, - {{"transpose_x1", transpose_x}, {"transpose_x2", transpose_y}}); - - auto stream = - ctx.template device_context() - .stream(); - runner.Run(stream); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* Out = ctx.Output("Out"); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); + + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(Out->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else if (x->dims().size() > 2) { - out->mutable_data(ctx.GetPlace()); + auto stream = ctx.template device_context().stream(); - const auto& runner = - NpuOpRunner("BatchMatMul", {*x, *y}, {*out}, - {{"adj_x1", transpose_x}, {"adj_x2", 
transpose_y}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + PADDLE_ENFORCE_EQ( + X->numel(), Y->numel(), + platform::errors::InvalidArgument( + "X's numbers must be equal to Y's numbers," + "when X/Y's dims =1. But received X has [%d] elements," + "received Y has [%d] elements", + X->numel(), Y->numel())); + Out->Resize({1}); + Out->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context() - .stream(); + const auto& runner = NpuOpRunner("Dot", {*X, *Y}, {*Out}); runner.Run(stream); + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + y_ndim = 2; + out_ndim += 1; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + if (trans_y) { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 1], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 1, K, y_ndim - 1, y_dims[y_ndim - 1])); + } else { + PADDLE_ENFORCE_EQ(y_dims[y_ndim - 2], K, + platform::errors::InvalidArgument( + "Input(Y) has error dim." + "Y'dims[%d] must be equal to %d" + "But received Y'dims[%d] is %d", + y_ndim - 2, K, y_ndim - 2, y_dims[y_ndim - 2])); } + + // Case 2: [M, K] x [K, N] = [M, N] + if (x_ndim == 2 && y_ndim == 2) { + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector vec_dim = {x_temp.numel() / K, K}; + x_temp.Resize(framework::make_ddim(vec_dim)); + MatMul2D(ctx, stream, x_temp, y_temp, Out, trans_x, trans_y); + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } + + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + MatMulND(ctx, stream, x_temp_brd, y_temp_brd, Out, trans_x, trans_y); } }; -template +template class MatMulV2GradNPUKernel : public framework::OpKernel { public: void Compute(const 
framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - bool transpose_y = ctx.Attr("trans_y"); - auto stream = - ctx.template device_context() - .stream(); - - if (x->dims().size() == 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", false}}); - - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*dout, *x}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + auto* X = ctx.Input("X"); + auto* Y = ctx.Input("Y"); + auto* dOut = ctx.Input(framework::GradVarName("Out")); + auto* dX = ctx.Output(framework::GradVarName("X")); + auto* dY = ctx.Output(framework::GradVarName("Y")); + const bool trans_x = ctx.Attr("trans_x"); + const bool trans_y = ctx.Attr("trans_y"); - runner_dy.Run(stream); - } + std::vector x_dims = framework::vectorize(X->dims()); + std::vector y_dims = framework::vectorize(Y->dims()); + std::vector out_dims = framework::vectorize(dOut->dims()); + int x_ndim = x_dims.size(); + int y_ndim = y_dims.size(); + int out_ndim = out_dims.size(); - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); + auto stream = ctx.template device_context().stream(); - runner_dx.Run(stream); - } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + // Case 1: [K] x [K] = [1] + if (x_ndim == 1 && y_ndim == 1) { + Tensor dout_temp(dOut->type()); + dout_temp.Resize(X->dims()); + dout_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner; + runner.SetType("BroadcastTo") + .AddInput(*dOut) + .AddInput(std::move(x_dims)) + .AddOutput(dout_temp) + .Run(stream); - runner_dy.Run(stream); + if (dX) { + dX->mutable_data(ctx.GetPlace()); + const auto& runner_dx = NpuOpRunner("Mul", {dout_temp, *Y}, {*dX}, {}); + runner_dx.Run(stream); + } + if (dY) { + dY->mutable_data(ctx.GetPlace()); + const auto& runner_dy = NpuOpRunner("Mul", {dout_temp, *X}, {*dY}, {}); + runner_dy.Run(stream); + } + return; + } + + // Resize dim 1 to 2 + Tensor x_temp, y_temp, dout_temp; + x_temp.ShareDataWith(*X); + y_temp.ShareDataWith(*Y); + dout_temp.ShareDataWith(*dOut); + if (x_ndim == 1) { + x_dims.insert(x_dims.begin(), 1); + out_dims.insert(out_dims.end() - 1, 1); + x_temp.Resize(framework::make_ddim(x_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + x_ndim = 2; + out_ndim += 1; + } + if (y_ndim == 1) { + y_dims.push_back(1); + out_dims.push_back(1); + y_temp.Resize(framework::make_ddim(y_dims)); + dout_temp.Resize(framework::make_ddim(out_dims)); + y_ndim = 2; + out_ndim += 1; + } + + // Case 2: [M, K] x [K, N] = [M, N] + if (out_ndim == 2) { + if (dX) { + dX->Resize(framework::make_ddim(x_dims)); + if (trans_x) { + MatMul2D(ctx, stream, y_temp, dout_temp, dX, trans_y, true); + } else { + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); } + dX->Resize(X->dims()); } - } else if (x->dims().size() > 2) { - if (transpose_y) { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - 
NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", false}}); - - runner_dx.Run(stream); + if (dY) { + dY->Resize(framework::make_ddim(y_dims)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, trans_x); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, !trans_x, false); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*dout, *x}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); + dY->Resize(Y->dims()); + } + return; + } + + const int K = trans_x ? x_dims[x_ndim - 2] : x_dims[x_ndim - 1]; + const int N = trans_y ? y_dims[y_ndim - 2] : y_dims[y_ndim - 1]; - runner_dy.Run(stream); + // Case 3: [B, M, K] x [K, N] = [B, M, N], when trans_x = false + // Equal: [B * M, K] x [K, N] = [B * M, N] => [B, M, N] + if (trans_x == false && y_ndim == 2) { + std::vector x_vec_dim = {x_temp.numel() / K, K}; + dout_temp.Resize( + framework::make_ddim(std::vector{dout_temp.numel() / N, N})); + if (dX) { + dX->Resize(framework::make_ddim(x_vec_dim)); + MatMul2D(ctx, stream, dout_temp, y_temp, dX, false, !trans_y); + dX->Resize(X->dims()); + } + if (dY) { + x_temp.Resize(framework::make_ddim(x_vec_dim)); + if (trans_y) { + MatMul2D(ctx, stream, dout_temp, x_temp, dY, true, false); + } else { + MatMul2D(ctx, stream, x_temp, dout_temp, dY, true, false); } - } else { - if (dx) { - dx->mutable_data(ctx.GetPlace()); - const auto& runner_dx = - NpuOpRunner("BatchMatMul", {*dout, *y}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); + } + return; + } + + // Case 4: [B, M, K] x [B, K, N] = [B, M, N] + std::vector x_broadcast_dims(out_ndim, 1); + std::vector y_broadcast_dims(out_ndim, 1); + std::copy(out_dims.begin(), out_dims.end() - 2, x_broadcast_dims.begin()); + std::copy(out_dims.begin(), out_dims.end() - 2, y_broadcast_dims.begin()); + std::copy(x_dims.end() - 2, x_dims.end(), x_broadcast_dims.end() - 2); + std::copy(y_dims.end() - 2, y_dims.end(), y_broadcast_dims.end() - 2); + + Tensor x_temp_brd(X->type()); + if (x_dims == x_broadcast_dims) { + x_temp_brd.ShareDataWith(*X); + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + } else { + x_temp_brd.Resize(framework::make_ddim(x_broadcast_dims)); + x_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(x_temp) + .AddInput(std::move(x_broadcast_dims)) + .AddOutput(x_temp_brd) + .Run(stream); + } - runner_dx.Run(stream); + Tensor y_temp_brd(Y->type()); + if (y_dims == y_broadcast_dims) { + y_temp_brd.ShareDataWith(*Y); + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + } else { + y_temp_brd.Resize(framework::make_ddim(y_broadcast_dims)); + y_temp_brd.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(y_temp) + .AddInput(std::move(y_broadcast_dims)) + .AddOutput(y_temp_brd) + .Run(stream); + } + + if (dX) { + if (x_dims == x_broadcast_dims) { + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, dX, trans_y, true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, dX, false, !trans_y); } - if (dy) { - dy->mutable_data(ctx.GetPlace()); - if ((x->dims().size() == 3) && (dout->dims().size() == 3) && - (dy->dims().size() == 2)) { - framework::Tensor dout_tmp; - dout_tmp.ShareDataWith(*dout); - std::vector vec_dim = - framework::vectorize(dout_tmp.dims()); - std::vector vec_dim_v{vec_dim[0] * vec_dim[1], vec_dim[2]}; - dout_tmp.Resize(framework::make_ddim(vec_dim_v)); - - framework::Tensor x_tmp; - 
x_tmp.ShareDataWith(*x); - std::vector vec_dim_x = - framework::vectorize(x_tmp.dims()); - std::vector vec_dim_x_v{vec_dim_x[0] * vec_dim_x[1], - vec_dim_x[2]}; - x_tmp.Resize(framework::make_ddim(vec_dim_x_v)); - const auto& runner_dy = - NpuOpRunner("MatMul", {x_tmp, dout_tmp}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); - } else { - const auto& runner_dy = - NpuOpRunner("BatchMatMul", {*x, *dout}, {*dy}, - {{"adj_x1", true}, {"adj_x2", false}}); - runner_dy.Run(stream); - } + } else { + Tensor dx_temp(X->type()); + dx_temp.Resize(framework::make_ddim(x_broadcast_dims)); + if (trans_x) { + MatMulND(ctx, stream, y_temp_brd, dout_temp, &dx_temp, trans_y, + true); + } else { + MatMulND(ctx, stream, dout_temp, y_temp_brd, &dx_temp, false, + !trans_y); } + ReduceDims(ctx, stream, x_dims, x_broadcast_dims, dx_temp, dX); + } + } + if (dY) { + if (y_dims == y_broadcast_dims) { + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, dY, true, trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, dY, !trans_x, false); + } + } else { + Tensor dy_temp(Y->type()); + dy_temp.Resize(framework::make_ddim(y_broadcast_dims)); + if (trans_y) { + MatMulND(ctx, stream, dout_temp, x_temp_brd, &dy_temp, true, + trans_x); + } else { + MatMulND(ctx, stream, x_temp_brd, dout_temp, &dy_temp, !trans_x, + false); + } + ReduceDims(ctx, stream, y_dims, y_broadcast_dims, dy_temp, dY); } } } }; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - matmul_v2, - ops::MatMulV2NPUKernel, - ops::MatMulV2NPUKernel); -REGISTER_OP_NPU_KERNEL( - matmul_v2_grad, - ops::MatMulV2GradNPUKernel, - ops::MatMulV2GradNPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2, ops::MatMulV2NPUKernel, + ops::MatMulV2NPUKernel); +REGISTER_OP_NPU_KERNEL(matmul_v2_grad, ops::MatMulV2GradNPUKernel, + ops::MatMulV2GradNPUKernel); diff --git a/paddle/fluid/operators/meshgrid_op_npu.cc b/paddle/fluid/operators/meshgrid_op_npu.cc index 9605fa092f0697..f22e2e178ef851 100644 --- a/paddle/fluid/operators/meshgrid_op_npu.cc +++ b/paddle/fluid/operators/meshgrid_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/meshgrid_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index d992890adeec3e..29106dc30498e8 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -13,6 +13,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { @@ -169,6 +170,13 @@ struct GeluMKLDNNGradFunctor : public BaseActivationFunctor { } }; +template +struct SoftplusMKLDNNFunctor : public BaseActivationFunctor { + void operator()(const framework::ExecutionContext &ctx) const { + custom_softplus_eltwise_forward(ctx); + } +}; + template using ReluMKLDNNFunctor = MKLDNNActivationFunc; @@ -257,7 +265,6 @@ namespace ops = paddle::operators; ops::grad_functor>); #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ - __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ @@ -267,7 +274,14 @@ namespace ops = paddle::operators; __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); +REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, + ReluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradFunctor); + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL( + softplus, MKLDNN, paddle::platform::CPUPlace, + ops::MKLDNNActivationKernel>); diff --git a/paddle/fluid/operators/mkldnn/axpy_handler.cc b/paddle/fluid/operators/mkldnn/axpy_handler.cc index ed265edf003e01..db1127b055c31e 100644 --- a/paddle/fluid/operators/mkldnn/axpy_handler.cc +++ b/paddle/fluid/operators/mkldnn/axpy_handler.cc @@ -23,7 +23,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/place.h" -#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index 57a56776736ff9..4cc96a48bd26f4 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -23,6 +23,7 @@ namespace operators { using framework::DataLayout; using framework::Tensor; +using framework::LoDTensor; using mkldnn::memory; using mkldnn::primitive; using mkldnn::concat; @@ -149,6 +150,72 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_format(platform::GetMKLDNNFormat(*dst_mem)); } }; + +template +class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + const auto& dev_ctx = + ctx.template device_context(); + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); + + const auto x = ctx.MultiInput("X"); + const auto* dout = ctx.Input(framework::GradVarName("Out")); + auto dx = ctx.MultiOutput(framework::GradVarName("X")); + + for (size_t i = 0; i < dx.size(); ++i) { + if (dx[i] != nullptr) { + dx[i]->set_lod(x[i]->lod()); + } + } + + int axis = ctx.Attr("axis"); + if (ctx.HasInput("AxisTensor")) { + auto* axis_tensor = ctx.Input("AxisTensor"); + axis = GetDataFromTensor(axis_tensor)[0]; + } + + auto dout_vec_dims = framework::vectorize(dout->dims()); + + axis = ComputeAxis(axis, dout_vec_dims.size()); + + std::vector offset(dout_vec_dims.size(), 0); + + mkldnn::memory::data_type dout_type = + framework::ToMKLDNNDataType(dout->type()); + platform::ReorderMKLDNNHandler reorder_handler(dout_vec_dims, dout->type(), + dout_type, onednn_engine); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->format(), platform::to_void_cast(dout->data())); + + for (size_t i = 0; i < dx.size(); ++i) { + if (out_var_names[i] != framework::kEmptyVarName && + dx[i]->numel() != 0UL) { + auto dx_vec_dims = framework::vectorize(dx[i]->dims()); + auto slice_mem_p = reorder_handler.AcquireSubmemory( + dx_vec_dims, offset, reorder_src_memory_p); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + dx[i], dx_vec_dims, dout->format(), ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); + + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + offset[axis] += dx[i]->dims()[axis]; + + dx[i]->set_layout(framework::DataLayout::kMKLDNN); + dx[i]->set_format(platform::GetMKLDNNFormat(*reorder_dst_memory_p)); + } + } + astream.wait(); + } +}; + } // namespace operators } // namespace paddle @@ -159,3 +226,7 @@ REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel); + +REGISTER_OP_KERNEL(concat_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConcatGradMKLDNNOpKernel, + ops::ConcatGradMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index 1b69dd7ea00c7c..cce835e6bc0354 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ 
b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,27 +12,16 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/framework/data_layout_transform.h" +#include + #include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace platform { -class MKLDNNDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { - -using framework::DataLayout; -using mkldnn::memory; -using mkldnn::primitive; -using mkldnn::reorder; -using mkldnn::stream; -using platform::GetMKLDNNFormat; -using platform::to_void_cast; +namespace { inline MKLDNNMemoryFormat GetWeightsFormat(const MKLDNNMemoryFormat format, const int groups, @@ -78,7 +67,7 @@ class ConvMKLDNNHandlerT mkldnn::convolution_backward_data, mkldnn::convolution_backward_weights> { public: - ConvMKLDNNHandlerT(const paddle::framework::ExecutionContext& ctx, + ConvMKLDNNHandlerT(const framework::ExecutionContext& ctx, const platform::MKLDNNDeviceContext& dev_ctx, const mkldnn::engine mkldnn_engine, platform::Place cpu_place, const Tensor* input, @@ -92,19 +81,19 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isCached()) { PADDLE_ENFORCE_EQ( - input->layout(), DataLayout::kMKLDNN, + input->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); + framework::DataLayout::kMKLDNN, input->layout())); PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Input tensor")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for Filter tensor")); @@ -137,10 +126,10 @@ class ConvMKLDNNHandlerT if (bias) { PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, + bias->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The Bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); + framework::DataLayout::kMKLDNN, bias->layout())); PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Bias tensor.")); @@ -188,12 +177,12 @@ class ConvMKLDNNHandlerT std::transform(dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { return i - 1; }); - const auto src_tz = paddle::framework::vectorize(input->dims()); + const auto src_tz = framework::vectorize(input->dims()); - auto weights_tz = paddle::framework::vectorize(filter->dims()); + auto weights_tz = framework::vectorize(filter->dims()); platform::GetGroupConvWeightsTz(weights_tz, groups); - const auto dst_tz = paddle::framework::vectorize(output->dims()); + const auto dst_tz = 
framework::vectorize(output->dims()); const mkldnn::memory::dims stride_dims = strides; const auto mkldnn_paddings = platform::ToMkldnnPadding(paddings); @@ -204,29 +193,49 @@ class ConvMKLDNNHandlerT * the memory format preferred for best performance */ auto chosen_memory_format = MKLDNNMemoryFormat::any; - auto data_type = mkldnn::memory::data_type::f32; if (ctx.Attr("mkldnn_data_type") == "bfloat16" || std::is_same::value) data_type = mkldnn::memory::data_type::bf16; - const auto src_md = - platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); - const auto weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, - MKLDNNMemoryFormat::any); + mkldnn::memory::desc src_md, weights_md; + if (platform::is_int8()) { + src_md = platform::MKLDNNMemDesc( + src_tz, framework::ToMKLDNNDataType(input->type()), + chosen_memory_format); + weights_md = platform::MKLDNNMemDesc( + weights_tz, mkldnn::memory::data_type::s8, chosen_memory_format); + } else { + src_md = + platform::MKLDNNMemDesc(src_tz, data_type, chosen_memory_format); + weights_md = platform::MKLDNNMemDesc(weights_tz, data_type, + MKLDNNMemoryFormat::any); + } + const auto dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - const auto fwd_prop_kind = is_test ? mkldnn::prop_kind::forward_inference : mkldnn::prop_kind::forward_training; + float sum_scale = 1.0f; + std::vector output_shift_scale; + if (platform::is_int8()) + std::tie(sum_scale, output_shift_scale) = get_int8_scales(ctx); + const mkldnn::primitive_attr conv_attr = CreatePostOps( - fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn); + fuse_activation, fuse_alpha, fuse_beta, fuse_residual_conn, + output_shift_scale, sum_scale); // for INT8 only! if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = - platform::MKLDNNMemDesc(bias_tz, data_type, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc(bias_tz, data_type, + MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, fwd_prop_kind, dnnl::algorithm::convolution_direct, @@ -255,28 +264,28 @@ class ConvMKLDNNHandlerT unique_name)) { if (!this->isBwdCached()) { PADDLE_ENFORCE_EQ( - in->layout(), DataLayout::kMKLDNN, + in->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, in->layout())); + framework::DataLayout::kMKLDNN, in->layout())); PADDLE_ENFORCE_NE(in->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Input tensor.")); PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, + filter->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); + framework::DataLayout::kMKLDNN, filter->layout())); PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Got wrong format for Filter tensor.")); PADDLE_ENFORCE_EQ( - out_grad->layout(), DataLayout::kMKLDNN, + out_grad->layout(), framework::DataLayout::kMKLDNN, platform::errors::InvalidArgument( "The output_grad tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, out_grad->layout())); + framework::DataLayout::kMKLDNN, out_grad->layout())); 
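+          // As in the forward handler, the tensors consumed here must already
+          // carry the oneDNN (kMKLDNN) layout and a defined memory format;
+          // otherwise the primitive descriptors created below cannot be built.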
PADDLE_ENFORCE_NE(out_grad->format(), MKLDNNMemoryFormat::undef, platform::errors::InvalidArgument( "Wrong format set for output_grad tensor")); @@ -296,28 +305,25 @@ class ConvMKLDNNHandlerT std::vector dilations(begin(dilations_temp), end(dilations_temp)); - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - int groups = ctx.Attr("groups"); - auto input_dims = in->dims(); auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); auto filter_dims = filter->dims(); auto filter_data_dims = framework::slice_ddim(filter_dims, 2, filter_dims.size()); - auto ksize = framework::vectorize(filter_data_dims); + std::string padding_algorithm = + ctx.Attr("padding_algorithm"); UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); auto src_tz = framework::vectorize(in->dims()); auto weights_tz = framework::vectorize(filter->dims()); + int groups = ctx.Attr("groups"); int g = std::max(groups, 1); platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(out_grad->dims()); + auto dst_tz = framework::vectorize(out_grad->dims()); /* create memory descriptor for conv backward without specified format * ('any') which lets a primitive (conv backward in this case) choose @@ -349,8 +355,14 @@ class ConvMKLDNNHandlerT mkldnn::primitive_attr conv_attr; if (bias) { auto bias_tz = framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc( - bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + mkldnn::memory::desc bias_md; + if (platform::is_int8()) { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::s32, MKLDNNMemoryFormat::x); + } else { + bias_md = platform::MKLDNNMemDesc( + bias_tz, mkldnn::memory::data_type::f32, MKLDNNMemoryFormat::x); + } this->AcquireForwardPrimitiveDescriptor( conv_attr, mkldnn::prop_kind::forward_training, @@ -377,6 +389,71 @@ class ConvMKLDNNHandlerT } } + std::tuple> get_int8_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_in_data = ctx.Attr("Scale_in"); + const auto& scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + bool is_multi_channel = scale_weights_data.size() > 1; + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + float sum_scale = + fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; + int count = + is_multi_channel + ? (groups > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + // weights data will contain 0 in some models, then weights + // scale couldn't be calculated + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + static_cast(static_cast(scale_out_data) / + (static_cast(scale_in_data) * + static_cast(scale_weights_data[i]))); + } + + return std::make_tuple(sum_scale, output_shift_scale); + } + + std::tuple> get_int8_bias_scales( + const framework::ExecutionContext& ctx) const { + const auto* filter = ctx.Input("Filter"); + const auto& weights_tz = framework::vectorize(filter->dims()); + const int groups = std::max(ctx.Attr("groups"), 1); + + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const auto& scale_in_data = ctx.Attr("Scale_in"); + + bool is_multi_channel = scale_weights_data.size() > 1; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); + +#pragma omp parallel for if (count > 50) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + + return std::make_tuple(mask_reorder, scale_bias_data); + } + mkldnn::primitive_attr CreatePostOps( std::string fuse_activation, float fuse_alpha, float fuse_beta, bool fuse_residual_conn, const std::vector output_shift_scale = {}, @@ -433,7 +510,7 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->bwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_d_p", false); + platform::to_void_cast(filter_data), "@weights_mem_d_p", false); } std::shared_ptr AcquireSrcMemoryWithReorder( @@ -480,11 +557,11 @@ class ConvMKLDNNHandlerT framework::vectorize(in_mem->dims()), platform::MKLDNNGetDataType(), in_mem->format()); return this->AcquireMemoryWithReorder( - user_mem_md, mem_md, to_void_cast(in_mem_data), key_mem); + user_mem_md, mem_md, platform::to_void_cast(in_mem_data), key_mem); } else { const std::string target_key_suffix{key_mem_target}; const auto target_mem_p = this->AcquireMemory(target_key_suffix); - user_mem_p->set_data_handle(to_void_cast(in_mem_data)); + user_mem_p->set_data_handle(platform::to_void_cast(in_mem_data)); if (user_mem_p != target_mem_p) { this->AcquireReorder(user_mem_p, target_mem_p, key_mem); } @@ -494,7 +571,8 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireWeightsMemoryWithReorder( const framework::Tensor* filter, const int groups, const bool is_conv3d, - const bool is_test) { + const bool is_test, const std::vector& scale_data = {1.0f}, + int mask = 0) { // This is workaround to make execution faster, delete // if statement after including md inside Tensor auto weights_mem_p = this->AcquireMemory("@weights_mem_p_target"); @@ -511,12 +589,14 @@ class ConvMKLDNNHandlerT return this->AcquireMemoryWithReorder( user_src_md, this->fwd_pd_->weights_desc(), - to_void_cast(filter_data), "@weights_mem_p", is_test); + platform::to_void_cast(filter_data), "@weights_mem_p", is_test, {}, + scale_data, mask); } } std::shared_ptr AcquireBiasMemoryWithReorder( - const framework::Tensor* bias, const bool is_test) { + const framework::Tensor* bias, const bool is_test, + const std::vector& scale_data = {1.0f}, int mask = 0) { auto bias_mem_p = this->AcquireMemory("@bias_mem_p_target"); if (is_test && 
bias_mem_p) { return bias_mem_p; @@ -527,8 +607,9 @@ class ConvMKLDNNHandlerT MKLDNNMemoryFormat::x); return this->AcquireMemoryWithReorder( - user_bias_md, this->fwd_pd_->bias_desc(), to_void_cast(bias_data), - "@bias_mem_p", is_test); + user_bias_md, this->fwd_pd_->bias_desc(), + platform::to_void_cast(bias_data), "@bias_mem_p", is_test, {}, + scale_data, mask); } } @@ -536,8 +617,8 @@ class ConvMKLDNNHandlerT const framework::Tensor* residual_param) { void* residual_data = residual_param->type() == framework::DataTypeTrait::DataType() - ? to_void_cast(residual_param->data()) - : to_void_cast(residual_param->data()); + ? platform::to_void_cast(residual_param->data()) + : platform::to_void_cast(residual_param->data()); auto residual_mem_p = this->AcquireMemory("@user_residual_data_mem_p"); if (residual_mem_p) { residual_mem_p->set_data_handle(residual_data); @@ -572,12 +653,14 @@ class ConvMKLDNNHandlerT } }; +} // anonymous namespace + template -class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL Conv must use CPUPlace")); bool is_INT8 = std::is_same::value || std::is_same::value; @@ -607,9 +690,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { } template - void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { + void ComputeFP32(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); const bool is_test = ctx.Attr("is_test"); @@ -656,407 +739,112 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { conv_p->execute(astream, args); astream.wait(); - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } template - void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { - const bool is_test = ctx.Attr("is_test"); - + void ComputeINT8(const framework::ExecutionContext& ctx) const { auto& dev_ctx = - ctx.template device_context(); + ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); - auto* input = ctx.Input("Input"); - auto* output = ctx.Output("Output"); - - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The input tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, input->layout())); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Input tensor.")); - - PADDLE_ENFORCE_GE(input->dims().size(), 4, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); - PADDLE_ENFORCE_LE(input->dims().size(), 5, - platform::errors::InvalidArgument( - "Input must be with 4 or 5 dimensions, i.e. 
NCHW or " - "NCDHW, but got dimension = %d .", - input->dims().size())); + const std::string& fuse_activation = + ctx.Attr("fuse_activation"); + const bool& fuse_residual_conn = ctx.Attr("fuse_residual_connection"); + const bool& force_fp32_output = ctx.Attr("force_fp32_output"); + const bool is_conv3d = ctx.Attr>("strides").size() == 3U; - std::string fuse_activation = ctx.Attr("fuse_activation"); - bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); bool unsigned_output = (fuse_activation == "relu" || fuse_activation == "relu6"); - - const T* input_data = input->data(); - - auto src_tz = paddle::framework::vectorize(input->dims()); - - mkldnn::memory::data_type src_dt = - paddle::framework::ToMKLDNNDataType(input->type()); - - std::string key = - platform::CreateKey(dev_ctx, src_tz, src_dt, - ctx.InputName("Input") + ctx.InputName("Filter")); - bool need_s8_to_u8 = false; - std::shared_ptr conv_p; - std::shared_ptr src_memory_p; - std::shared_ptr user_src_memory_p; - std::shared_ptr dst_memory_p; - std::vector pipeline; - std::shared_ptr conv_pd; - std::shared_ptr handler; - - // This is workaround for hacky implementation - // of conv int8 mkl-dnn. Once conv fp32 and conv int8 - // are merged/unified, this will disappear - auto key_tid = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); - - const std::string key_conv_pd = key_tid + "@conv_pd"; - auto prim_key = key_tid + "@conv_p"; - auto dst_key = key_tid + "@dst_mem_p"; - auto src_key = key_tid + "@src_mem_p"; - auto weights_key = key_tid + "@weights_mem_p"; - auto bias_key = key_tid + "@bias_mem_p"; - auto user_src_key = key_tid + "@user_src_mem_p"; - auto user_residual_key = key_tid + "@user_residual_data_mem_p"; - auto src_reorder_key = key_tid + "@src_mem_preorder_p"; - auto residual_reorder_key = key_tid + "@residual_data_mem_preorder_p"; - - conv_pd = - std::static_pointer_cast( - dev_ctx.GetBlob(key_conv_pd)); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + PADDLE_ENFORCE_NE( + is_conv3d, true, + platform::errors::Unimplemented( + "OneDNN int8 convolution does not support 3D inputs currently")); + PADDLE_ENFORCE_EQ( + fuse_residual_conn && force_fp32_output, false, + platform::errors::Unimplemented( + "residual fusion does not support force output with fp32")); - if (conv_pd == nullptr || !is_test) { - float fuse_alpha = ctx.Attr("fuse_alpha"); - float fuse_beta = ctx.Attr("fuse_beta"); - bool force_fp32_output = ctx.Attr("force_fp32_output"); + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); - auto* filter = ctx.Input("Filter"); + ConvMKLDNNHandlerT handler( + ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, filter, bias, + output, ctx.InputName("Input") + ctx.InputName("Filter")); - PADDLE_ENFORCE_EQ( - filter->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The filter tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, filter->layout())); - PADDLE_ENFORCE_NE(filter->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Filter tensor.")); + auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input); - PADDLE_ENFORCE_GE(filter->dims().size(), 4, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. 
OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); - PADDLE_ENFORCE_LE(filter->dims().size(), 5, - platform::errors::InvalidArgument( - "Filter must be with 4 or 5 dimensions, i.e. OIHW " - "or OIDHW, but got dimensions = %d .", - filter->dims().size())); + const auto& scale_weights_data = + ctx.Attr>("Scale_weights"); + const bool is_multi_channel = scale_weights_data.size() > 1; + const int& groups = ctx.Attr("groups"); + const bool& is_test = ctx.Attr("is_test"); + int mask_reorder = + is_multi_channel ? ((groups != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder( + filter, groups, false, is_test, scale_weights_data, mask_reorder); + std::shared_ptr dst_memory_p; + if (fuse_residual_conn) { + auto* residual_param = ctx.Input("ResidualData"); PADDLE_ENFORCE_EQ( - !fuse_residual_conn || !force_fp32_output, true, - platform::errors::Unimplemented( - "residual fusion does not support force output with fp32")); - - auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; - - if (bias) { - PADDLE_ENFORCE_EQ( - bias->layout(), DataLayout::kMKLDNN, - platform::errors::InvalidArgument( - "The bias tensor's layout should be %d, but got %d.", - DataLayout::kMKLDNN, bias->layout())); - PADDLE_ENFORCE_NE(bias->format(), MKLDNNMemoryFormat::undef, - platform::errors::InvalidArgument( - "Got wrong format for Bias tensor.")); - - PADDLE_ENFORCE_EQ(bias->dims().size(), 1, - platform::errors::InvalidArgument( - "Bias must only have 1 dimension, i.e. X, but " - "got dimension = %d .", - bias->dims().size())); - } - - std::vector strides_temp = ctx.Attr>("strides"); - std::vector strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - std::vector paddings(begin(paddings_temp), end(paddings_temp)); - - std::vector dilations_temp = ctx.Attr>("dilations"); - std::vector dilations(begin(dilations_temp), - end(dilations_temp)); - - std::string padding_algorithm = - ctx.Attr("padding_algorithm"); - - bool is_conv3d = strides.size() == 3U; - - PADDLE_ENFORCE_NE(is_conv3d, true, - platform::errors::Unimplemented( - "int8 does not support conv3d currently")); - - auto input_dims = input->dims(); - auto data_dims = framework::slice_ddim(input_dims, 2, input_dims.size()); - auto filter_dims = filter->dims(); - auto filter_data_dims = - framework::slice_ddim(filter_dims, 2, filter_dims.size()); - - auto ksize = framework::vectorize(filter_data_dims); - - UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, - data_dims, strides, ksize); - - int groups = ctx.Attr("groups"); - auto weights_tz = paddle::framework::vectorize(filter->dims()); - int g = std::max(groups, 1); - - platform::GetGroupConvWeightsTz(weights_tz, g); - auto dst_tz = paddle::framework::vectorize(output->dims()); - - std::transform(dilations.begin(), dilations.end(), dilations.begin(), - [](int64_t i) { return i - 1; }); - - const K* filter_data = filter->data(); - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_in_eltwise_data = ctx.Attr("Scale_in_eltwise"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - auto scale_out_data = - force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); - float sum_scale = - fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f; - - bool is_multi_channel = scale_weights_data.size() > 1; - - int count = is_multi_channel ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] - : (weights_tz)[0]) - : 1; - std::vector output_shift_scale(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - if (scale_weights_data[i] == 0.0) - output_shift_scale[i] = - scale_out_data; // weights data will contain 0 - // in some models, then weights - // scale couldn't be calculated - else - output_shift_scale[i] = - static_cast(static_cast(scale_out_data) / - (static_cast(scale_in_data) * - static_cast(scale_weights_data[i]))); - } - - auto user_src_md = - platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); - auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - ((g) == 1) ? MKLDNNMemoryFormat::oihw : MKLDNNMemoryFormat::goihw); - - /* create memory descriptor for convolution without specified format - * ('any') which lets a primitive (convolution in this case) choose - * the memory format preferred for best performance - */ - auto chosen_memory_format = MKLDNNMemoryFormat::any; - - std::vector bias_tz; - - auto src_md = - platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); - auto weights_md = platform::MKLDNNMemDesc( - weights_tz, memory::data_type::s8, chosen_memory_format); - auto dst_md = platform::MKLDNNMemDesc( - dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); - - handler.reset( - new platform::ConvMKLDNNHandler(dev_ctx, mkldnn_engine, key)); - // create a conv primitive descriptor and save it for usage in backward - auto propagation = is_test ? mkldnn::prop_kind::forward_scoring - : mkldnn::prop_kind::forward_training; - - if (bias) { - bias_tz = paddle::framework::vectorize(bias->dims()); - auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, - MKLDNNMemoryFormat::x); - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, bias_md, dst_md, strides, dilations, paddings, - mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } else { - conv_pd = handler->AcquireConvolutionPrimitiveDescriptor( - src_md, weights_md, paddle::none, dst_md, strides, dilations, - paddings, mkldnn_engine, fuse_activation, fuse_alpha, fuse_beta, - fuse_residual_conn, propagation, output_shift_scale, sum_scale); - } - - // create mkldnn memory from input tensors (data/weights) - user_src_memory_p = - handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); - auto user_weights_memory_p = handler->AcquireWeightsMemory( - user_weights_md, to_void_cast(filter_data)); - - // create reorder primitive if the input format is not the preferred one - src_memory_p = - handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); - - std::shared_ptr weights_memory_p; - int mask_reorder = - is_multi_channel ? ((g != 1) ? 
(1 << 1) + (1 << 0) : 1 << 0) : 0; - weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( - user_weights_memory_p, pipeline, is_test, true, scale_weights_data, - mask_reorder); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - PADDLE_ENFORCE_EQ( - output->dims(), residual_param->dims(), - platform::errors::InvalidArgument( - "Output and elementwise parameter need to have the " - "same dimension sizes, but got output's dimension = %d" - " and residual param's dimension =%d .", - output->dims().size(), residual_param->dims().size())); - auto residual_dt = - paddle::framework::ToMKLDNNDataType(residual_param->type()); - if (residual_param->format() != handler->GetDstFormat()) { - auto residual_data_tz = - paddle::framework::vectorize(residual_param->dims()); - auto user_residual_md = platform::MKLDNNMemDesc( - residual_data_tz, residual_dt, residual_param->format()); - dst_memory_p = platform::SetDstMemory( - ctx, output, residual_param, user_residual_md, handler, - &pipeline); - } else { - output->ShareDataWith(*residual_param); - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } else { - dst_memory_p = platform::SetDstMemory(ctx, output, handler); - } - - // create convolution op primitive - conv_p = handler->AcquireConvolution(); - if (bias) { - const K* bias_data = bias->data(); - auto user_bias_md = platform::MKLDNNMemDesc( - {bias_tz}, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::x); - auto user_bias_memory_p = handler->AcquireBiasMemory( - user_bias_md, to_void_cast(bias_data)); - std::shared_ptr bias_memory_p; - int mask_reorder = is_multi_channel ? 1 << 0 : 1; - int count = - is_multi_channel - ? (g > 1 ? 
(weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) - : 1; - std::vector scale_bias_data(count); -#pragma omp parallel for if (count > 1) - for (int i = 0; i < count; i++) { - scale_bias_data[i] = scale_in_data * scale_weights_data[i]; - } - bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( - user_bias_memory_p, pipeline, is_test, true, scale_bias_data, - mask_reorder); - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } - } else { - auto src_memory_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(src_reorder_key)); - src_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(src_key)); - if (src_memory_reorder_p) { - user_src_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_src_key)); - user_src_memory_p->set_data_handle(to_void_cast(input_data)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - src_memory_reorder_p->execute(astream, *user_src_memory_p, - *src_memory_p); - astream.wait(); - } - } else if (src_memory_p) { - src_memory_p->set_data_handle(to_void_cast(input_data)); - } - auto weights_memory_p = std::static_pointer_cast( - dev_ctx.GetBlob(weights_key)); + output->dims(), residual_param->dims(), + platform::errors::InvalidArgument( + "Output and elementwise parameter need to have the " + "same dimension sizes, but got output's dimension = %d" + " and residual param's dimension =%d .", + output->dims().size(), residual_param->dims().size())); dst_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); - conv_p = std::static_pointer_cast( - dev_ctx.GetBlob(prim_key)); - handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, - mkldnn_engine, key)); - - if (fuse_residual_conn) { - auto residual_param = ctx.Input("ResidualData"); - output->ShareDataWith(*residual_param); - need_s8_to_u8 = - (platform::MKLDNNGetDataType() == memory::data_type::s8) && - unsigned_output; - } - platform::SetDstMemoryHandler(ctx, output, handler, dst_memory_p); + handler.AcquireDstMemoryWithResidual(output, residual_param); + need_s8_to_u8 = (platform::MKLDNNGetDataType() == + mkldnn::memory::data_type::s8) && + unsigned_output; + } else { + dst_memory_p = handler.template AcquireDstMemory(output); + } - auto residual_reorder_p = std::static_pointer_cast( - dev_ctx.GetBlob(residual_reorder_key)); - if (residual_reorder_p) { - auto user_residual_data_p = std::static_pointer_cast( - dev_ctx.GetBlob(user_residual_key)); - { - platform::RecordEvent record_reorder("int_reorder", - platform::EventRole::kUniqueOp); - residual_reorder_p->execute(astream, *user_residual_data_p, - *dst_memory_p); - astream.wait(); - } - } + auto conv_p = handler.AcquireForwardPrimitive(); + + std::unordered_map args = { + {MKLDNN_ARG_SRC, *src_memory_p}, + {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, + {MKLDNN_ARG_DST, *dst_memory_p}}; - auto bias_memory_p = - std::static_pointer_cast(dev_ctx.GetBlob(bias_key)); + if (bias) { + float mask_reorder; + std::vector scale_bias_data; + std::tie(mask_reorder, scale_bias_data) = + handler.get_int8_bias_scales(ctx); - if (bias_memory_p) { - conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_BIAS, *bias_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } else { - 
conv_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory_p}, - {MKLDNN_ARG_WEIGHTS, *weights_memory_p}, - {MKLDNN_ARG_DST, *dst_memory_p}}); - } + auto bias_memory_p = handler.AcquireBiasMemoryWithReorder( + bias, is_test, scale_bias_data, mask_reorder); + args.insert({MKLDNN_ARG_BIAS, *bias_memory_p}); } + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + conv_p->execute(astream, args); astream.wait(); + if (need_s8_to_u8) { output->mutable_data(ctx.GetPlace()); } - output->set_layout(DataLayout::kMKLDNN); - output->set_format(GetMKLDNNFormat(*dst_memory_p)); + + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); } }; template -class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { +class ConvMKLDNNGradOpKernel : public framework::OpKernel { public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { + void Compute(const framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, - paddle::platform::errors::PreconditionNotMet( + platform::errors::PreconditionNotMet( "Operator DNNL ConvGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); @@ -1105,18 +893,19 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_WEIGHTS, *diff_weights_memory_p}}); astream.wait(); - filter_grad->set_layout(DataLayout::kMKLDNN); + filter_grad->set_layout(framework::DataLayout::kMKLDNN); // in OneDNN groups in convolution are treated as separate dimension // which is not the case in paddlepaddle - auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p); + auto filter_fmt = platform::GetMKLDNNFormat(*diff_weights_memory_p); // For convolution with groups convert from blocked to NCHW // otherwise there will be problems in next operators working on this data if (g > 1) { - memory::data_type in_type = framework::ToMKLDNNDataType(filter->type()); + mkldnn::memory::data_type in_type = + framework::ToMKLDNNDataType(filter->type()); // for 3d conv with groups (six dimensional data reorder to goidhw) // for 2d conv with groups (five dimensional data reorder to goihw) - // auto weights_tz = paddle::framework::vectorize(filter->dims()); + // auto weights_tz = framework::vectorize(filter->dims()); auto weights_tz = diff_weights_memory_p->get_desc().dims(); mkldnn::memory::format_tag out_format = @@ -1168,8 +957,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { {MKLDNN_ARG_DIFF_SRC, *diff_src_memory_p}}); astream.wait(); - input_grad->set_layout(DataLayout::kMKLDNN); - input_grad->set_format(GetMKLDNNFormat(*diff_src_memory_p)); + input_grad->set_layout(framework::DataLayout::kMKLDNN); + input_grad->set_format(platform::GetMKLDNNFormat(*diff_src_memory_p)); } } }; diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 90f0de60b592de..f567f4660534c7 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -104,8 +104,10 @@ class InterpolateMKLDNNKernel : public framework::OpKernel { scale.push_back(scale[0]); } else { // v2 std::vector scale_attr = ctx.Attr>("scale"); - scale.resize(3, scale_attr[0]); - std::copy(scale_attr.begin(), scale_attr.end(), scale.begin()); + if (scale_attr.size() > 0) { + scale.resize(3, scale_attr[0]); + std::copy(scale_attr.begin(), scale_attr.end(), scale.begin()); + } } } if (scale[0] > 0.0f 
&& scale[1] > 0.0f && scale[2] > 0.0f) { diff --git a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc index b78acd32e6dc8f..b7eb5a3ab4b57c 100644 --- a/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_mkldnn_op.cc @@ -245,6 +245,36 @@ class MatMulMKLDNNHandler auto input_dims = ctx.Input(input_name)->dims(); auto new_dims = input_dims; if (!shape.empty() && !axis.empty()) { + auto it_zero = std::find(shape.begin(), shape.end(), 0); + if (it_zero != shape.end()) { + for (uint64_t i = 0; i < shape.size(); i++) { + if (shape[i] == 0) { + PADDLE_ENFORCE_LT( + i, input_dims.size(), + paddle::platform::errors::InvalidArgument( + "The index of 0 in fused_reshape_%s ", + "should be less than output dim size, ", + "but the index is %d and output dim size is %d", input_name, + i, input_dims.size())); + shape[i] = input_dims.at(i); + } + } + } + + // if "-1" is present then one of reshape dims must be infered + auto it_negative = std::find(shape.begin(), shape.end(), -1); + if (it_negative != shape.end()) { + int64_t dim_product = 1; + for (int i = 0; i < input_dims.size(); i++) { + dim_product *= input_dims.at(i); + } + + int64_t shape_product = std::accumulate(shape.begin(), shape.end(), -1, + std::multiplies()); + int index = std::distance(shape.begin(), it_negative); + shape[index] = dim_product / shape_product; + } + new_dims = input_dims.reshape(shape).transpose(axis); } diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index 57a3c385593160..aa0a16944bcfab 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -36,7 +36,8 @@ class MatMulV2MKLDNNHandler MatMulV2MKLDNNHandler(const mkldnn::engine engine, paddle::platform::Place cpu_place, const std::vector& x_org_dims, bool trans_x, - const std::vector& y_org_dims, bool trans_y) + const std::vector& y_org_dims, bool trans_y, + bool is_output_fused) : paddle::platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { // M X K * K X N @@ -86,6 +87,10 @@ class MatMulV2MKLDNNHandler out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; } + if (is_output_fused) { + out_strides = FakeTransposeStrides(out_ddims); + } + auto x_md = memory::desc(x_dims, MKLDNNGetDataType(), x_strides); auto y_md = memory::desc(y_dims, MKLDNNGetDataType(), y_strides); auto out_md = memory::desc(out_ddims, MKLDNNGetDataType(), out_strides); @@ -93,6 +98,24 @@ class MatMulV2MKLDNNHandler this->AcquireForwardPrimitiveDescriptor(x_md, y_md, out_md); } + std::vector FakeTransposeStrides( + const std::vector& matmul_out_dims) const { + // fuse matmul_v2 + transpose + reshape guarantees that output is 4D and + // transpose axis are: {0, 2, 1, 3} + std::vector transpose_axis = {0, 2, 1, 3}; + std::vector fake_strides(transpose_axis.size()); + int ndims = static_cast(transpose_axis.size()); + + int total_stride = 1; + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= matmul_out_dims[transpose_axis[i]]; + } + + return fake_strides; + } + std::shared_ptr AcquireWeightsMemory(const Tensor* input) { const T* input_data = input->data(); return this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc(), @@ -116,7 +139,8 @@ class MatMulV2MKLDNNKernel bool trans_y, Tensor* out, std::vector& out_dims, int execution_number = 0) const { MatMulV2MKLDNNHandler handler(onednn_engine, 
ctx.GetPlace(), x_dims, - trans_x, y_dims, trans_y); + trans_x, y_dims, trans_y, + IsOutputFused(ctx)); const auto src_memory_p = handler.AcquireSrcMemory(x); const auto weights_memory_p = handler.AcquireWeightsMemory(y); @@ -133,9 +157,10 @@ class MatMulV2MKLDNNKernel matmul_p->execute(astream, matmul_args); astream.wait(); + auto format = paddle::platform::MKLDNNFormatForSize( + out->dims().size(), dnnl::memory::format_tag::nchw); out->set_layout(paddle::framework::DataLayout::kMKLDNN); - out->set_format( - GetMKLDNNFormat(dst_memory_p->get_desc().reshape(out_dims))); + out->set_format(format); } private: @@ -148,8 +173,8 @@ class MatMulV2MKLDNNKernel if (x_dims.size() == 1) { x_bd_dims[x_bd_dims.size() - 1] = x_dims[0]; } else if (x_dims.size() == 2) { - x_bd_dims[2] = x_dims[1]; - x_bd_dims[1] = x_dims[0]; + x_bd_dims[x_bd_dims.size() - 1] = x_dims[1]; + x_bd_dims[x_bd_dims.size() - 2] = x_dims[0]; } else { for (size_t i = 0; i < x_dims.size(); ++i) { x_bd_dims[i] = x_dims[i]; @@ -158,15 +183,16 @@ class MatMulV2MKLDNNKernel if (y_dims.size() == 1) { y_bd_dims[x_bd_dims.size() - 2] = y_dims[0]; } else if (y_dims.size() == 2) { - y_bd_dims[2] = y_dims[1]; - y_bd_dims[1] = y_dims[0]; + y_bd_dims[y_bd_dims.size() - 1] = y_dims[1]; + y_bd_dims[y_bd_dims.size() - 2] = y_dims[0]; } else { for (size_t i = 0; i < y_dims.size(); ++i) { y_bd_dims[i] = y_dims[i]; } } - if ((y_dims.size() == x_dims.size()) && y_dims.size() > 2) { + if ((y_dims.size() == x_dims.size()) && y_dims.size() > 2 && + !IsOutputFused(ctx)) { for (size_t i = 0; i < x_dims.size() - 2; ++i) { PADDLE_ENFORCE_EQ( x_dims[i] == y_dims[i] || x_dims[i] == 1 || y_dims[i] == 1, true, @@ -181,6 +207,13 @@ class MatMulV2MKLDNNKernel } } + bool IsOutputFused(const ExecutionContext& ctx) const { + auto& fused_reshape_Out = ctx.Attr>("fused_reshape_Out"); + auto& fused_transpose_Out = + ctx.Attr>("fused_transpose_Out"); + return !fused_reshape_Out.empty() && !fused_transpose_Out.empty(); + } + void RunKernel(const ExecutionContext& ctx) const { const auto& dev_ctx = ctx.template device_context(); const auto& onednn_engine = dev_ctx.GetEngine(); diff --git a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc index e6a7f3e74fcc7a..6c3f4ec06201a1 100644 --- a/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/reshape_mkldnn_op.cc @@ -12,9 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
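A note on the fused_reshape shape handling added above in matmul_mkldnn_op.cc: an entry of 0 copies the corresponding input dimension and a single -1 entry is inferred from the remaining element count. A minimal standalone sketch of that resolution, assuming plain std::vector dimensions instead of Paddle's DDim (ResolveReshape is an illustrative helper, not part of the patch):

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Resolve a reshape target that may contain 0 ("keep the input dimension at
// this index") and at most one -1 ("infer from the remaining element count").
std::vector<int64_t> ResolveReshape(const std::vector<int64_t>& input_dims,
                                    std::vector<int64_t> shape) {
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == 0) {
      assert(i < input_dims.size());
      shape[i] = input_dims[i];
    }
  }
  const int64_t total = std::accumulate(input_dims.begin(), input_dims.end(),
                                        int64_t{1}, std::multiplies<int64_t>());
  for (size_t i = 0; i < shape.size(); ++i) {
    if (shape[i] == -1) {
      // Starting the product at -1 cancels the single -1 entry, leaving the
      // product of the explicitly given dimensions.
      const int64_t known = std::accumulate(
          shape.begin(), shape.end(), int64_t{-1}, std::multiplies<int64_t>());
      shape[i] = total / known;
      break;
    }
  }
  return shape;
}

For example, input dims {2, 3, 4} with a target of {0, -1} resolve to {2, 12}, which matches what the 0- and -1-handling branches above compute before input_dims.reshape(shape).transpose(axis) is applied.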
*/ +#include "paddle/fluid/operators/flatten_op.h" #include "paddle/fluid/operators/squeeze_op.h" #include "paddle/fluid/platform/mkldnn_reuse.h" +namespace { +enum class ReshapeKernelOpName { + reshape, + reshape2, + squeeze, + squeeze2, + flatten, + flatten2, +}; +} // anonymous namespace + namespace paddle { namespace operators { @@ -41,7 +53,7 @@ static std::vector extract_shape( return vec_new_shape; } -template +template class ReshapeMKLDNNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -55,43 +67,13 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { const auto& onednn_engine = dev_ctx.GetEngine(); auto* x = ctx.Input("X"); - auto* xshape = ctx.Output("XShape"); auto* out = ctx.Output("Out"); - framework::DDim x_dims; - // if reshape or squeeze - if (ctx.Type().find("2") == std::string::npos) { - x_dims = x->dims(); - } else { - auto xshape_dims = xshape->dims(); - x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - } + framework::DDim x_dims, out_dims; + InferInOutShape(ctx, x_dims, out_dims); auto x_vec_dims = framework::vectorize(x_dims); - framework::DDim out_dims; - if (ctx.Type() == "squeeze") { - auto& axes = ctx.Attr>("axes"); - out_dims = GetOutputShape(axes, x_dims, true); - } else { - out_dims = out->dims(); - } - - if (ctx.Type().find("reshape") != std::string::npos) { - auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); - if (list_new_shape_tensor.size() > 0) { - auto new_shape = extract_shape(list_new_shape_tensor); - out_dims = ValidateShape(new_shape, x_dims); - } else if (ctx.HasInput("Shape")) { - auto* shape_tensor = ctx.Input("Shape"); - auto* shape_data = shape_tensor->data(); - - auto shape = - std::vector(shape_data, shape_data + shape_tensor->numel()); - out_dims = ValidateShape(shape, x_dims); - } - } - mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type()); platform::ReorderMKLDNNHandler reorder_handler(x_vec_dims, x->type(), x_type, onednn_engine); @@ -116,6 +98,104 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { framework::vectorize(out_dims)))); } + void InferInOutShape(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + switch (op_name) { + case ReshapeKernelOpName::reshape: + InferShapeReshapeOp(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::reshape2: + InferShapeReshape2Op(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::squeeze: + InferShapeSqueezeOp(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::squeeze2: + InferShapeSqueeze2Op(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::flatten: + InferShapeFlattenOp(ctx, x_dims, out_dims); + break; + case ReshapeKernelOpName::flatten2: + InferShapeFlattenOp(ctx, x_dims, out_dims); + break; + default: + PADDLE_THROW(paddle::platform::errors::OutOfRange( + "Reshape kernel doesn not support that operator name")); + } + } + + void InferShapeReshapeOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + x_dims = x->dims(); + out_dims = out->dims(); + ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); + } + + void InferShapeReshape2Op(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* out = ctx.Output("Out"); + auto* xshape = ctx.Output("XShape"); + auto xshape_dims = xshape->dims(); + x_dims = 
framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + out_dims = out->dims(); + ChangeReshapeOutDimsIfNeeded(ctx, x_dims, out_dims); + } + + // in reshape1/2 ops "ShapeTensor" has highest priority and "Shape" has + // second highest priority + void ChangeReshapeOutDimsIfNeeded(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto list_new_shape_tensor = ctx.MultiInput("ShapeTensor"); + if (list_new_shape_tensor.size() > 0) { + auto new_shape = extract_shape(list_new_shape_tensor); + out_dims = ValidateShape(new_shape, x_dims); + } else if (ctx.HasInput("Shape")) { + auto* shape_tensor = ctx.Input("Shape"); + auto* shape_data = shape_tensor->data(); + + auto shape = + std::vector(shape_data, shape_data + shape_tensor->numel()); + out_dims = ValidateShape(shape, x_dims); + } + } + + void InferShapeSqueezeOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* x = ctx.Input("X"); + x_dims = x->dims(); + const auto& axes = ctx.Attr>("axes"); + out_dims = GetOutputShape(axes, x_dims, true); + } + + void InferShapeSqueeze2Op(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto* out = ctx.Output("Out"); + auto* xshape = ctx.Output("XShape"); + auto xshape_dims = xshape->dims(); + x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + out_dims = out->dims(); + } + + void InferShapeFlattenOp(const framework::ExecutionContext& ctx, + framework::DDim& x_dims, + framework::DDim& out_dims) const { + auto x = ctx.Input("X"); + x_dims = x->dims(); + auto axes = ctx.Attr("axis"); + out_dims = framework::make_ddim( + FlattenKernel::GetOutputShape( + axes, x_dims)); + } + protected: static mkldnn::memory::format_tag getPlainFormatTag(const Tensor* tensor) { auto tensor_dims_size = tensor->dims().size(); @@ -223,8 +303,8 @@ class ReshapeMKLDNNKernel : public framework::OpKernel { } }; -template -class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { +template +class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { RunKernel(ctx); @@ -239,14 +319,9 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { auto* dout = ctx.Input(framework::GradVarName("Out")); auto* dx = ctx.Output(framework::GradVarName("X")); - framework::DDim x_dims; - // if reshape or squeeze - if (ctx.Type().find("2") == std::string::npos) { - x_dims = dx->dims(); - } else { - auto xshape_dims = ctx.Input("XShape")->dims(); - x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); - } + framework::DDim dx_dims; + InferOutputShapeInGrad(ctx, dx_dims); + auto dout_vec_dims = framework::vectorize(dout->dims()); mkldnn::memory::data_type dout_type = @@ -265,44 +340,128 @@ class ReshapeGradMKLDNNKernel : public ReshapeMKLDNNKernel { reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - dx->Resize(x_dims); + dx->Resize(dx_dims); dx->set_layout(framework::DataLayout::kMKLDNN); dx->set_format(GetMKLDNNFormat(reorder_dst_memory_p->get_desc().reshape( - framework::vectorize(x_dims)))); + framework::vectorize(dx_dims)))); } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(squeeze, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - -REGISTER_OP_KERNEL(squeeze_grad, MKLDNN, paddle::platform::CPUPlace, - 
ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); -REGISTER_OP_KERNEL(squeeze2, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); - -REGISTER_OP_KERNEL(squeeze2_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); + void InferOutputShapeInGrad(const framework::ExecutionContext& ctx, + framework::DDim& x_dims) const { + switch (op_name) { + case ReshapeKernelOpName::reshape: + InferShapeReshapeSqueezeGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::reshape2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::squeeze: + InferShapeReshapeSqueezeGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::squeeze2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::flatten: + InferShapeFlattenGradOp(ctx, x_dims); + break; + case ReshapeKernelOpName::flatten2: + InferShapeReshape2Squeeze2Flatten2GradOp(ctx, x_dims); + break; + default: + PADDLE_THROW(paddle::platform::errors::OutOfRange( + "Reshape grad kernel doesn not support that operator name")); + } + } -REGISTER_OP_KERNEL(reshape, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); + void InferShapeReshapeSqueezeGradOp(const framework::ExecutionContext& ctx, + framework::DDim& dx_dims) const { + auto* dx = ctx.Output(framework::GradVarName("X")); + dx_dims = dx->dims(); + } -REGISTER_OP_KERNEL(reshape_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); + void InferShapeReshape2Squeeze2Flatten2GradOp( + const framework::ExecutionContext& ctx, framework::DDim& dx_dims) const { + auto xshape_dims = ctx.Input("XShape")->dims(); + dx_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + } -REGISTER_OP_KERNEL(reshape2, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeMKLDNNKernel, - ops::ReshapeMKLDNNKernel); + void InferShapeFlattenGradOp(const framework::ExecutionContext& ctx, + framework::DDim& dx_dims) const { + dx_dims = ctx.Input("X")->dims(); + } +}; +} // namespace operators +} // namespace paddle -REGISTER_OP_KERNEL(reshape2_grad, MKLDNN, paddle::platform::CPUPlace, - ops::ReshapeGradMKLDNNKernel, - ops::ReshapeGradMKLDNNKernel); +namespace ops = paddle::operators; +REGISTER_OP_KERNEL( + squeeze, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + squeeze_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + squeeze2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + squeeze2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + reshape2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + 
ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten2, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeMKLDNNKernel, + ops::ReshapeMKLDNNKernel); + +REGISTER_OP_KERNEL( + flatten2_grad, MKLDNN, paddle::platform::CPUPlace, + ops::ReshapeGradMKLDNNKernel, + ops::ReshapeGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h new file mode 100644 index 00000000000000..fdb2c534e03634 --- /dev/null +++ b/paddle/fluid/operators/mkldnn/softplus_mkldnn_op.h @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/platform/mkldnn_reuse.h" + +namespace paddle { +namespace operators { + +using paddle::framework::Tensor; + +template +class SoftplusMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { + public: + SoftplusMKLDNNHandler(const Tensor* x, const float beta, + const mkldnn::engine engine, platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { + auto x_tz = framework::vectorize(x->dims()); + auto x_md = + dnnl::memory::desc(x_tz, platform::MKLDNNGetDataType(), x->format()); + + auto beta_tz = std::vector(x_tz.size(), 1); + auto beta_md = dnnl::memory::desc(beta_tz, platform::MKLDNNGetDataType(), + x->format()); + + dnnl::post_ops post_ops; + post_ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_soft_relu, 0.0f, + 0.0f); + if (beta != 1.0f) { + post_ops.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, + 1.0f / beta, 0.0f); + } + + dnnl::primitive_attr attrs; + attrs.set_post_ops(post_ops); + + this->AcquireForwardPrimitiveDescriptor(attrs, dnnl::algorithm::binary_mul, + x_md, beta_md, x_md); + } + + std::shared_ptr AcquireBetaMemory(const float* beta) { + return this->AcquireMemoryFromPrimitive( + this->fwd_pd_->src1_desc(), platform::to_void_cast(beta)); + } +}; + +template +void custom_softplus_eltwise_forward(const framework::ExecutionContext& ctx) { + const auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + bool is_inplaced = x->IsSharedBufferWith(*out); + + const float beta = ctx.Attr("beta"); + + SoftplusMKLDNNHandler handler(x, beta, mkldnn_engine, ctx.GetPlace()); + + auto src_memory_p = handler.AcquireSrcMemory(x); + + auto beta_memory_p = handler.AcquireBetaMemory(&beta); + auto dst_memory_p = + is_inplaced ? 
src_memory_p : handler.AcquireDstMemory(out); + auto binary_p = handler.AcquireForwardPrimitive(); + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *src_memory_p}, + {DNNL_ARG_SRC_1, *beta_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + binary_p->execute(astream, args); + astream.wait(); + + out->set_layout(framework::DataLayout::kMKLDNN); + out->set_format(platform::GetMKLDNNFormat(*dst_memory_p)); +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index bb6549c111988e..830e18cb8a14c0 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -26,6 +26,8 @@ limitations under the License. */ #include "paddle/fluid/framework/framework.pb.h" +DECLARE_string(npu_precision_mode); + namespace paddle { namespace operators { @@ -186,6 +188,21 @@ NpuOpRunner &NpuOpRunner::AddAttr(const std::string &name, return *this; } +NpuOpRunner &NpuOpRunner::AddAttrDataType(const std::string &name, + const NPUAttribute &attr) { + PADDLE_ENFORCE_EQ( + (attr.type() == typeid(int)), true, + platform::errors::InvalidArgument( + "Attr type is NOT equal to framework::proto::VarType::Type.")); + if (!attr_) { + attr_ = aclopCreateAttr(); + } + auto dtype = ConvertToNpuDtype( + static_cast(BOOST_GET_CONST(int, attr))); + PADDLE_ENFORCE_NPU_SUCCESS(aclopSetAttrDataType(attr_, name.c_str(), dtype)); + return *this; +} + NpuOpRunner &NpuOpRunner::AddAttrs(const NPUAttributeMap &attrs) { for (const auto &pair : attrs) { AddAttr(pair.first, pair.second); @@ -404,6 +421,12 @@ void NpuOpRunner::Run(aclrtStream stream) const { VLOG(4) << "attr: " << attr_; VLOG(4) << "stream: " << stream; + if (!FLAGS_npu_precision_mode.empty()) { + PADDLE_ENFORCE_NPU_SUCCESS( + aclSetCompileopt(ACL_PRECISION_MODE, FLAGS_npu_precision_mode.c_str())); + VLOG(4) << "set ACL_PRECISION_MODE: " << FLAGS_npu_precision_mode; + } + aclError ret = aclopCompileAndExecute( op_type_.c_str(), input_descs_.size(), input_descs_.data(), input_buffers_.data(), output_descs_.size(), output_descs_.data(), diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index 45e973970a956d..6db5f17d671181 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -58,6 +58,12 @@ class NpuOpRunner { NpuOpRunner &AddAttr(const std::string &name, const NPUAttribute &attr); + // NOTE(qili93): need to add indivisual api for aclopSetAttrDataType + // as typeid(aclDataType) and typeid(framework::proto::VarType::Type) + // always go to attr.type() == typeid(int) to call aclopSetAttrInt + NpuOpRunner &AddAttrDataType(const std::string &name, + const NPUAttribute &attr); + NpuOpRunner &AddAttrs(const NPUAttributeMap &attrs); NpuOpRunner &AddInput(const Tensor &tensor); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cc b/paddle/fluid/operators/optimizers/lars_momentum_op.cc index 8f30dd5b2e68a4..65be35843bdf99 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cc +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cc @@ -13,46 +13,158 @@ See the License for the specific language governing permissions and limitations under the License. 
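As a numeric cross-check of the fused softplus formulation introduced in softplus_mkldnn_op.h above (a binary multiply by beta, an eltwise_soft_relu post-op, and an eltwise_linear post-op scaling by 1/beta when beta != 1), a small standalone sketch with no oneDNN dependency; the function names are illustrative:

#include <cmath>
#include <cstdio>

// softplus(x; beta) = log(1 + exp(beta * x)) / beta
double SoftplusReference(double x, double beta) {
  return std::log1p(std::exp(beta * x)) / beta;
}

// The same value expressed as the post-op chain built by the handler above.
double SoftplusViaPostOps(double x, double beta) {
  double t = x * beta;               // binary_mul with the broadcast beta tensor
  t = std::log1p(std::exp(t));       // eltwise_soft_relu: log(1 + e^t)
  if (beta != 1.0) t *= 1.0 / beta;  // eltwise_linear with alpha = 1/beta
  return t;
}

int main() {
  for (double x : {-2.0, 0.0, 3.5}) {
    std::printf("x=%5.2f ref=%.6f fused=%.6f\n", x, SoftplusReference(x, 2.0),
                SoftplusViaPostOps(x, 2.0));
  }
  return 0;
}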
*/ #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" -#include "paddle/fluid/operators/optimizers/momentum_op.h" namespace paddle { namespace operators { +class LarsMomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInputs("Param"), "Input", "Param", "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasInputs("Grad"), "Input", "Grad", "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasInputs("Velocity"), "Input", "Velocity", + "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasInputs("LearningRate"), "Input", "LearningRate", + "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasOutputs("ParamOut"), "Output", "ParamOut", + "LarsMomentum"); + OP_INOUT_CHECK(ctx->HasOutputs("VelocityOut"), "Output", "VelocityOut", + "LarsMomentum"); + PADDLE_ENFORCE_EQ( + ctx->GetInputsVarType("Param").front(), + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The input var's type should be LoDTensor, but the received is %s", + ctx->GetInputsVarType("Param").front())); + + auto lr_dims = ctx->GetInputsDim("LearningRate"); + auto grad_dim = ctx->GetInputsDim("Grad"); + auto param_dim = ctx->GetInputsDim("Param"); + auto velocity_dim = ctx->GetInputsDim("Velocity"); + auto lars_weight_decays = + ctx->Attrs().Get>("lars_weight_decay"); + auto multi_precision = ctx->Attrs().Get("multi_precision"); + + PADDLE_ENFORCE_EQ( + param_dim.size(), grad_dim.size(), + platform::errors::InvalidArgument( + "Input(Param) and Input(Grad) of LarsMomentumOp should have " + "same quantity. But number of Param is [%d] and Grad is [%d].", + param_dim.size(), grad_dim.size())); + PADDLE_ENFORCE_EQ( + param_dim.size(), velocity_dim.size(), + platform::errors::InvalidArgument( + "Input(Param) and Input(Velocity) of LarsMomentumOp should " + "have same quantity. But number of Param is [%d] and Velocity " + "is [%d].", + param_dim.size(), velocity_dim.size())); + PADDLE_ENFORCE_EQ( + lars_weight_decays.size(), grad_dim.size(), + platform::errors::InvalidArgument( + "Attr(Lars_weight_decay) and " + "Input(Grad) of LarsMomentumOp should have same quantity. " + "But number of Lars_weight_decay is [%d] and Grad is [%d].", + lars_weight_decays.size(), grad_dim.size())); + + if (multi_precision) { + OP_INOUT_CHECK(ctx->HasInputs("MasterParam"), "Input", "MasterParam", + "LarsMomentumMultiPrecision"); + OP_INOUT_CHECK(ctx->HasOutputs("MasterParamOut"), "Output", + "MasterParamOut", "LarsMomentumMultiPrecision"); + } + for (size_t i = 0; i < lr_dims.size(); ++i) { + PADDLE_ENFORCE_EQ(framework::product(lr_dims[i]), 1, + platform::errors::InvalidArgument( + "Learning_rate should be a scalar. But Received " + "LearningRate's dim [%s]", + framework::product(lr_dims[i]))); + } + + for (size_t i = 0; i < param_dim.size(); ++i) { + PADDLE_ENFORCE_EQ(ctx->GetInputsVarType("Grad")[i], + framework::proto::VarType::LOD_TENSOR, + platform::errors::InvalidArgument( + "The Var(%s)'s type should be LoDTensor, " + "but the received is %s", + ctx->Inputs("Grad")[i].front(), + ctx->GetInputsVarType("Grad")[i])); + PADDLE_ENFORCE_EQ( + param_dim[i], grad_dim[i], + platform::errors::InvalidArgument( + "Input(Param) and Input(Grad) input of LarsMomentumOp shall " + "have same dimension. 
But Param`s dim is [%s] and Grad's dim " + "is [%s].", + param_dim[i], grad_dim[i])); + PADDLE_ENFORCE_EQ( + param_dim[i], velocity_dim[i], + platform::errors::InvalidArgument( + "Input(Param) and Input(Velocity) of LarsMomentumOp shall have " + "same dimension. But Param dim [%s] differs with Velocity dim " + "[%s].", + param_dim[i], velocity_dim[i])); + } + ctx->SetOutputsDim("ParamOut", param_dim); + ctx->SetOutputsDim("VelocityOut", param_dim); + if (ctx->HasOutputs("MasterParamOut")) { + ctx->SetOutputsDim("MasterParamOut", param_dim); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + class LarsMomentumOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("Param", "(LoDTensor, default LoDTensor) " - "Input parameter that has to be updated"); + "Input parameter that has to be updated") + .AsDuplicable(); AddInput("Grad", "(LoDTensor, default LoDTensor) " - "Input gradient of the parameter"); + "Input gradient of the parameter") + .AsDuplicable(); AddInput("Velocity", "(LoDTensor, default LoDTensor) " "Input velocity (corresponding to the parameter) " - "that has to be updated"); + "that has to be updated") + .AsDuplicable(); AddInput("LearningRate", "(LoDTensor, default LoDTensor) " - "Input learning rate"); - AddInput("MasterParam", "FP32 master weight for AMP.").AsDispensable(); - + "Input learning rate") + .AsDuplicable(); + AddInput("MasterParam", "FP32 master weight for AMP.") + .AsDuplicable() + .AsDispensable(); AddOutput("ParamOut", "(LoDTensor) This output is updated parameter. " - "It shared memory with Input(Param)."); + "It shared memory with Input(Param).") + .AsDuplicable(); AddOutput("VelocityOut", "(LoDTensor) This output is updated velocity. " - "It shared memory with Input(Velocity)."); + "It shared memory with Input(Velocity).") + .AsDuplicable(); AddOutput("MasterParamOut", "The updated FP32 master weight for AMP. " "It shared memory with Input(MasterParam).") + .AsDuplicable() .AsDispensable(); - AddAttr("mu", "(float) Momentum coefficient"); AddAttr("lars_coeff", "(float, default 0.001) LARS coefficient.") .SetDefault(0.001); - AddAttr("lars_weight_decay", - "(float, default 0.0005) LARS weight decay") - .SetDefault(0.0005); + AddAttr>( + "lars_weight_decay", + "(std::vector, default 0.0005) LARS weight decay params") + .SetDefault({0.0005}); AddAttr("epsilon", "(float, default 0.0) epsilon to avoid Division by Zero.") .SetDefault(0.0); @@ -96,7 +208,7 @@ class LarsMomentumOpVarTypeInference : public framework::VarTypeInference { namespace ops = paddle::operators; REGISTER_OPERATOR( - lars_momentum, ops::MomentumOp, ops::LarsMomentumOpMaker, + lars_momentum, ops::LarsMomentumOp, ops::LarsMomentumOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, ops::LarsMomentumOpVarTypeInference); diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.cu b/paddle/fluid/operators/optimizers/lars_momentum_op.cu index 42477232e7ca1b..2c27a2135c14b2 100644 --- a/paddle/fluid/operators/optimizers/lars_momentum_op.cu +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.cu @@ -14,7 +14,21 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/math/math_cuda_utils.h" #include "paddle/fluid/operators/optimizers/lars_momentum_op.h" +#include "paddle/fluid/platform/fast_divmod.h" + +#if CUDA_VERSION >= 11000 +#include +#endif + +#ifdef __HIPCC__ +#define LARS_BLOCK_SIZE 256 +#else +#define LARS_BLOCK_SIZE 512 +#endif + +#define LARS_MAX_MERGED_OPS 60 namespace paddle { namespace operators { @@ -22,124 +36,472 @@ namespace operators { template using MultiPrecisionType = typename details::MPTypeTrait::Type; -template -__global__ void MomentumLarsKernel( - const T* p, const T* g, const MT* v, - const MultiPrecisionType* learning_rate, const MT mu, const int64_t num, - const MT lars_coeff, const MT lars_weight_decay, - const MultiPrecisionType* p_norm, const MultiPrecisionType* g_norm, - T* p_out, MT* v_out, const MT epsilon, const MT* master_p, MT* master_p_out, - const MultiPrecisionType rescale_grad) { - const MT lr = static_cast(learning_rate[0]); - MT local_lr = lr; - const MT p_n = static_cast(p_norm[0]); - const MT g_n = static_cast(g_norm[0]); +__device__ __forceinline__ float Sqrt(float x) { return sqrtf(x); } +__device__ __forceinline__ double Sqrt(double x) { return sqrt(x); } +__device__ __forceinline__ float Fma(float x, float y, float z) { + return fmaf(x, y, z); +} +__device__ __forceinline__ double Fma(double x, double y, double z) { + return fma(x, y, z); +} + +template +class LarsThreadConfig { + public: + int grid_for_norm; + int grid_for_lars; +#if CUDA_VERSION >= 11000 - if (lars_weight_decay > static_cast(0) && p_n > static_cast(0) && - g_n > static_cast(0)) { - local_lr = - lr * lars_coeff * p_n / (g_n + lars_weight_decay * p_n + epsilon); + private: + int grid_stride; + + public: + explicit LarsThreadConfig(int64_t numel, int sm_num, int num_blocks_per_sm) { + int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE; + grid_for_lars = + std::min(std::min(sm_num * num_blocks_per_sm, grid), LARS_BLOCK_SIZE); + grid_stride = LARS_BLOCK_SIZE * grid_for_lars; } - CUDA_KERNEL_LOOP(i, num) { - MT grad = static_cast(g[i]) * static_cast(rescale_grad); - MT param = master_p ? master_p[i] : static_cast(p[i]); - MT v_new = v[i] * mu + local_lr * (grad + lars_weight_decay * param); - MT p_new = param - v_new; + int GetRepeatTimes(int64_t numel) { + return (numel + grid_stride - 1) / grid_stride - 1; + } +#else + int repeat_times; + explicit LarsThreadConfig(const int64_t numel) { + int grid = (numel + LARS_BLOCK_SIZE - 1) / LARS_BLOCK_SIZE; + grid_for_norm = std::min(grid, LARS_BLOCK_SIZE); + const int grid_stride = grid_for_norm * LARS_BLOCK_SIZE; + repeat_times = (numel + grid_stride - 1) / grid_stride - 1; + // Determine to read 4 fp16 or float data once, but 2 double data once. + grid_for_lars = + std::is_same::value + ? 
(numel + (LARS_BLOCK_SIZE << 1) - 1) / (LARS_BLOCK_SIZE << 1) + : (numel + (LARS_BLOCK_SIZE << 2) - 1) / (LARS_BLOCK_SIZE << 2); + } +#endif +}; + +template +__device__ inline void VectorizeLarsUpdate( + const T* __restrict__ grad, const MT* param, const MT* velocity, + T* param_out, MT* velocity_out, const MT mu, MT local_lr, + const MT lars_weight_decay, const MT rescale_grad, const int tid, + const int grid_stride, const int numel, MT* master_param_out = nullptr) { + using VecType = paddle::platform::AlignedVector; + using VecMType = paddle::platform::AlignedVector; + int main = numel >> (VecSize >> 1); + int tail_offset = main * VecSize; - v_out[i] = v_new; - p_out[i] = static_cast(p_new); - if (master_p_out) master_p_out[i] = p_new; + const VecType* grad_vec = reinterpret_cast(grad); + const VecMType* param_vec = reinterpret_cast(param); + const VecMType* velocity_vec = reinterpret_cast(velocity); + VecType* param_out_vec = reinterpret_cast(param_out); + VecMType* velocity_out_vec = reinterpret_cast(velocity_out); + + VecMType* master_param_out_vec; + if (IsAmp) { + master_param_out_vec = reinterpret_cast(master_param_out); + } + + for (int i = tid; i < main; i += grid_stride) { + VecType param_out_tmp; + VecMType velocity_tmp, param_tmp; + VecType grad_data = grad_vec[i]; + VecMType param_data = param_vec[i]; + VecMType velocity_data = velocity_vec[i]; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT grad_val = static_cast(grad_data[j]) * rescale_grad; + velocity_tmp[j] = + Fma(velocity_data[j], mu, + local_lr * Fma(lars_weight_decay, param_data[j], grad_val)); + param_tmp[j] = param_data[j] - velocity_tmp[j]; + param_out_tmp[j] = static_cast(param_tmp[j]); + } + param_out_vec[i] = param_out_tmp; + velocity_out_vec[i] = velocity_tmp; + if (IsAmp) { + master_param_out_vec[i] = param_tmp; + } + } + + for (int i = tid + tail_offset; i < numel; i += grid_stride) { + MT grad_val = static_cast(grad[i]) * rescale_grad; + MT param_val = param[i]; + MT velocity_tmp = Fma(velocity[i], mu, local_lr * Fma(lars_weight_decay, + param_val, grad_val)); + MT param_tmp = param_val - velocity_tmp; + param_out[i] = static_cast(param_tmp); + velocity_out[i] = velocity_tmp; + if (IsAmp) { + master_param_out[i] = param_tmp; + } } } -template -class LarsMomentumOpCUDAKernel : public framework::OpKernel { - using MPDType = MultiPrecisionType; +#if CUDA_VERSION >= 11000 +/* Once CUDA_VERSION is beyond 11, cooperative_groups can be involved in without + --rdc=true compile flag, then L2_norm kernel can be set with __device__ and + cooperative_groups::grid_group also can be involved. Otherwise, adding this + flag may affect much, L2_norm kernel shall be set with __global__.*/ +// TODO(limingshu): declaration of cooperative_groups wapper is invalid in host. 
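For reference, the two quantities the L2NormKernel below is expected to produce are simply the L2 norm of the parameter and rescale_grad times the L2 norm of the gradient; the kernel arrives at them through per-block partial sums written to p_buffer/g_buffer followed by a second, grid-wide reduction. A host-side sketch of the same computation (illustrative helper, plain C++):

#include <cmath>
#include <cstddef>
#include <utility>
#include <vector>

// Assumes param and grad have the same length, as they do for a single
// parameter tensor and its gradient.
template <typename T>
std::pair<T, T> ReferenceL2Norms(const std::vector<T>& param,
                                 const std::vector<T>& grad, T rescale_grad) {
  T p_sq = 0;
  T g_sq = 0;
  for (size_t i = 0; i < param.size(); ++i) {
    p_sq += param[i] * param[i];
    g_sq += grad[i] * grad[i];
  }
  return {std::sqrt(p_sq), rescale_grad * std::sqrt(g_sq)};
}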
+template +__forceinline__ __device__ void L2NormKernel( + const cooperative_groups::grid_group* cg, +#else +template +__global__ void L2NormKernel( +#endif + const T* p_data, const T* __restrict__ g_data, MT* __restrict__ p_buffer, + MT* __restrict__ g_buffer, const int64_t numel, const int repeat_times, + const MT rescale_grad, const int thresh = 0, MT* __restrict__ p_n = nullptr, + MT* __restrict__ g_n = nullptr) { + __shared__ MT s_buffer[2]; + int tid = threadIdx.x + blockDim.x * blockIdx.x; + int grid_stride = LARS_BLOCK_SIZE * gridDim.x; - public: - void Compute(const framework::ExecutionContext& ctx) const override { - const bool multi_precision = ctx.Attr("multi_precision"); - if (multi_precision) { - InnerCompute(ctx, multi_precision); + MT p_tmp = static_cast(0); + MT g_tmp = static_cast(0); + while (tid < numel) { + MT tmp0 = static_cast(p_data[tid]); + MT tmp1 = static_cast(g_data[tid]); + p_tmp += (tmp0 * tmp0); + g_tmp += (tmp1 * tmp1); + tid += grid_stride; + } + p_tmp = math::blockReduceSum(p_tmp, FINAL_MASK); + g_tmp = math::blockReduceSum(g_tmp, FINAL_MASK); + + if (threadIdx.x == 0) { + p_buffer[blockIdx.x] = p_tmp; + g_buffer[blockIdx.x] = g_tmp; + } +#if CUDA_VERSION >= 11000 + cg->sync(); // Grid sync for writring partial result to gloabl memory + MT p_part_sum = threadIdx.x < gridDim.x ? p_buffer[threadIdx.x] : 0; + MT g_part_sum = threadIdx.x < gridDim.x ? g_buffer[threadIdx.x] : 0; + MT tmp0 = math::blockReduceSum(p_part_sum, FINAL_MASK); + MT tmp1 = math::blockReduceSum(g_part_sum, FINAL_MASK); + if (threadIdx.x == 0) { + s_buffer[0] = tmp0; + s_buffer[1] = tmp1; + } + __syncthreads(); + *p_n = Sqrt(s_buffer[0]); + *g_n = rescale_grad * Sqrt(s_buffer[1]); +#endif +} + +template +__forceinline__ __device__ void MomentumUpdate( + const T* param, const T* __restrict__ grad, const MT* velocity, + T* param_out, MT* velocity_out, const MT* master_param, + MT* master_param_out, const MT* __restrict__ learning_rate, const MT mu, + const MT lars_weight_decay, const MT lars_coeff, const MT epsilon, + const MT rescale_grad, const MT param_norm, const MT grad_norm, + const int tid, const int grid_stride, const int64_t numel, + const bool is_amp) { + const MT lr = learning_rate[0]; + MT local_lr = lr; + if (lars_weight_decay > static_cast(0)) { + local_lr = lr * lars_coeff * param_norm / + (fma(lars_weight_decay, param_norm, grad_norm) + epsilon); + } + if (is_amp) { + VectorizeLarsUpdate( + grad, master_param, velocity, param_out, velocity_out, mu, local_lr, + lars_weight_decay, rescale_grad, tid, grid_stride, numel, + master_param_out); + } else { + if (std::is_same::value || + std::is_same::value) { + /* TODO(limingshu): pointer cast may damage memory accessing for fp16 */ + VectorizeLarsUpdate( + grad, reinterpret_cast(param), velocity, param_out, + velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, + grid_stride, numel); } else { - InnerCompute(ctx, multi_precision); + VectorizeLarsUpdate( + grad, reinterpret_cast(param), velocity, param_out, + velocity_out, mu, local_lr, lars_weight_decay, rescale_grad, tid, + grid_stride, numel); } } +} - private: - template - void InnerCompute(const framework::ExecutionContext& ctx, - const bool multi_precision) const { - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - auto grad = ctx.Input("Grad"); - auto learning_rate = ctx.Input("LearningRate"); - - const framework::Tensor* master_param = nullptr; - 
framework::Tensor* master_param_out = nullptr; - if (multi_precision) { - bool has_master = - ctx.HasInput("MasterParam") && ctx.HasOutput("MasterParamOut"); - PADDLE_ENFORCE_EQ(has_master, true, - platform::errors::InvalidArgument( - "The Input(MasterParam) and Output(MasterParamOut) " - "should not be null when " - "the attr `multi_precision` is true")); - master_param = ctx.Input("MasterParam"); - master_param_out = ctx.Output("MasterParamOut"); - } +#if CUDA_VERSION >= 11000 +template +struct LarsParamWarpper { + int64_t numel_arr[LARS_MAX_MERGED_OPS]; + int repeat_arr[LARS_MAX_MERGED_OPS]; + const T* __restrict__ g_arr[LARS_MAX_MERGED_OPS]; + const MT* __restrict__ lr_arr[LARS_MAX_MERGED_OPS]; + T* __restrict__ p_out_arr[LARS_MAX_MERGED_OPS]; + MT* __restrict__ v_out_arr[LARS_MAX_MERGED_OPS]; + MT* __restrict__ master_p_out_arr[LARS_MAX_MERGED_OPS]; + MT weight_decay_arr[LARS_MAX_MERGED_OPS]; +}; - const MT* master_p = multi_precision ? master_param->data() : nullptr; - MT* master_p_out = multi_precision - ? master_param_out->mutable_data(ctx.GetPlace()) - : nullptr; +template +__global__ void MergedMomentumLarsKernel(LarsParamWarpper lars_warpper, + MT* __restrict__ p_buffer, + MT* __restrict__ g_buffer, + const int op_num, const MT mu, + const MT lars_coeff, const MT epsilon, + const MT rescale_grad, + const bool is_amp) { + int grid_stride = gridDim.x * LARS_BLOCK_SIZE; + int tid = threadIdx.x + blockIdx.x * blockDim.x; + const cooperative_groups::grid_group cg = cooperative_groups::this_grid(); + for (int i = 0; i < op_num; ++i) { + int numel = lars_warpper.numel_arr[i]; + MT param_norm = static_cast(0); + MT grad_norm = static_cast(0); + L2NormKernel(&cg, lars_warpper.p_out_arr[i], lars_warpper.g_arr[i], + p_buffer, g_buffer, numel, lars_warpper.repeat_arr[i], + rescale_grad, 0, ¶m_norm, &grad_norm); + MomentumUpdate( + lars_warpper.p_out_arr[i], lars_warpper.g_arr[i], + lars_warpper.v_out_arr[i], lars_warpper.p_out_arr[i], + lars_warpper.v_out_arr[i], lars_warpper.master_p_out_arr[i], + lars_warpper.master_p_out_arr[i], lars_warpper.lr_arr[i], mu, + lars_warpper.weight_decay_arr[i], lars_coeff, epsilon, rescale_grad, + param_norm, grad_norm, tid, grid_stride, numel, is_amp); + } +} +#endif - T* p_out = param_out->mutable_data(ctx.GetPlace()); - MT* v_out = velocity_out->mutable_data(ctx.GetPlace()); +template +__global__ void MomentumLarsKernel( + const T* param, const T* __restrict__ grad, const MT* velocity, + T* param_out, MT* velocity_out, const MT* master_param, + MT* master_param_out, const MT* __restrict__ learning_rate, + MT* __restrict__ p_buffer, MT* __restrict__ g_buffer, const MT mu, + const MT lars_coeff, const MT lars_weight_decay, const MT epsilon, + const MT rescale_grad, const int repeat_times, const int thresh, + const int64_t numel, const bool is_amp) { + int tid = threadIdx.x + blockIdx.x * blockDim.x; + int grid_stride = gridDim.x * LARS_BLOCK_SIZE; +#if CUDA_VERSION >= 11000 + const cooperative_groups::grid_group cg = cooperative_groups::this_grid(); + MT param_norm = static_cast(0); + MT grad_norm = static_cast(0); + L2NormKernel(&cg, param, grad, p_buffer, g_buffer, numel, repeat_times, + rescale_grad, gridDim.x, ¶m_norm, &grad_norm); +#else + const MT rescale_grad_pow = rescale_grad * rescale_grad; + MT param_part_norm = threadIdx.x < thresh ? p_buffer[threadIdx.x] : 0; + MT grad_part_norm = threadIdx.x < thresh ? 
g_buffer[threadIdx.x] : 0; + __syncthreads(); + MT param_norm = Sqrt(math::blockReduceSum(param_part_norm, FINAL_MASK)); + MT grad_norm = Sqrt(rescale_grad_pow * + math::blockReduceSum(grad_part_norm, FINAL_MASK)); +#endif + MomentumUpdate(param, grad, velocity, param_out, velocity_out, + master_param, master_param_out, learning_rate, mu, + lars_weight_decay, lars_coeff, epsilon, rescale_grad, + param_norm, grad_norm, tid, grid_stride, numel, is_amp); +} + +template +inline void SeparatedLarsMomentumOpCUDAKernel( + const platform::CUDADeviceContext& cuda_ctx, const T* param_data, + T* param_out_data, const MT* velocity_data, MT* velocity_out_data, + const T* grad_data, const MT* lr, MT* p_buffer, MT* g_buffer, const MT mu, + const MT lars_coeff, const MT weight_decay, const MT epsilon, + const MT rescale_grad, const int64_t numel, const MT* master_param_data, + MT* master_out_data, const bool is_amp) { + LarsThreadConfig lars_thread_config(numel); + L2NormKernel<<>>( + param_data, grad_data, p_buffer, g_buffer, numel, + lars_thread_config.repeat_times, rescale_grad); + + MomentumLarsKernel<<>>( + param_data, grad_data, velocity_data, param_out_data, velocity_out_data, + master_param_data, master_out_data, lr, p_buffer, g_buffer, mu, + lars_coeff, weight_decay, epsilon, rescale_grad, 0, + lars_thread_config.grid_for_norm, numel, is_amp); +} + +template +class LarsMomentumOpCUDAKernel : public framework::OpKernel { + using MT = MultiPrecisionType; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + int num_blocks_per_sm = 0; + bool multi_precision = ctx.Attr("multi_precision"); + auto& cuda_ctx = ctx.template device_context(); + int sm_num = cuda_ctx.GetSMCount(); + framework::Tensor tmp_buffer_t = + ctx.AllocateTmpTensor( + {LARS_BLOCK_SIZE << 1}, cuda_ctx); + auto* p_buffer = tmp_buffer_t.mutable_data(ctx.GetPlace()); + auto* g_buffer = p_buffer + LARS_BLOCK_SIZE; MT mu = static_cast(ctx.Attr("mu")); MT lars_coeff = static_cast(ctx.Attr("lars_coeff")); - MT lars_weight_decay = - static_cast(ctx.Attr("lars_weight_decay")); MT epsilon = static_cast(ctx.Attr("epsilon")); - MPDType rescale_grad = - static_cast(ctx.Attr("rescale_grad")); - - auto* p = param->data(); - auto* g = grad->data(); - auto* v = velocity->data(); - auto* lr = learning_rate->data(); - - int block = 512; - int grid = (param->numel() + block - 1) / block; - - auto eigen_p = framework::EigenVector::Flatten(*param); - auto eigen_g = framework::EigenVector::Flatten(*grad); - // calculate norms using eigein and launch the kernel. 
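The element-wise arithmetic carried out by MomentumUpdate and VectorizeLarsUpdate above reduces to the usual LARS rule: scale the base learning rate by the trust ratio ||param|| / (||grad|| + weight_decay * ||param|| + epsilon), then apply a momentum step. A scalar reference sketch (plain C++, illustrative names; the CUDA path additionally vectorizes the loop and maintains the FP32 master parameter for AMP, which is omitted here):

#include <cstddef>
#include <vector>

template <typename T>
void ReferenceLarsUpdate(std::vector<T>* param, std::vector<T>* velocity,
                         const std::vector<T>& grad, T lr, T mu, T lars_coeff,
                         T weight_decay, T epsilon, T rescale_grad,
                         T param_norm, T grad_norm) {
  T local_lr = lr;
  // The CPU kernel also guards on positive norms; the new GPU MomentumUpdate
  // only checks weight_decay > 0.
  if (weight_decay > 0 && param_norm > 0 && grad_norm > 0) {
    local_lr = lr * lars_coeff * param_norm /
               (grad_norm + weight_decay * param_norm + epsilon);
  }
  for (size_t i = 0; i < param->size(); ++i) {
    const T g = grad[i] * rescale_grad;
    const T v_new =
        (*velocity)[i] * mu + local_lr * (g + weight_decay * (*param)[i]);
    (*velocity)[i] = v_new;
    (*param)[i] -= v_new;
  }
}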
- framework::Tensor p_norm_t, g_norm_t; - p_norm_t.Resize({1}); - g_norm_t.Resize({1}); - auto* p_norm_data = p_norm_t.mutable_data(ctx.GetPlace()); - auto* g_norm_data = g_norm_t.mutable_data(ctx.GetPlace()); - auto ep_norm = framework::EigenScalar::From(p_norm_t); - auto eg_norm = framework::EigenScalar::From(g_norm_t); - - auto* place = ctx.template device_context().eigen_device(); - - // eigen unsupport fp16 l2-norm - ep_norm.device(*place) = - eigen_p.template cast().square().sum().sqrt(); - eg_norm.device(*place) = - (eigen_g.template cast() * rescale_grad).square().sum().sqrt(); - - MomentumLarsKernel< - T, MT><<>>( - p, g, v, lr, mu, param->numel(), lars_coeff, lars_weight_decay, - p_norm_data, g_norm_data, p_out, v_out, epsilon, master_p, master_p_out, - rescale_grad); + MT rescale_grad = static_cast(ctx.Attr("rescale_grad")); + + auto weight_decay_arr = ctx.Attr>("lars_weight_decay"); + auto grad = ctx.MultiInput("Grad"); + auto param = ctx.MultiInput("Param"); + auto velocity = ctx.MultiInput("Velocity"); + auto param_out = ctx.MultiOutput("ParamOut"); + auto velocity_out = ctx.MultiOutput("VelocityOut"); + auto learning_rate = ctx.MultiInput("LearningRate"); + auto master_param = ctx.MultiInput("MasterParam"); + auto master_param_out = + ctx.MultiOutput("MasterParamOut"); + + int op_num = grad.size(); +#if CUDA_VERSION >= 11000 + if (op_num > 1) { + LarsParamWarpper lars_warpper; + PADDLE_ENFORCE_LT( + op_num, LARS_MAX_MERGED_OPS, + platform::errors::InvalidArgument( + "The maximum number of merged-ops supported is (%d), but" + "lars op required for trainning this model is (%d)\n", + LARS_MAX_MERGED_OPS, op_num)); + + /* Implementation of lars optimizer consists of following two steps: + 1. Figure out the L2 norm statistic result of grad data and param data. + 2. Update param and velocity with usage of L2 norm statistic result. + Step1 and step2 can be merged with api provided by nvida + cudaLaunchCooperativeKernel: + - The thread quantity shall less than pyhsical SM limited threads + - Launche as thread-block can synchronizlly execute. 
*/ + cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks_per_sm, MergedMomentumLarsKernel, LARS_BLOCK_SIZE, + sizeof(MT) << 1); + + size_t total_numel = 0; + for (int i = 0; i < op_num; ++i) { + size_t temp_numel = param[i]->numel(); + total_numel += temp_numel; + lars_warpper.numel_arr[i] = temp_numel; + lars_warpper.g_arr[i] = grad[i]->data(); + lars_warpper.lr_arr[i] = learning_rate[i]->data(); + lars_warpper.p_out_arr[i] = + param_out[i]->mutable_data(ctx.GetPlace()); + lars_warpper.v_out_arr[i] = + velocity_out[i]->mutable_data(ctx.GetPlace()); + lars_warpper.weight_decay_arr[i] = static_cast(weight_decay_arr[i]); + PADDLE_ENFORCE_EQ( + param[i]->data(), lars_warpper.p_out_arr[i], + platform::errors::InvalidArgument( + "Input(Param) and Output(ParamOut) must be the same Tensors.")); + PADDLE_ENFORCE_EQ(velocity[i]->data(), lars_warpper.v_out_arr[i], + platform::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + int64_t avg_numel = total_numel / op_num; + LarsThreadConfig lars_thread_config(avg_numel, sm_num, + num_blocks_per_sm); + for (int i = 0; i < op_num; ++i) { + lars_warpper.repeat_arr[i] = + lars_thread_config.GetRepeatTimes(lars_warpper.numel_arr[i]); + } + if (multi_precision) { + for (int i = 0; i < op_num; ++i) { + lars_warpper.master_p_out_arr[i] = + master_param_out[i]->mutable_data(ctx.GetPlace()); + PADDLE_ENFORCE_EQ(master_param[i]->data(), + lars_warpper.master_p_out_arr[i], + platform::errors::InvalidArgument( + "Input(MasterParam) and Output(MasterParamOut) " + "must be the same Tensors.")); + } + } + void* cuda_param[] = {reinterpret_cast(&lars_warpper), + reinterpret_cast(&p_buffer), + reinterpret_cast(&g_buffer), + reinterpret_cast(&op_num), + reinterpret_cast(&mu), + reinterpret_cast(&lars_coeff), + reinterpret_cast(&epsilon), + reinterpret_cast(&rescale_grad), + reinterpret_cast(&multi_precision)}; + // Lanuch all sm theads, and thead of each block synchronizedly cooperate. + cudaLaunchCooperativeKernel( + reinterpret_cast(MergedMomentumLarsKernel), + lars_thread_config.grid_for_lars, LARS_BLOCK_SIZE, cuda_param, 0, + cuda_ctx.stream()); + } else { + auto* param_data = param[0]->data(); + auto* grad_data = grad[0]->data(); + auto* velocity_data = velocity[0]->data(); + auto* lr = learning_rate[0]->data(); + auto* param_out_data = param_out[0]->mutable_data(ctx.GetPlace()); + auto* velocity_out_data = + velocity_out[0]->mutable_data(ctx.GetPlace()); + const MT* master_param_data = + multi_precision ? master_param[0]->data() : nullptr; + MT* master_param_out_data = + multi_precision + ? master_param_out[0]->mutable_data(ctx.GetPlace()) + : nullptr; + int64_t numel = param[0]->numel(); + MT lars_weight_decay = weight_decay_arr[0]; + + // Figure out how many blocks can be active in each sm. 
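The occupancy query that follows feeds num_blocks_per_sm into LarsThreadConfig, which decides how many blocks a single cooperative launch may use and how many extra grid-stride sweeps each thread performs. A standalone sketch of that sizing logic for the CUDA >= 11 path (MakeLarsLaunchConfig is an illustrative stand-in for the class):

#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr int kBlockSize = 512;  // LARS_BLOCK_SIZE on CUDA builds

struct LarsLaunchConfig {
  int grid_for_lars;  // blocks actually launched (must all be resident)
  int repeat_times;   // additional grid-stride sweeps per thread
};

LarsLaunchConfig MakeLarsLaunchConfig(int64_t numel, int sm_num,
                                      int num_blocks_per_sm) {
  const int grid = static_cast<int>((numel + kBlockSize - 1) / kBlockSize);
  // A cooperative launch requires every block to be resident at once, so the
  // grid is capped by sm_num * num_blocks_per_sm; capping it at kBlockSize as
  // well lets one block finish the cross-block reduction, since threadIdx.x is
  // used to index the per-block partial sums.
  const int grid_for_lars =
      std::min(std::min(sm_num * num_blocks_per_sm, grid), kBlockSize);
  const int64_t grid_stride = static_cast<int64_t>(kBlockSize) * grid_for_lars;
  const int repeat_times =
      static_cast<int>((numel + grid_stride - 1) / grid_stride) - 1;
  return {grid_for_lars, repeat_times};
}

int main() {
  const LarsLaunchConfig cfg = MakeLarsLaunchConfig(
      /*numel=*/1 << 22, /*sm_num=*/80, /*num_blocks_per_sm=*/2);
  std::printf("grid_for_lars=%d repeat_times=%d\n", cfg.grid_for_lars,
              cfg.repeat_times);
  return 0;
}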
+ cudaOccupancyMaxActiveBlocksPerMultiprocessor( + &num_blocks_per_sm, MomentumLarsKernel, LARS_BLOCK_SIZE, + sizeof(MT) << 1); + LarsThreadConfig lars_thread_config(numel, sm_num, + num_blocks_per_sm); + int repeat_times = lars_thread_config.GetRepeatTimes(numel); + int thresh = 0; + void* cuda_param[] = { + reinterpret_cast(¶m_data), + reinterpret_cast(&grad_data), + reinterpret_cast(&velocity_data), + reinterpret_cast(¶m_out_data), + reinterpret_cast(&velocity_out_data), + reinterpret_cast(&master_param_data), + reinterpret_cast(&master_param_out_data), + reinterpret_cast(&lr), + reinterpret_cast(&p_buffer), + reinterpret_cast(&g_buffer), + reinterpret_cast(&mu), + reinterpret_cast(&lars_coeff), + reinterpret_cast(&lars_weight_decay), + reinterpret_cast(&epsilon), + reinterpret_cast(&rescale_grad), + reinterpret_cast(&repeat_times), + reinterpret_cast(&thresh), // Just a placeholder + reinterpret_cast(&numel), + reinterpret_cast(&multi_precision)}; + // Lanuch all sm theads. + cudaLaunchCooperativeKernel( + reinterpret_cast(MomentumLarsKernel), + lars_thread_config.grid_for_lars, LARS_BLOCK_SIZE, cuda_param, 0, + cuda_ctx.stream()); + } +#else + for (int i = 0; i < op_num; ++i) { + const MT* master_param_data = + multi_precision ? master_param[i]->data() : nullptr; + MT* master_param_out_data = + multi_precision + ? master_param_out[i]->mutable_data(ctx.GetPlace()) + : nullptr; + SeparatedLarsMomentumOpCUDAKernel( + cuda_ctx, param[i]->data(), + param_out[i]->mutable_data(ctx.GetPlace()), + velocity[i]->data(), + velocity_out[i]->mutable_data(ctx.GetPlace()), grad[i]->data(), + learning_rate[i]->data(), p_buffer, g_buffer, mu, lars_coeff, + weight_decay_arr[i], epsilon, rescale_grad, param[i]->numel(), + master_param_data, master_param_out_data, multi_precision); + } +#endif } }; diff --git a/paddle/fluid/operators/optimizers/lars_momentum_op.h b/paddle/fluid/operators/optimizers/lars_momentum_op.h old mode 100755 new mode 100644 index 55775bc08fb5eb..df4d7b9a0438bc --- a/paddle/fluid/operators/optimizers/lars_momentum_op.h +++ b/paddle/fluid/operators/optimizers/lars_momentum_op.h @@ -23,54 +23,48 @@ template class LarsMomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto param_out = ctx.Output("ParamOut"); - auto velocity_out = ctx.Output("VelocityOut"); - auto param = ctx.Input("Param"); - auto velocity = ctx.Input("Velocity"); - auto learning_rate = ctx.Input("LearningRate"); - auto* grad_var = ctx.InputVar("Grad"); - // only support dense for now. 
- PADDLE_ENFORCE_EQ(grad_var->IsType(), true, - platform::errors::InvalidArgument( - "The Var(%s)'s type should be LoDTensor, " - "but the received is %s", - ctx.InputNames("Grad").front(), - framework::ToTypeName(grad_var->Type()))); - auto grad = ctx.Input("Grad"); - - param_out->mutable_data(ctx.GetPlace()); - velocity_out->mutable_data(ctx.GetPlace()); - + auto param_out = ctx.MultiOutput("ParamOut"); + auto velocity_out = ctx.MultiOutput("VelocityOut"); + auto param = ctx.MultiInput("Param"); + auto velocity = ctx.MultiInput("Velocity"); + auto learning_rate = ctx.MultiInput("LearningRate"); + auto grad = ctx.MultiInput("Grad"); + auto weight_decay_arr = ctx.Attr>("lars_weight_decay"); T mu = static_cast(ctx.Attr("mu")); T lars_coeff = ctx.Attr("lars_coeff"); - T lars_weight_decay = ctx.Attr("lars_weight_decay"); T epsilon = ctx.Attr("epsilon"); - auto p_out = framework::EigenVector::Flatten(*param_out); - auto v_out = framework::EigenVector::Flatten(*velocity_out); + int op_num = param.size(); + for (int i = 0; i < op_num; ++i) { + auto* lr = learning_rate[i]->data(); + T lars_weight_decay = weight_decay_arr[i]; + param_out[i]->mutable_data(ctx.GetPlace()); + velocity_out[i]->mutable_data(ctx.GetPlace()); - auto p = framework::EigenVector::Flatten(*param); - auto v = framework::EigenVector::Flatten(*velocity); - auto g = framework::EigenVector::Flatten(*grad); - auto* lr = learning_rate->data(); + auto p_out = framework::EigenVector::Flatten(*(param_out[i])); + auto v_out = framework::EigenVector::Flatten(*(velocity_out[i])); + auto p = framework::EigenVector::Flatten(*(param[i])); + auto v = framework::EigenVector::Flatten(*(velocity[i])); + auto g = framework::EigenVector::Flatten(*(grad[i])); - framework::Tensor p_norm_t, g_norm_t; - p_norm_t.Resize({1}); - g_norm_t.Resize({1}); - p_norm_t.mutable_data(ctx.GetPlace()); - g_norm_t.mutable_data(ctx.GetPlace()); - auto ep_norm = framework::EigenScalar::From(p_norm_t); - auto eg_norm = framework::EigenScalar::From(g_norm_t); + framework::Tensor p_norm_t, g_norm_t; + p_norm_t.Resize({1}); + g_norm_t.Resize({1}); + p_norm_t.mutable_data(ctx.GetPlace()); + g_norm_t.mutable_data(ctx.GetPlace()); + auto ep_norm = framework::EigenScalar::From(p_norm_t); + auto eg_norm = framework::EigenScalar::From(g_norm_t); + ep_norm = p.square().sum().sqrt(); + eg_norm = g.square().sum().sqrt(); - ep_norm = p.square().sum().sqrt(); - eg_norm = g.square().sum().sqrt(); - T local_lr = lr[0]; - if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { - local_lr = lr[0] * lars_coeff * ep_norm(0) / - (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); + T local_lr = lr[0]; + if (lars_weight_decay > 0 && ep_norm(0) > 0 && eg_norm(0) > 0) { + local_lr = lr[0] * lars_coeff * ep_norm(0) / + (eg_norm(0) + lars_weight_decay * ep_norm(0) + epsilon); + } + v_out = v * mu + local_lr * (g + lars_weight_decay * p); + p_out = p - v_out; } - v_out = v * mu + local_lr * (g + lars_weight_decay * p); - p_out = p - v_out; } }; diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cc b/paddle/fluid/operators/optimizers/merged_momentum_op.cc new file mode 100644 index 00000000000000..6c63376b5eb425 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
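For reference, the per-element update implemented by the CPU LARS kernel above (and by the CUDA kernels earlier in this patch) is the standard LARS rule; writing it out makes the roles of lars_coeff, lars_weight_decay and epsilon explicit. With weight decay $\lambda$, momentum $\mu$ and global learning rate $lr$:

$$ \text{local\_lr} = lr \cdot \frac{\text{lars\_coeff} \cdot \lVert p \rVert_2}{\lVert g \rVert_2 + \lambda \lVert p \rVert_2 + \epsilon}, \qquad v \leftarrow \mu\, v + \text{local\_lr}\,(g + \lambda p), \qquad p \leftarrow p - v $$

When $\lambda = 0$ or either norm is zero, the code falls back to local_lr = lr, exactly as guarded in the loop above.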
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" + +namespace paddle { +namespace operators { + +class MergedMomentumOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override {} + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto param_dtype = + framework::OperatorWithKernel::IndicateVarDataType(ctx, "Param"); + return framework::OpKernelType(param_dtype, ctx.GetPlace()); + } +}; + +class MergedMomentumOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Param", + "(Tensor, default Tensor) " + "Input parameter that has to be updated") + .AsDuplicable(); + AddInput("Grad", + "(Tensor, default Tensor) " + "Input gradient of the parameter") + .AsDuplicable(); + AddInput("Velocity", + "(Tensor, default Tensor) " + "Input velocity (corresponding to the parameter) " + "that has to be updated") + .AsDuplicable(); + AddInput("LearningRate", + "(Tensor, default Tensor) " + "Input learning rate"); + AddInput("MasterParam", "FP32 master weight for AMP.") + .AsDispensable() + .AsDuplicable(); + AddOutput("ParamOut", + "(Tensor) This output is updated parameter. " + "It shared memory with Input(Param).") + .AsDuplicable(); + AddOutput("VelocityOut", + "(Tensor) This output is updated velocity. " + "It shared memory with Input(Velocity).") + .AsDuplicable(); + AddOutput("MasterParamOut", + "The updated FP32 master weight for AMP. " + "It shared memory with Input(MasterParam).") + .AsDispensable() + .AsDuplicable(); + AddAttr("mu", "(float) Momentum coefficient"); + AddAttr("multi_precision", + "(bool, default false) " + "Whether to use multi-precision during weight updating.") + .SetDefault(false); + AddAttr( + "rescale_grad", + "(float, default 1.0) Multiply the gradient with `rescale_grad`" + "before updating. Often choose to be `1.0/batch_size`.") + .SetDefault(1.0f); + AddComment(R"DOC(Merged Momentum Optimizer.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(merged_momentum, ops::MergedMomentumOp, + ops::MergedMomentumOpMaker); + +REGISTER_OP_CPU_KERNEL( + merged_momentum, ops::MergedMomentumOpKernel, + ops::MergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.cu b/paddle/fluid/operators/optimizers/merged_momentum_op.cu new file mode 100644 index 00000000000000..7e4bbd9807938c --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
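The element-wise update this merged op applies to every (Param, Grad, Velocity) triple, as implemented by the functor in merged_momentum_op.h further below, is the plain (non-Nesterov) momentum rule with an optional gradient rescale; when multi_precision is enabled, the FP32 master weight is used as the accumulator and the low-precision Param is written back as a cast of it:

$$ v \leftarrow \mu\, v + \text{rescale\_grad}\cdot g, \qquad p \leftarrow p - lr \cdot v $$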
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/merged_momentum_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + merged_momentum, + ops::MergedMomentumOpKernel, + ops::MergedMomentumOpKernel, + ops::MergedMomentumOpKernel); diff --git a/paddle/fluid/operators/optimizers/merged_momentum_op.h b/paddle/fluid/operators/optimizers/merged_momentum_op.h new file mode 100644 index 00000000000000..4dfaa4de3ad447 --- /dev/null +++ b/paddle/fluid/operators/optimizers/merged_momentum_op.h @@ -0,0 +1,197 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct MergedMomentumMasterParams { + MT *PADDLE_RESTRICT master_params[kParamNum]; + + HOSTDEVICE MT *MasterParam(size_t idx) const { return master_params[idx]; } + HOSTDEVICE void SetMasterParam(size_t idx, MT *p) { master_params[idx] = p; } +}; + +template +struct MergedMomentumMasterParams { + HOSTDEVICE constexpr MT *MasterParam(size_t) const { return nullptr; } + HOSTDEVICE constexpr void SetMasterParam(size_t, MT *) {} +}; + +template +struct MergedMomentumKernelParam + : public MergedMomentumMasterParams { + static constexpr auto N = kParamNum; + size_t sizes[N]; + T *PADDLE_RESTRICT params[N]; + const T *PADDLE_RESTRICT grads[N]; + MT *PADDLE_RESTRICT velocitys[N]; + const MT *PADDLE_RESTRICT lr; + MT mu; + MT rescale_grad; + uint32_t param_num; + + HOSTDEVICE void operator()(size_t i) const { + const auto lr_val = *lr; + for (uint32_t idx = 0; idx < param_num; ++idx) { + auto size = sizes[idx]; + if (i >= size) continue; + + auto param_p = params[idx]; + auto grad_p = grads[idx]; + auto velocity_p = velocitys[idx]; + auto master_param_p = this->MasterParam(idx); + + const MT param = + master_param_p ? 
master_param_p[i] : static_cast(param_p[i]); + const MT grad = static_cast(grad_p[i]) * rescale_grad; + const MT velocity = velocity_p[i]; + const MT velocity_out = velocity * mu + grad; + const MT param_out = param - lr_val * velocity_out; + velocity_p[i] = velocity_out; + param_p[i] = static_cast(param_out); + if (master_param_p) { + master_param_p[i] = param_out; + } + } + } +}; + +template +class MergedMomentumOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto params = ctx.MultiInput("Param"); + auto params_out = ctx.MultiOutput("ParamOut"); + size_t n = params.size(); + PADDLE_ENFORCE_EQ( + n, params_out.size(), + platform::errors::InvalidArgument( + "Output(ParamOut) number must be equal to Input(Param) number.")); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ( + params[i], params_out[i], + platform::errors::InvalidArgument( + "Input(Param) and Output(ParamOut) must be the same Tensors.")); + } + + auto grads = ctx.MultiInput("Grad"); + PADDLE_ENFORCE_EQ( + n, grads.size(), + platform::errors::InvalidArgument( + "Input(Grad) number must be equal to Input(Param) number.")); + + auto velocitys = ctx.MultiInput("Velocity"); + PADDLE_ENFORCE_EQ(n, velocitys.size(), + platform::errors::InvalidArgument( + "Input(Velocity) number and Input(Param) number.")); + + auto velocitys_out = ctx.MultiOutput("VelocityOut"); + PADDLE_ENFORCE_EQ( + n, velocitys_out.size(), + platform::errors::InvalidArgument("Output(VelocityOut) number must be " + "equal to Input(Param) number.")); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(velocitys[i], velocitys_out[i], + platform::errors::InvalidArgument( + "Input(Velocity) and Output(VelocityOut) must be " + "the same Tensors.")); + } + + auto master_params = ctx.MultiInput("MasterParam"); + auto master_params_out = + ctx.MultiOutput("MasterParamOut"); + auto multi_precision = ctx.Attr("multi_precision"); + if (multi_precision) { + PADDLE_ENFORCE_EQ( + n, master_params.size(), + platform::errors::InvalidArgument("Input(MasterParam) number must be " + "equal to Input(Param) number.")); + PADDLE_ENFORCE_EQ(n, master_params_out.size(), + platform::errors::InvalidArgument( + "Output(MasterParamOut) number must be equal to " + "Input(MasterParam) number.")); + for (size_t i = 0; i < n; ++i) { + PADDLE_ENFORCE_EQ(master_params[i], master_params_out[i], + platform::errors::InvalidArgument( + "Input(MasterParam) and Output(MasterParamOut) " + "must be the same Tensors.")); + PADDLE_ENFORCE_NOT_NULL(master_params[i], + platform::errors::InvalidArgument( + "Input(MasterParam) must be provided when " + "multi_precision=True.")); + } + } else { + master_params.clear(); + master_params_out.clear(); + } + + auto lr = ctx.Input("LearningRate"); + auto mu = ctx.Attr("mu"); + auto rescale_grad = ctx.Attr("rescale_grad"); + using MPType = typename operators::details::MPTypeTrait::Type; + + auto &dev_ctx = ctx.template device_context(); + +#define PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(kMultiPrecision) \ + MergedMomentumKernelParam kernel_params; \ + constexpr auto kMaxMergedNum = decltype(kernel_params)::N; \ + size_t kernel_num = (n + kMaxMergedNum - 1) / kMaxMergedNum; \ + kernel_params.mu = static_cast(mu); \ + kernel_params.rescale_grad = static_cast(rescale_grad); \ + kernel_params.lr = lr->data(); \ + for (size_t i = 0; i < kernel_num; ++i) { \ + size_t start = i * kMaxMergedNum; \ + size_t end = std::min((i + 1) * kMaxMergedNum, n); \ + kernel_params.param_num = static_cast(end - 
start); \ + size_t max_size = 0; \ + for (size_t j = 0; j < kernel_params.param_num; ++j) { \ + auto size = static_cast(params_out[j + start]->numel()); \ + max_size = std::max(max_size, size); \ + kernel_params.sizes[j] = size; \ + kernel_params.params[j] = params_out[j + start]->data(); \ + kernel_params.grads[j] = grads[j + start]->data(); \ + kernel_params.velocitys[j] = velocitys_out[j + start]->data(); \ + kernel_params.SetMasterParam( \ + j, kMultiPrecision ? master_params_out[j + start]->data() \ + : nullptr); \ + } \ + platform::ForRange for_range(dev_ctx, max_size); \ + for_range(kernel_params); \ + VLOG(10) << "Launch MergedMomentum kernel " << i << " " \ + << kernel_params.param_num; \ + } + + if (multi_precision) { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(true); + } else { + PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL(false); + } + +#undef PADDLE_LAUNCH_MERGED_MOMENTUM_KERNEL + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/optimizers/momentum_op.h b/paddle/fluid/operators/optimizers/momentum_op.h index f461dec66c0e75..2d713308fd9389 100644 --- a/paddle/fluid/operators/optimizers/momentum_op.h +++ b/paddle/fluid/operators/optimizers/momentum_op.h @@ -173,14 +173,15 @@ class CPUDenseMomentumFunctor { } }; -template +template class DenseMomentumFunctor; // NOTE(dzh) for performance. // avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two // functor. -template -class DenseMomentumFunctor { +template +class DenseMomentumFunctor { private: const T* param_; const T* grad_; @@ -193,7 +194,6 @@ class DenseMomentumFunctor { T* param_out_; MT* velocity_out_; MT* master_param_out_; - const RegularizationType regularization_flag_; const MT regularization_coeff_; public: @@ -201,7 +201,6 @@ class DenseMomentumFunctor { const MultiPrecisionType* learning_rate, const MT* master_param, const MT mu, const MT rescale_grad, const int64_t num, - const RegularizationType regularization_flag, const MT regularization_coeff, T* param_out, MT* velocity_out, MT* master_param_out) : param_(param), @@ -215,7 +214,6 @@ class DenseMomentumFunctor { param_out_(param_out), velocity_out_(velocity_out), master_param_out_(master_param_out), - regularization_flag_(regularization_flag), regularization_coeff_(regularization_coeff) {} inline HOSTDEVICE void operator()(size_t i) const { // put memory access in register @@ -225,9 +223,9 @@ class DenseMomentumFunctor { const MT lr = static_cast(lr_[0]); const MT velocity = velocity_[i]; - grad = regularization_flag_ == RegularizationType::kL2DECAY - ? 
grad + regularization_coeff_ * param - : grad; + if (kRegType == RegularizationType::kL2DECAY) { + grad += regularization_coeff_ * param; + } MT velocity_out = velocity * mu_ + grad; MT param_out = param - (grad + velocity_out * mu_) * lr; @@ -240,8 +238,8 @@ class DenseMomentumFunctor { } }; -template -class DenseMomentumFunctor { +template +class DenseMomentumFunctor { private: const T* param_; const T* grad_; @@ -254,7 +252,6 @@ class DenseMomentumFunctor { T* param_out_; MT* velocity_out_; MT* master_param_out_; - const RegularizationType regularization_flag_; const MT regularization_coeff_; public: @@ -262,7 +259,6 @@ class DenseMomentumFunctor { const MultiPrecisionType* learning_rate, const MT* master_param, const MT mu, const MT rescale_grad, const int64_t num, - const RegularizationType regularization_flag, const MT regularization_coeff, T* param_out, MT* velocity_out, MT* master_param_out) : param_(param), @@ -276,7 +272,6 @@ class DenseMomentumFunctor { param_out_(param_out), velocity_out_(velocity_out), master_param_out_(master_param_out), - regularization_flag_(regularization_flag), regularization_coeff_(regularization_coeff) {} inline HOSTDEVICE void operator()(size_t i) const { // put memory access in register @@ -286,9 +281,9 @@ class DenseMomentumFunctor { const MT lr = static_cast(lr_[0]); const MT velocity = velocity_[i]; - grad = regularization_flag_ == RegularizationType::kL2DECAY - ? grad + regularization_coeff_ * param - : grad; + if (kRegType == RegularizationType::kL2DECAY) { + grad += regularization_coeff_ * param; + } MT velocity_out = velocity * mu_ + grad; MT param_out = param - lr * velocity_out; @@ -522,23 +517,31 @@ class MomentumOpKernel : public framework::OpKernel { platform::ForRange for_range( static_cast(ctx.device_context()), param->numel()); - if (use_nesterov) { - DenseMomentumFunctor functor( - param->data(), grad->data(), velocity->data(), - learning_rate->data(), master_in_data, mu, rescale_grad, - param->numel(), regularization_flag, regularization_coeff, - param_out->mutable_data(ctx.GetPlace()), - velocity_out->mutable_data(ctx.GetPlace()), master_out_data); - for_range(functor); +#define PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(__nesterov, __reg_type) \ + DenseMomentumFunctor functor( \ + param->data(), grad->data(), velocity->data(), \ + learning_rate->data(), master_in_data, mu, rescale_grad, \ + param->numel(), regularization_coeff, \ + param_out->mutable_data(ctx.GetPlace()), \ + velocity_out->mutable_data(ctx.GetPlace()), master_out_data); \ + for_range(functor); + if (use_nesterov) { + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(UseNesterov, + RegularizationType::kL2DECAY); + } else { + PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(UseNesterov, + RegularizationType::kNONE); + } } else { - DenseMomentumFunctor functor( - param->data(), grad->data(), velocity->data(), - learning_rate->data(), master_in_data, mu, rescale_grad, - param->numel(), regularization_flag, regularization_coeff, - param_out->mutable_data(ctx.GetPlace()), - velocity_out->mutable_data(ctx.GetPlace()), master_out_data); - for_range(functor); + if (regularization_flag == RegularizationType::kL2DECAY) { + PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(NoNesterov, + RegularizationType::kL2DECAY); + } else { + PADDLE_LAUNCH_DENSE_MOMENTUM_KERNEL(NoNesterov, + RegularizationType::kNONE); + } } } diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc 
b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc new file mode 100644 index 00000000000000..4d919c94f616b1 --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +class Pow2DecayWithLinearWarmupOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *ctx) const override { + auto dim = framework::make_ddim({1}); + ctx->SetOutputDim("LearningRateOut", dim); + ctx->SetOutputDim("StepOut", dim); + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + auto data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "LearningRate"); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class Pow2DecayWithLinearWarmupOpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("LearningRate", "(Tensor) The input learning rate Tensor."); + AddInput("Step", "(Tensor) The input global step Tensor."); + AddOutput("LearningRateOut", + "(Tensor) The output learning rate Tensor. Same with " + "Input(LearningRate)."); + AddOutput( + "StepOut", + "(Tensor) The output learning rate Tensor. Same with Input(Step)."); + AddAttr("warmup_steps", "(int64_t) The warmup steps."); + AddAttr( + "total_steps", + "(int64_t) The total steps for changing the learning rate."); + AddAttr("base_lr", + "(float) The final learning rate value after warmup."); + AddAttr("end_lr", + "(float) The final learning rate value after total_steps."); + AddComment(R"DOC( +The Pow2DecayWithLinearWarmup learning rate scheduler. 
+ +When step_num < warmup_steps, lr = base_lr * step_num / warmup_steps + +When warmup_steps <= step_num <= total_steps, + factor = 1 - (step_num - warmup_steps) / (total_steps - warmup_steps) + lr = (base_lr - end_lr) * factor * factor + end_lr + +When step_num > total_steps, lr = end_lr + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOp, + ops::Pow2DecayWithLinearWarmupOpMaker); +REGISTER_OP_CPU_KERNEL( + pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu new file mode 100644 index 00000000000000..6695778dbac063 --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.cu @@ -0,0 +1,24 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + pow2_decay_with_linear_warmup, + ops::Pow2DecayWithLinearWarmupOpKernel, + ops::Pow2DecayWithLinearWarmupOpKernel); diff --git a/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h new file mode 100644 index 00000000000000..74cf7627450773 --- /dev/null +++ b/paddle/fluid/operators/optimizers/pow2_decay_with_linear_warmup_op.h @@ -0,0 +1,115 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
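A quick numeric check of the schedule documented above (the values are chosen purely for illustration): with warmup_steps = 100, total_steps = 1000, base_lr = 0.1 and end_lr = 0.0, step 50 gives lr = 0.1 * 50/100 = 0.05; step 550 gives factor = 1 - 450/900 = 0.5 and lr = (0.1 - 0.0) * 0.5^2 + 0.0 = 0.025; any step past 1000 gives lr = 0.0. Below is a host-side mirror of the functor defined further down, for sanity-checking values only; it is not part of the patch.

#include <cstddef>

double Pow2WarmupLr(std::size_t step, std::size_t warmup_steps,
                    std::size_t total_steps, double base_lr, double end_lr) {
  if (step <= warmup_steps) {
    // Linear warmup towards base_lr.
    return static_cast<double>(step) / warmup_steps * base_lr;
  } else if (step < total_steps) {
    // Quadratic (pow2) decay from base_lr down to end_lr.
    double factor = 1.0 - static_cast<double>(step - warmup_steps) /
                              (total_steps - warmup_steps);
    return (base_lr - end_lr) * factor * factor + end_lr;
  }
  return end_lr;  // Constant end_lr after total_steps.
}
// Pow2WarmupLr(50, 100, 1000, 0.1, 0.0)   == 0.05
// Pow2WarmupLr(550, 100, 1000, 0.1, 0.0)  == 0.025
// Pow2WarmupLr(1200, 100, 1000, 0.1, 0.0) == 0.0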
+ +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/for_range.h" +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace operators { + +template +struct Pow2DecayWithLinearWarmupFunctor { + template + using RestrictPtr = U *PADDLE_RESTRICT; + + public: + HOSTDEVICE Pow2DecayWithLinearWarmupFunctor(RestrictPtr lr, + RestrictPtr step, + size_t warmup_steps, + size_t total_steps, AttrT base_lr, + AttrT end_lr) + : lr_(lr), + step_(step), + warmup_steps_(warmup_steps), + total_steps_(total_steps), + base_lr_(base_lr), + end_lr_(end_lr) {} + + HOSTDEVICE void operator()(size_t) const { + size_t step = static_cast(*step_) + 1; + *step_ = static_cast(step); + if (step <= warmup_steps_) { + auto new_lr = static_cast(step) / warmup_steps_ * base_lr_; + *lr_ = static_cast(new_lr); + } else if (step < total_steps_) { + auto factor = 1 - + static_cast(step - warmup_steps_) / + (total_steps_ - warmup_steps_); + auto new_lr = + static_cast(base_lr_ - end_lr_) * (factor * factor) + end_lr_; + *lr_ = static_cast(new_lr); + } else { + *lr_ = static_cast(end_lr_); + } + } + + private: + RestrictPtr lr_; + RestrictPtr step_; + size_t warmup_steps_; + size_t total_steps_; + AttrT base_lr_; + AttrT end_lr_; +}; + +template +class Pow2DecayWithLinearWarmupOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const { + const auto *lr = ctx.Input("LearningRate"); + const auto *step = ctx.Input("Step"); + auto *lr_out = ctx.Output("LearningRateOut"); + auto *step_out = ctx.Output("StepOut"); + PADDLE_ENFORCE_EQ( + lr, lr_out, platform::errors::InvalidArgument("Input(LearningRate) and " + "Output(LearningRateOut) " + "must be the same.")); + PADDLE_ENFORCE_NOT_NULL(lr, + platform::errors::InvalidArgument( + "Input(LearingRate) should not be nullptr.")); + PADDLE_ENFORCE_EQ(step, step_out, + platform::errors::InvalidArgument( + "Input(Step) and Output(StepOut) must be the same.")); + PADDLE_ENFORCE_NOT_NULL(step, platform::errors::InvalidArgument( + "Input(Step) should not be nullptr.")); + PADDLE_ENFORCE_EQ( + step->IsInitialized(), true, + platform::errors::InvalidArgument("Input(Step) must be initialized.")); + + auto warmup_steps = static_cast(ctx.Attr("warmup_steps")); + auto total_steps = static_cast(ctx.Attr("total_steps")); + PADDLE_ENFORCE_LE(warmup_steps, total_steps, + platform::errors::InvalidArgument( + "warmup_steps must not be larger than total_steps.")); + auto base_lr = ctx.Attr("base_lr"); + auto end_lr = ctx.Attr("end_lr"); + + auto *lr_data = lr_out->data(); + auto *step_data = step_out->data(); + auto &dev_ctx = ctx.template device_context(); + platform::ForRange for_range(dev_ctx, 1); + using AttrT = double; + Pow2DecayWithLinearWarmupFunctor functor( + lr_data, step_data, warmup_steps, total_steps, + static_cast(base_lr), static_cast(end_lr)); + for_range(functor); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/p_norm_op_npu.cc b/paddle/fluid/operators/p_norm_op_npu.cc index 3c5d1a36e9c273..ef2346204b9c0f 100644 --- a/paddle/fluid/operators/p_norm_op_npu.cc +++ b/paddle/fluid/operators/p_norm_op_npu.cc @@ -81,6 +81,122 @@ class PnormNPUKernel : public framework::OpKernel { } }; +template +class PnormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + using Tensor = framework::Tensor; + auto* x = ctx.Input("X"); + 
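The p_norm gradient kernel here is easier to follow against the closed form it implements. For $y = \lVert x \rVert_p = (\sum_i |x_i|^p)^{1/p}$ with finite non-zero $p$,

$$ \frac{\partial L}{\partial x_i} = \frac{\partial L}{\partial y}\cdot \operatorname{sign}(x_i)\left(\frac{|x_i|}{y}\right)^{p-1} $$

which is what the Abs/Sign/Power/DivNoNan sequence below computes; the two branches only differ in writing the exponent as p-1 (for p >= 1) or 1-p (for p < 1). For p = 0 the gradient is defined as zero, and for p = +/- infinity the incoming gradient, carrying the sign of x, is routed only to the entries whose absolute value attains the norm (the Equal/SelectV2 branch).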
auto* y = ctx.Input("Out"); + auto* dy = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + dx->mutable_data(place); + + auto xdim = x->dims(); + float porder = ctx.Attr("porder"); + bool keepdim = ctx.Attr("keepdim"); + + int axis = ctx.Attr("axis"); + axis = axis < 0 ? xdim.size() + axis : axis; + + auto stream = + ctx.template device_context() + .stream(); + + Tensor y_share(y->type()); + Tensor dy_share(dy->type()); + y_share.ShareDataWith(*y); + dy_share.ShareDataWith(*dy); + auto ydim = xdim; + if (!keepdim) { + ydim[axis] = 1; + } else { + ydim = y->dims(); + } + y_share.Resize(ydim); + dy_share.Resize(ydim); + + if (porder == 0) { + FillNpuTensorWithConstant(dx, static_cast(0)); + dx->Resize(xdim); + } else if (porder == INFINITY || porder == -INFINITY) { + Tensor x_abs; + x_abs.mutable_data(xdim, place); + const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); + r_abs.Run(stream); + + Tensor t_cond; + t_cond.mutable_data(xdim, place); + const auto& r_equal = + NpuOpRunner("Equal", {x_abs, y_share}, {t_cond}, {}); + r_equal.Run(stream); + + Tensor t_zero; + t_zero.mutable_data({1}, place); + FillNpuTensorWithConstant(&t_zero, static_cast(0)); + + Tensor x_sign; + x_sign.mutable_data(xdim, place); + const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); + r_sign.Run(stream); + + const auto& r_mul = NpuOpRunner("Mul", {x_sign, dy_share}, {*dx}, {}); + r_mul.Run(stream); + + const auto& r_sel = + NpuOpRunner("SelectV2", {t_cond, *dx, t_zero}, {*dx}, {}); + r_sel.Run(stream); + } else { + Tensor x_abs; + x_abs.mutable_data(xdim, place); + const auto& r_abs = NpuOpRunner("Abs", {*x}, {x_abs}, {}); + r_abs.Run(stream); + + Tensor x_sign; + x_sign.mutable_data(xdim, place); + const auto& r_sign = NpuOpRunner("Sign", {*x}, {x_sign}, {}); + r_sign.Run(stream); + + Tensor y_pow; + y_pow.mutable_data(ydim, place); + if (porder >= 1) { + const auto& r_pow1 = NpuOpRunner( + "Power", {x_abs}, {x_abs}, + {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); + r_pow1.Run(stream); + + const auto& r_pow2 = NpuOpRunner( + "Power", {y_share}, {y_pow}, + {{"power", (porder - 1)}, {"scale", 1.0f}, {"shift", 0.0f}}); + r_pow2.Run(stream); + + const auto& r_div = NpuOpRunner("DivNoNan", {x_abs, y_pow}, {*dx}, {}); + r_div.Run(stream); + } else { + const auto& r_pow1 = NpuOpRunner( + "Power", {x_abs}, {x_abs}, + {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); + r_pow1.Run(stream); + + const auto& r_pow2 = NpuOpRunner( + "Power", {y_share}, {y_pow}, + {{"power", (1 - porder)}, {"scale", 1.0f}, {"shift", 0.0f}}); + r_pow2.Run(stream); + + const auto& r_div = NpuOpRunner("DivNoNan", {y_pow, x_abs}, {*dx}, {}); + r_div.Run(stream); + } + + const auto& r_mul1 = NpuOpRunner("Mul", {*dx, x_sign}, {*dx}, {}); + r_mul1.Run(stream); + + const auto& r_mul2 = NpuOpRunner("Mul", {*dx, dy_share}, {*dx}, {}); + r_mul2.Run(stream); + } + } +}; } // namespace operators } // namespace paddle @@ -90,3 +206,7 @@ namespace plat = paddle::platform; REGISTER_OP_NPU_KERNEL( p_norm, ops::PnormNPUKernel, ops::PnormNPUKernel); + +REGISTER_OP_NPU_KERNEL( + p_norm_grad, ops::PnormGradNPUKernel, + ops::PnormGradNPUKernel); diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index c2be9ac97ff89b..e84b5a9d9baaeb 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -565,13 +565,11 @@ class Pad3dCPUKernel : public framework::OpKernel { " in reflect mode" ", 
but received depth(%d) and pad_right(%d).", in_width, pads[1])); - } - - if (mode == "circular") { - PADDLE_ENFORCE_NE( - in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular padding mode.")); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, + platform::errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); } const int pad_left = pads[0]; diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index ed936c10755f07..f243a78e5578bb 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -618,13 +618,11 @@ class Pad3dCUDAKernel : public framework::OpKernel { " in reflect mode" ", but received depth(%d) and pad_right(%d).", in_width, pads[1])); - } - - if (mode == "circular") { - PADDLE_ENFORCE_NE( - in_depth * in_height * in_width, 0, - platform::errors::InvalidArgument( - "The input tensor size can not be 0 for circular padding mode.")); + } else if (mode == "circular" || mode == "replicate") { + PADDLE_ENFORCE_NE(in_depth * in_height * in_width, 0, + platform::errors::InvalidArgument( + "The input tensor size can not be 0 for circular " + "or replicate padding mode.")); } const int pad_left = pads[0]; diff --git a/paddle/fluid/operators/pad3d_op_npu.cc b/paddle/fluid/operators/pad3d_op_npu.cc index 3a1fba94550032..483c895e0e65a8 100644 --- a/paddle/fluid/operators/pad3d_op_npu.cc +++ b/paddle/fluid/operators/pad3d_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc index d3faa2c8460f21..da637dfeb237dd 100644 --- a/paddle/fluid/operators/psroi_pool_op.cc +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -25,22 +25,26 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "Tensor, " + "(Tensor), " "the input of PSROIPoolOp. " "The format of input tensor is NCHW. Where N is the batch size, " "C is the number of input channels, " "H is the height of the input feature map, and " "W is the width. The data type can be float32 or float64"); AddInput("ROIs", - "LoDTensor, " + "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]. " "where (x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates. " "The roi batch index can be calculated from LoD."); + AddInput("RoisNum", + "(Tensor), " + "The number of RoIs in each image.") + .AsDispensable(); AddOutput("Out", - "Tensor, " + "(Tensor), " "the output of PSROIPoolOp is a 4-D Tensor with shape " "(num_rois, output_channels, pooled_h, pooled_w). 
" "The data type is the same as `x` "); @@ -65,8 +69,6 @@ class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "the pooled output width.") .SetDefault(1); AddComment(R"Doc( -**PSROIPool Operator,** `rois` **of this op should be a LoDTensor** - Position sensitive region of interest pooling (also known as PSROIPooling) is to perform position-sensitive average pooling on regions of interest specified by input, takes as input N position-sensitive score maps and a list of num_rois regions of interest. @@ -106,7 +108,14 @@ class PSROIPoolOp : public framework::OperatorWithKernel { platform::errors::InvalidArgument( "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " "given as [(x1, y1, x2, y2), ...]")); - + if (ctx->HasInput("RoisNum")) { + auto rois_num_dims = ctx->GetInputDim("RoisNum"); + PADDLE_ENFORCE_EQ(rois_num_dims.size(), 1, + platform::errors::InvalidArgument( + "The second dimension of RoisNum should " + "be 1, but received dimension is %d", + rois_num_dims.size())); + } int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); int output_channels = ctx->Attrs().Get("output_channels"); @@ -184,6 +193,7 @@ class PSROIPoolGradMaker : public framework::SingleGradOpMaker { op->SetType("psroi_pool_grad"); op->SetInput("X", this->Input("X")); op->SetInput("ROIs", this->Input("ROIs")); + op->SetInput("RoisNum", this->Input("RoisNum")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index 748b6036008f13..f69edfc1fcfec9 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -185,34 +185,67 @@ class GPUPSROIPoolOpKernel : public framework::OpKernel { int rois_num = rois->dims()[0]; if (rois_num == 0) return; - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ(rois_batch_size, batch_size, - platform::errors::InvalidArgument( - "The batch size of input(ROIs) and input(X) must be " - "the same but received batch size of input(ROIs) and " - "input(X) is %d and %d respectively.", - rois_batch_size, batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, - platform::errors::InvalidArgument( - "The number of rois from input(ROIs) and its LOD " - "must be the same. 
Received rois %d of input(ROIs) " - "but the number of rois %d from its LOD is %d", - rois_num, rois_num_with_lod)); - - // set rois batch id + int rois_batch_size; framework::Tensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); + std::vector rois_num_list(rois_batch_size); + memory::Copy(platform::CPUPlace(), rois_num_list.data(), + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + rois_num_data, sizeof(int) * rois_batch_size, 0); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_list[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of input(ROIs) and input(X) must be " + "the same but received batch size of input(ROIs) and " + "input(X) is %d and %d respectively.", + rois_batch_size, batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + platform::errors::InvalidArgument( + "The number of rois from input(ROIs) and its LOD " + "must be the same. 
Received rois %d of input(ROIs) " + "but the number of rois %d from its LOD is %d", + rois_num, rois_num_with_lod)); + + // set rois batch id + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - framework::Tensor rois_batch_id_list_gpu; framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), ctx.device_context(), &rois_batch_id_list_gpu); @@ -257,14 +290,30 @@ class GPUPSROIPoolGradOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(platform::CPUPlace()); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + std::vector rois_num_list(rois_batch_size); + memory::Copy(platform::CPUPlace(), rois_num_list.data(), + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()), + rois_num_t->data(), sizeof(int) * rois_batch_size, 0); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_list[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_list[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - framework::Tensor rois_batch_id_list_gpu; framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), ctx.device_context(), &rois_batch_id_list_gpu); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 4f4cb24844b8c2..4d7e9ce295fc86 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -40,6 +40,13 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { int width = in_dims[3]; int rois_num = rois->dims()[0]; + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + platform::errors::InvalidArgument( + "the channels of input " + "X should equal the product of " + "output_channels x pooled_height x pooled_width")); + auto in_stride = framework::stride(in_dims); auto out_stride = framework::stride(out->dims()); @@ -49,32 +56,52 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); - - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, batch_size, - platform::errors::InvalidArgument("the rois_batch_size and input(X) " - "batch_size should be the same.")); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, - platform::errors::InvalidArgument( - "the rois_num from input and lod must be the same")); - - PADDLE_ENFORCE_EQ(input_channels, - output_channels * pooled_height * pooled_width, - platform::errors::InvalidArgument( - "the channels of input " - "X should equal the product of " - "output_channels x pooled_height x pooled_width")); - - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + 
int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument( + "The batch size of rois and the batch size of images " + " must be the same. But received the batch size of rois is %d, " + "and the batch size of images is %d", + rois_batch_size, batch_size)); + int rois_num_count = 0; + for (int i = 0; i < rois_batch_size; ++i) { + rois_num_count += rois_num_data[i]; + } + PADDLE_ENFORCE_EQ( + rois_num_count, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and RoisNum must be the same")); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + platform::errors::InvalidArgument("the rois_batch_size and input(X) " + "batch_size should be the same.")); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num_with_lod, rois_num, + platform::errors::InvalidArgument( + "the rois_num from input and lod must be the same")); + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - T* output_data = out->mutable_data(ctx.GetPlace()); const T* input_rois = rois->data(); @@ -93,7 +120,6 @@ class CPUPSROIPoolOpKernel : public framework::OpKernel { static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; T roi_end_h = static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; - // Force too small rois to be 1 x 1 T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); @@ -172,15 +198,28 @@ class CPUPSROIPoolGradOpKernel : public framework::OpKernel { rois_batch_id_list.Resize({rois_num}); int* rois_batch_id_data = rois_batch_id_list.mutable_data(ctx.GetPlace()); - auto rois_lod = rois->lod().back(); - int rois_batch_size = rois_lod.size() - 1; - // calculate batch id index for each roi according to LoD - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - rois_batch_id_data[i] = n; + int rois_batch_size; + if (ctx.HasInput("RoisNum")) { + auto* rois_num_t = ctx.Input("RoisNum"); + rois_batch_size = rois_num_t->numel(); + auto* rois_num_data = rois_num_t->data(); + int start = 0; + for (int n = 0; n < rois_batch_size; ++n) { + for (int i = start; i < start + rois_num_data[n]; ++i) { + rois_batch_id_data[i] = n; + } + start += rois_num_data[n]; + } + } else { + auto rois_lod = rois->lod().back(); + rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } } } - const T* input_rois = rois->data(); const T* output_grad_data = output_grad->data(); T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/qr_op.cc b/paddle/fluid/operators/qr_op.cc new file mode 100644 index 00000000000000..f612bb9e31f930 --- /dev/null +++ b/paddle/fluid/operators/qr_op.cc @@ -0,0 +1,152 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/qr_op.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ddim.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { +using DDim = framework::DDim; + +class QrOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "qr"); + OP_INOUT_CHECK(ctx->HasOutput("Q"), "Output", "Q", "qr"); + OP_INOUT_CHECK(ctx->HasOutput("R"), "Output", "R", "qr"); + + auto x_dims = ctx->GetInputDim("X"); + int x_rank = x_dims.size(); + PADDLE_ENFORCE_GE(x_dims.size(), 2, + platform::errors::InvalidArgument( + "the rank of input must greater than 2")); + bool compute_q; + bool reduced_mode; + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + std::string mode = ctx->Attrs().Get("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + if (compute_q) { + int k = reduced_mode ? 
min_mn : m; + auto q_dims_vec = framework::vectorize(x_dims); + q_dims_vec[q_dims_vec.size() - 1] = k; + ctx->SetOutputDim("Q", framework::make_ddim(q_dims_vec)); + } else { + ctx->SetOutputDim("Q", framework::make_ddim({0})); + } + + int k = reduced_mode ? min_mn : m; + auto r_dims_vec = framework::vectorize(x_dims); + r_dims_vec[r_dims_vec.size() - 2] = k; + r_dims_vec[r_dims_vec.size() - 1] = n; + ctx->SetOutputDim("R", framework::make_ddim(r_dims_vec)); + + ctx->ShareLoD("X", /*->*/ "Q"); + ctx->ShareLoD("X", /*->*/ "R"); + } +}; + +class QrOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of qr op."); + AddOutput("Q", "(Tensor), The output Q tensor of qr op."); + AddOutput("R", "(Tensor), The output R tensor of qr op."); + AddAttr( + "mode", + "(string, default \"reduced\"). " + "If mode is \"reduced\", Qr op will return reduced Q and R matrices. " + "If mode is \"complete\", Qr op will return complete Q and R matrices. " + "If mode is \"r\", Qr op will only return reduced R matrix.") + .SetDefault("reduced"); + AddComment(R"DOC( +Qr Operator. + +This operator is used to perform QR operation for batched matrics $X$. +$$Q, R = qr(X)$$ + +)DOC"); + } +}; + +class QrGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Q")), "Input", + "Q@Grad", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("R")), "Input", + "R@Grad", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "QrGrad"); + OP_INOUT_CHECK(ctx->HasInput("R"), "Input", "R", "QrGrad"); + OP_INOUT_CHECK(ctx->HasOutput(framework::GradVarName("X")), "Output", + "X@Grad", "QrGrad"); + + auto x_dims = ctx->GetInputDim(("X")); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X"); + return framework::OpKernelType(dtype, ctx.GetPlace()); + } +}; + +template +class QrGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + void Apply(GradOpPtr retv) const override { + retv->SetType("qr_grad"); + retv->SetInput(framework::GradVarName("Q"), this->OutputGrad("Q")); + retv->SetInput(framework::GradVarName("R"), this->OutputGrad("R")); + retv->SetInput("Q", this->Output("Q")); + retv->SetInput("R", this->Output("R")); + retv->SetInput("X", this->Input("X")); + retv->SetAttrMap(this->Attrs()); + retv->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(qr, ops::QrOp, ops::QrOpMaker, + ops::QrGradMaker, + ops::QrGradMaker); + +REGISTER_OPERATOR(qr_grad, ops::QrGradOp); + +REGISTER_OP_CPU_KERNEL(qr, ops::QrCPUKernel, ops::QrCPUKernel); + +REGISTER_OP_CPU_KERNEL( + qr_grad, ops::QrGradKernel, + ops::QrGradKernel); diff --git a/paddle/fluid/operators/qr_op.cu b/paddle/fluid/operators/qr_op.cu new file mode 100644 index 00000000000000..992df172ace0c7 --- /dev/null +++ b/paddle/fluid/operators/qr_op.cu @@ -0,0 +1,309 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
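As a concrete reading of the InferShape logic in qr_op.cc above: for a single matrix X of shape 5 x 3 (so min(m, n) = 3), mode "reduced" yields Q of shape 5 x 3 and R of shape 3 x 3; mode "complete" yields Q of shape 5 x 5 and R of shape 5 x 3; and mode "r" skips Q (it is given the placeholder shape {0}) and returns only the reduced R. Any leading batch dimensions are carried through unchanged.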
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifndef PADDLE_WITH_HIP +// HIP not support cusolver + +#include +#include +#include +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/qr_op.h" +#include "paddle/fluid/platform/dynload/cusolver.h" + +// Reuse some helper functions from svd +#include "paddle/fluid/operators/svd_helper.h" + +namespace paddle { +namespace operators { + +template +class QrGPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool compute_q; + bool reduced_mode; + auto& dev_ctx = + context.template device_context(); + const Tensor& x = *context.Input("X"); + Tensor& q = *context.Output("Q"); + Tensor& r = *context.Output("R"); + const std::string mode = context.Attr("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + auto numel = x.numel(); + PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( + "The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int qr_stride = m * n; + int tau_stride = min_mn; + + if (compute_q) { + q.mutable_data>( + context.GetPlace(), + size_t(batch_size * m * k * sizeof(math::Real))); + } + r.mutable_data>( + context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + + auto dito = + math::DeviceIndependenceTensorOperations(context); + + // Note: allocate temporary tensors because of lacking in-place operatios. 
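An editorial note on the Transpose/TensorCopy round-trips that follow: cuSOLVER's geqrf/orgqr routines, like LAPACK, expect column-major storage, while these tensors are laid out row-major. Transposing first works because the row-major buffer of A^T is byte-for-byte the column-major buffer of A, and the results are transposed once more afterwards to restore row-major order. A tiny illustration with made-up values:

// Row-major A (2 x 3):   [[1, 2, 3], [4, 5, 6]]      buffer: 1 2 3 4 5 6
// Row-major A^T (3 x 2): [[1, 4], [2, 5], [3, 6]]    buffer: 1 4 2 5 3 6
// Column-major A (2 x 3) stores columns (1,4), (2,5), (3,6): 1 4 2 5 3 6
// -> identical to the row-major buffer of A^T, so no data movement beyond
//    the transpose is needed before calling the column-major routine.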
+ // Prepare qr + Tensor qr; + qr.mutable_data>( + context.GetPlace(), size_t(batch_size * m * n * sizeof(math::Real))); + // BatchedGeqrf performs computation in-place and 'qr' must be a copy of + // input + TensorCopy(x, context.GetPlace(), &qr); + + // Prepare tau + auto tau_dims_vec = framework::vectorize(x_dims); + tau_dims_vec.pop_back(); + tau_dims_vec[tau_dims_vec.size() - 1] = min_mn; + Tensor tau = dito.Fill(tau_dims_vec, 0); + + // Transpose 'qr' to conform the column-major order + auto tmp_qr = dito.Transpose(qr); + framework::TensorCopy(tmp_qr, qr.place(), &qr); + auto qr_data = qr.mutable_data(context.GetPlace()); + auto tau_data = tau.mutable_data(context.GetPlace()); + + BatchedGeqrf(dev_ctx, batch_size, m, n, qr_data, m, tau_data, qr_stride, + tau_stride); + + if (reduced_mode) { + auto trans_qr = dito.Transpose(qr); + auto sliced_qr = dito.Slice(trans_qr, {-2}, {0}, {min_mn}); + auto tmp_r = dito.TrilTriu(sliced_qr, 0, false); + // Transpose 'tmp_r' to retore the original row-major order + framework::TensorCopy(tmp_r, r.place(), &r); + } else { + auto trans_qr = dito.Transpose(qr); + auto tmp_r = dito.TrilTriu(trans_qr, 0, false); + // Transpose 'tmp_r' to retore the original row-major order + framework::TensorCopy(tmp_r, r.place(), &r); + } + + if (compute_q) { + // Perform QRGQR for Q using the result from GEQRF + // Transpose 'q' to retore the original row-major order + if (reduced_mode) { + BatchedOrgqr(dev_ctx, batch_size, m, min_mn, min_mn, qr_data, m, + tau_data, qr_stride, tau_stride); + auto trans_q = dito.Transpose(qr); + auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {min_mn}); + framework::TensorCopy(sliced_q, q.place(), &q); + } else { + if (m > n) { + auto new_qr_dims_vec = framework::vectorize(x_dims); + new_qr_dims_vec[new_qr_dims_vec.size() - 1] = m; + Tensor new_qr = dito.Fill(new_qr_dims_vec, 0); + auto new_qr_data = new_qr.mutable_data(context.GetPlace()); + auto new_qr_stride = m * m; + for (int i = 0; i < batch_size; ++i) { + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + (new_qr_data + i * new_qr_stride), + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + (qr_data + i * qr_stride), qr_stride * sizeof(math::Real), + dev_ctx.stream()); + } + BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, new_qr_data, m, + tau_data, new_qr_stride, tau_stride); + auto trans_q = dito.Transpose(new_qr); + framework::TensorCopy(trans_q, q.place(), &q); + } else { + BatchedOrgqr(dev_ctx, batch_size, m, m, min_mn, qr_data, m, tau_data, + qr_stride, tau_stride); + auto trans_q = dito.Transpose(qr); + auto sliced_q = dito.Slice(trans_q, {-1}, {0}, {m}); + framework::TensorCopy(sliced_q, q.place(), &q); + } + } + } + } + + void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, float* a, int lda, float* tau, int a_stride, + int tau_stride) const; + + void BatchedGeqrf(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, double* a, int lda, double* tau, int a_stride, + int tau_stride) const; + + void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, int k, float* a, int lda, float* tau, + int a_stride, int tau_stride) const; + + void BatchedOrgqr(const platform::CUDADeviceContext& dev_ctx, int batch_size, + int m, int n, int k, double* a, int lda, double* tau, + int a_stride, int tau_stride) const; +}; + +template <> +void QrGPUKernel::BatchedGeqrf( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + float* a, int 
lda, float* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf_bufferSize( + handle, m, n, a, lda, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + float* a_working_ptr = &a[i * a_stride]; + float* tau_working_ptr = &tau[i * tau_stride]; + // compute geqrf + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSgeqrf( + handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, + info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver geqrf is not zero. [%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedGeqrf( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + double* a, int lda, double* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf_bufferSize( + handle, m, n, a, lda, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + double* a_working_ptr = &a[i * a_stride]; + double* tau_working_ptr = &tau[i * tau_stride]; + // compute geqrf + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDgeqrf( + handle, m, n, a_working_ptr, lda, tau_working_ptr, workspace_ptr, lwork, + info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver geqrf is not zero. [%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedOrgqr( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + int k, float* a, int lda, float* tau, int a_stride, int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr_bufferSize( + handle, m, n, k, a, lda, tau, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float)); + float* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + float* a_working_ptr = &a[i * a_stride]; + float* tau_working_ptr = &tau[i * tau_stride]; + // compute orggr + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnSorgqr( + handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, + lwork, info_d)); + // Do we need synchronized here? 
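// The devInfo value produced by cusolverDnSorgqr is copied back to the host below;
// any non-zero value means this batch entry failed, and the enforce aborts with the
// offending batch index.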
+ // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver QR is not zero. [%d]", i, info_h)); + } +} + +template <> +void QrGPUKernel::BatchedOrgqr( + const platform::CUDADeviceContext& dev_ctx, int batch_size, int m, int n, + int k, double* a, int lda, double* tau, int a_stride, + int tau_stride) const { + int lwork = 0; + + auto handle = dev_ctx.cusolver_dn_handle(); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr_bufferSize( + handle, m, n, k, a, lda, tau, &lwork)); + auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double)); + double* workspace_ptr = reinterpret_cast(workspace->ptr()); + auto info = memory::Alloc(dev_ctx, sizeof(int)); + int* info_d = reinterpret_cast(info->ptr()); + + for (int i = 0; i < batch_size; ++i) { + double* a_working_ptr = &a[i * a_stride]; + double* tau_working_ptr = &tau[i * tau_stride]; + // compute orggr + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cusolverDnDorgqr( + handle, m, n, k, a_working_ptr, lda, tau_working_ptr, workspace_ptr, + lwork, info_d)); + // Do we need synchronized here? + // check the error info + int info_h; + memory::Copy(platform::CPUPlace(), &info_h, + BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()), + info_d, sizeof(int), dev_ctx.stream()); + PADDLE_ENFORCE_EQ( + info_h, 0, + platform::errors::PreconditionNotMet( + "For batch [%d]: CUSolver QR is not zero. [%d]", i, info_h)); + } +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(qr, ops::QrGPUKernel, ops::QrGPUKernel); +REGISTER_OP_CUDA_KERNEL( + qr_grad, ops::QrGradKernel, + ops::QrGradKernel); + +#endif // not PADDLE_WITH_HIP diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h new file mode 100644 index 00000000000000..73ba52f590c0d7 --- /dev/null +++ b/paddle/fluid/operators/qr_op.h @@ -0,0 +1,135 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
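As a quick reference for the Eigen calls that the CPU kernel in qr_op.h relies on, here is a minimal standalone sketch for a single matrix. It is illustrative only, not part of the patch: it assumes Eigen is available on the include path, and the variable names and the reconstruction check at the end are mine.

#include <algorithm>
#include <iostream>
#include <Eigen/Dense>

int main() {
  // One 5x3 input; the CPU kernel performs the same steps per batch entry.
  Eigen::MatrixXd x = Eigen::MatrixXd::Random(5, 3);
  Eigen::HouseholderQR<Eigen::MatrixXd> qr(x);

  const int m = x.rows(), n = x.cols(), k = std::min(m, n);
  // R is stored in the upper triangle of matrixQR(); keep its first k rows (reduced mode).
  Eigen::MatrixXd r = Eigen::MatrixXd(
      qr.matrixQR().topRows(k).triangularView<Eigen::Upper>());
  // Thin Q: apply the stored Householder reflectors to the first k columns of I.
  Eigen::MatrixXd q = qr.householderQ() * Eigen::MatrixXd::Identity(m, k);

  std::cout << "||Q*R - X|| = " << (q * r - x).norm() << std::endl;  // close to machine epsilon
  return 0;
}

The GPU kernel above reaches the same result with cuSolver's geqrf/orgqr pair instead of Eigen.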
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/math/complex_functors.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; +using DDim = framework::DDim; + +static inline std::tuple _parse_qr_mode(std::string mode) { + bool compute_q; + bool reduced; + if (mode == "reduced") { + compute_q = true; + reduced = true; + } else if (mode == "complete") { + compute_q = true; + reduced = false; + } else if (mode == "r") { + compute_q = false; + reduced = true; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "QR received unrecognized mode '%s'" + " but expected one of 'reduced' (default), 'r', or 'complete'", + mode)); + } + return std::make_tuple(compute_q, reduced); +} + +template +class QrCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + bool compute_q; + bool reduced_mode; + const Tensor& x = *context.Input("X"); + Tensor& q = *context.Output("Q"); + Tensor& r = *context.Output("R"); + std::string mode = context.Attr("mode"); + std::tie(compute_q, reduced_mode) = _parse_qr_mode(mode); + + auto numel = x.numel(); + PADDLE_ENFORCE_GT(numel, 0, platform::errors::PreconditionNotMet( + "The input of QR is empty.")); + auto x_dims = x.dims(); + int x_rank = x_dims.size(); + int m = x_dims[x_rank - 2]; + int n = x_dims[x_rank - 1]; + int min_mn = std::min(m, n); + int k = reduced_mode ? min_mn : m; + int batch_size = numel / (m * n); + int x_stride = m * n; + int q_stride = m * k; + int r_stride = k * n; + + auto* x_data = x.data>(); + T* q_data = nullptr; + if (compute_q) { + q_data = q.mutable_data>( + context.GetPlace(), + size_t(batch_size * m * k * sizeof(math::Real))); + } + auto* r_data = r.mutable_data>( + context.GetPlace(), size_t(batch_size * k * n * sizeof(math::Real))); + + // Implement QR by calling Eigen + for (int i = 0; i < batch_size; ++i) { + const T* x_matrix_ptr = x_data + i * x_stride; + T* r_matrix_ptr = r_data + i * r_stride; + using EigenDynamicMatrix = + Eigen::Matrix; + auto x_matrix = Eigen::Map(x_matrix_ptr, m, n); + Eigen::HouseholderQR qr(x_matrix); + if (reduced_mode) { + auto qr_top_matrix = qr.matrixQR().block(0, 0, min_mn, n); + auto r_matrix_view = + qr_top_matrix.template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } else { + auto r_matrix_view = + qr.matrixQR().template triangularView(); + auto r_matrix = EigenDynamicMatrix(r_matrix_view); + memcpy(r_matrix_ptr, r_matrix.data(), r_matrix.size() * sizeof(T)); + } + + if (compute_q) { + T* q_matrix_ptr = q_data + i * q_stride; + if (reduced_mode) { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, min_mn); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } else { + auto q_matrix = + qr.householderQ() * EigenDynamicMatrix::Identity(m, m); + q_matrix.transposeInPlace(); + memcpy(q_matrix_ptr, q_matrix.data(), q_matrix.size() * sizeof(T)); + } + } + } + } +}; + +template +class QrGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + PADDLE_THROW(platform::errors::InvalidArgument( + "QR doesn't have the backward kernel now and will be supported soon.")); + } +}; + +} // namespace operators +} // 
namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc index b343fc88d7b8d3..5efc7e9b869b7d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h" diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc index b82ecbbe2fcdcc..d6c1dc5f02d422 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op_xpu.cc @@ -23,30 +23,103 @@ namespace paddle { namespace operators { template class ReduceMeanXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& context) const override { PADDLE_ENFORCE_EQ( platform::is_xpu_place(context.GetPlace()), true, platform::errors::Unavailable("This kernel only runs on XPU.")); - // bool reduce_all = context.Attr("reduce_all"); + bool reduce_all = context.Attr("reduce_all"); auto* input = context.Input("X"); auto* output = context.Output("Out"); output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - int ndim = input->dims().size(); - std::vector idims; + + std::vector xdims; for (int i = 0; i < input->dims().size(); i++) { - idims.push_back(input->dims()[i]); + xdims.push_back(input->dims()[i]); } - auto dims = context.Attr>("dim"); - int rdim = dims.size(); - int r = - xpu::reduce(dev_ctx.x_context(), input->data(), output->data(), - idims.data(), ndim, dims.data(), rdim, xpu::REDUCE_MEAN); - PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, - platform::errors::External("XPU kernel error!")); + auto rdims = context.Attr>("dim"); + if (reduce_all) { + rdims.clear(); + for (size_t i = 0; i < xdims.size(); i++) { + rdims.push_back(static_cast(i)); + } + } + int r = xpu::reduce_mean( + dev_ctx.x_context(), reinterpret_cast(input->data()), + reinterpret_cast(output->data()), xdims, rdims); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU reduce_mean kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); } }; + +template +class ReduceMeanGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + XPUType* x_data = + reinterpret_cast(input_grad->mutable_data(ctx.GetPlace())); + const XPUType* dy_data = + reinterpret_cast(output_grad->data()); + + bool reduce_all = ctx.Attr("reduce_all"); + auto reduce_dims = ctx.Attr>("dim"); + + std::vector xdims; + for (int i = 0; i < input->dims().size(); i++) { + xdims.push_back(input->dims()[i]); + } + std::vector ydims; + for (int i = 0; i < output_grad->dims().size(); i++) { + ydims.push_back(output_grad->dims()[i]); + } + + int 
reduce_numel = 1; + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < xdims.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + for (auto& d : reduce_dims) { + if (d < 0) { + d = d + xdims.size(); + } + reduce_numel *= xdims[d]; + } + + float val = 1.0f / static_cast(reduce_numel); + + auto& dev_ctx = ctx.template device_context(); + + int r = xpu::constant(dev_ctx.x_context(), x_data, input->numel(), + static_cast(val)); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU constant kernel return wrong value[%d %s]", r, + XPUAPIErrorMsg[r])); + r = xpu::broadcast_mul(dev_ctx.x_context(), x_data, dy_data, x_data, xdims, + ydims); + + PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, + platform::errors::External( + "XPU broadcast_mul kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); + } +}; + } // namespace operators } // namespace paddle @@ -54,4 +127,8 @@ REGISTER_OP_XPU_KERNEL( reduce_mean, ops::ReduceMeanXPUKernel); +REGISTER_OP_XPU_KERNEL( + reduce_mean_grad, + ops::ReduceMeanGradXPUKernel); + #endif diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index 4760270caa3c6d..bf451272a47b0a 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -34,6 +34,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/cast_op.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/fast_divmod.h" @@ -528,6 +529,31 @@ __device__ void HigherDimDealSegment(const Tx* x, Ty* y, ReduceOp reducer, kps::WriteData(y + store_offset, &temp_data, size); } +template +__device__ void ReduceAnyKernelImpl(const Tx* input, MPType* reduce_var, + ReduceOp reducer, TransformOp transformer, + MPType init, int reduce_num, int input_idx, + bool reduce_last_dim, + const Calculator& reduce_index_calculator, + int stride, int num) { + Tx input_reg[REDUCE_VEC_SIZE]; + MPType input_compute[REDUCE_VEC_SIZE]; + MPType input_transform[REDUCE_VEC_SIZE]; + + kps::Init(&input_compute[0], init); + kps::ReadDataReduce( + &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num, + 1, stride, reduce_last_dim); + kps::ElementwiseUnary( + &input_transform[0], &input_reg[0], transformer); + kps::Init(input_compute, input_transform, + num); + kps::Reduce( + reduce_var, &input_compute[0], reducer, reduce_last_dim); +} + // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // function will be used @@ -569,37 +595,17 @@ __global__ void ReduceAnyKernel(const Tx* x, Ty* y, ReduceOp reducer, // 1. 
reduce for each thread if (left_idx < left_num) { // load REDUCE_VEC_SIZE data once, and then compute - Tx input_reg[REDUCE_VEC_SIZE]; - MPType input_compute[REDUCE_VEC_SIZE]; int bound = reduce_num - (REDUCE_VEC_SIZE - 1) * stride; for (; input_idx + block_size < bound; input_idx += REDUCE_VEC_SIZE * stride) { - kps::ReadDataReduce( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, - reduce_num, 1, stride, reduce_last_dim); - kps::ElementwiseUnary( - &input_compute[0], &input_reg[0], transformer); - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); - } - - kps::Init(&input_compute[0], init); - kps::ReadDataReduce( - &input_reg[0], input, input_idx, reduce_index_calculator, 1, reduce_num, - 1, stride, reduce_last_dim); - input_idx += tid; -#pragma unroll - for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { - if (input_idx >= reduce_num) { - break; - } - input_compute[i] = static_cast(transformer(input_reg[i])); - input_idx += stride; + ReduceAnyKernelImpl( + input, &reduce_var, reducer, transformer, init, reduce_num, input_idx, + reduce_last_dim, reduce_index_calculator, stride, reduce_num); } - kps::Reduce( - &reduce_var, &input_compute[0], reducer, reduce_last_dim); + int num = (reduce_num - input_idx - tid + stride - 1) / stride; + ReduceAnyKernelImpl( + input, &reduce_var, reducer, transformer, init, reduce_num - input_idx, + input_idx, reduce_last_dim, reduce_index_calculator, stride, num); } kps::Reduce( @@ -705,8 +711,16 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, if (config.reduce_num == 1) { auto out_dims = y->dims(); - framework::TensorCopy(x, y->place(), y); - y->Resize(out_dims); + if (x.type() == y->type()) { + framework::TensorCopy(x, y->place(), y); + y->Resize(out_dims); + } else { + auto* dev_ctx = static_cast( + paddle::platform::DeviceContextPool::Instance().Get(x.place())); + framework::VisitDataType( + static_cast(y->type()), + CastOpFunctor(&x, y, *dev_ctx)); + } return; } diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc index 834b63f199e37d..b5f571c7fea2ca 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ #include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 51ff8f189b1513..6f244b1a4cb8fe 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -229,7 +229,7 @@ class ReshapeOp : public framework::OperatorWithKernel { // by now we require that if the input tensor is zero shape, the target // shape of output must be zero if (in_size == 0) { - PADDLE_ENFORCE_EQ( + PADDLE_ENFORCE_LE( capacity, in_size, platform::errors::InvalidArgument( "The 'shape' in ReshapeOp is invalid. 
" @@ -248,13 +248,13 @@ class ReshapeOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } @@ -366,13 +366,13 @@ class ReshapeGradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -557,13 +557,13 @@ class Reshape2GradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index c1ba046ca6af1a..c26db2500fd661 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -90,6 +90,94 @@ class ROIAlignNPUKernel : public framework::OpKernel { } }; +template +class ROIAlignNPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out_grad = + ctx.Input(framework::GradVarName("Out")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto sample_num = ctx.Attr("sampling_ratio"); + auto in_dims = in->dims(); + auto aligned = ctx.Attr("aligned"); + + int rois_num = rois->dims()[0]; + + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + if (!in_grad) { + return; + } + in_grad->mutable_data(place); + + PADDLE_ENFORCE_EQ( + aligned, false, + platform::errors::InvalidArgument( + "ROIAlignGradNPU only support Aligned attribute equaled to False")); + PADDLE_ENFORCE_EQ( + ctx.HasInput("RoisNum"), true, + 
platform::errors::NotFound("Input(RoisNum) of ROIAlignGradOp " + "is not found while using NPU.")); + PADDLE_ENFORCE_EQ( + rois->type(), framework::proto::VarType::FP32, + platform::errors::InvalidArgument( + "ROIAlignGradNPU only support ROIs type equaled to FP32.")); + + // Cast RoisNum to fp32 tensor + auto* RoisNum = ctx.Input("RoisNum"); + Tensor ROIs_N5; + ROIs_N5.mutable_data({rois_num, 5}, place); + Tensor ROIsNum_fp; + ROIsNum_fp.mutable_data(RoisNum->dims(), place); // shape = [rois_num] + int nputype_fp32 = + static_cast(ConvertToNpuDtype(framework::proto::VarType::FP32)); + const auto& runner_cast = NpuOpRunner("Cast", {*RoisNum}, {ROIsNum_fp}, + {{"dst_type", nputype_fp32}}); + runner_cast.Run(stream); + ROIsNum_fp.Resize({rois_num, 1}); + + // Combine *ROIsNum with ROIs to get new ROIs + std::vector x_list; + x_list.push_back(ROIsNum_fp); + x_list.push_back(*rois); + const auto& runner_concat = NpuOpRunner("ConcatD", {x_list}, {ROIs_N5}, + {{"N", 2}, {"concat_dim", 1}}); + runner_concat.Run(stream); + + // By analysis, in order to match cpu grad version, + // rois[:,3:5] should substrate 1 before call ascend grad function + std::vector vec_dlt = {0, 0, 0, -1.0f, -1.0f}; + Tensor tsr_dlt; + tsr_dlt.mutable_data({5}, place); + framework::TensorFromVector(vec_dlt, ctx.device_context(), &tsr_dlt); + ctx.template device_context().Wait(); + const auto& runner_add = + NpuOpRunner("AddV2", {ROIs_N5, tsr_dlt}, {ROIs_N5}, {}); + runner_add.Run(stream); + + // Call ascend RoiAlignGrad function + int roi_end_mode = 0; + const auto& runner_roi_align_grad = + NpuOpRunner("ROIAlignGrad", {*out_grad, ROIs_N5}, {*in_grad}, + {{"xdiff_shape", framework::vectorize(in_dims)}, + {"pooled_width", pooled_width}, + {"pooled_height", pooled_height}, + {"spatial_scale", spatial_scale}, + {"sample_num", sample_num}, + {"roi_end_mode", roi_end_mode}}); + runner_roi_align_grad.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -99,3 +187,7 @@ REGISTER_OP_NPU_KERNEL( ops::ROIAlignNPUKernel, ops::ROIAlignNPUKernel, ops::ROIAlignNPUKernel); + +REGISTER_OP_NPU_KERNEL(roi_align_grad, ops::ROIAlignNPUGradKernel, + ops::ROIAlignNPUGradKernel, + ops::ROIAlignNPUGradKernel); diff --git a/paddle/fluid/operators/roll_op.cc b/paddle/fluid/operators/roll_op.cc index b6a8111592fb78..f82510556fde87 100644 --- a/paddle/fluid/operators/roll_op.cc +++ b/paddle/fluid/operators/roll_op.cc @@ -40,21 +40,23 @@ class RollOp : public framework::OperatorWithKernel { auto dims = ctx->Attrs().Get>("axis"); auto shifts = ctx->Attrs().Get>("shifts"); - if (dims.size() != 0) { - PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), - platform::errors::InvalidArgument( - "When dims.size() != 0, dims.size() " - "should be equal to " - "shifts.size(). But received " - "dims.size() = %d, shifts.size() = %d", - dims.size(), shifts.size())); - } else { - PADDLE_ENFORCE_EQ(shifts.size(), 1, - platform::errors::InvalidArgument( - "When dims.size() == 0, shifts.size() " - "should be equal to 1, But received " - "shifts.size() = %d", - shifts.size())); + if (!ctx->HasInput("ShiftsTensor")) { + if (dims.size() != 0) { + PADDLE_ENFORCE_EQ(dims.size(), shifts.size(), + platform::errors::InvalidArgument( + "When dims.size() != 0, dims.size() " + "should be equal to " + "shifts.size(). 
But received " + "dims.size() = %d, shifts.size() = %d", + dims.size(), shifts.size())); + } else { + PADDLE_ENFORCE_EQ(shifts.size(), 1, + platform::errors::InvalidArgument( + "When dims.size() == 0, shifts.size() " + "should be equal to 1, But received " + "shifts.size() = %d", + shifts.size())); + } } ctx->SetOutputDim("Out", ctx->GetInputDim("X")); @@ -105,6 +107,10 @@ class RollOpMaker : public framework::OpProtoAndCheckerMaker { "The number of places by which the elements " "of the tensor are shifted.") .SetDefault({}); + AddInput("ShiftsTensor", + "The number of places by which the elements of the tensor " + "are shifted.") + .AsDispensable(); AddAttr>( "axis", "Axis along which to roll. It must have the same size " @@ -129,6 +135,9 @@ class RollGradMaker : public framework::SingleGradOpMaker { void Apply(GradOpPtr op) const override { op->SetType("roll_grad"); op->SetInput("X", this->Input("X")); + if (this->HasInput("ShiftsTensor")) { + op->SetInput("ShiftsTensor", this->Input("ShiftsTensor")); + } op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); op->SetAttrMap(this->Attrs()); @@ -174,7 +183,12 @@ REGISTER_OP_VERSION(roll) "(std::vector) Axis along which to roll. " "It must have the same size with shifts, or size = 0.", std::vector()) - .DeleteAttr( - "dims", - "(std::vector) Dims along which to roll. " - "It must have the same size with shifts, or size = 0.")); + .DeleteAttr("dims", + "(std::vector) Dims along which to roll. " + "It must have the same size with shifts, or size = 0.")) + .AddCheckpoint( + R"ROC(Upgrade roll add a dispensable input "ShiftsTensor".)ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "ShiftsTensor", + "The number of places by which the elements of" + "the tensor are shifted.")); diff --git a/paddle/fluid/operators/roll_op.cu b/paddle/fluid/operators/roll_op.cu index a170ce2fb111de..d70bd58887f846 100644 --- a/paddle/fluid/operators/roll_op.cu +++ b/paddle/fluid/operators/roll_op.cu @@ -59,6 +59,16 @@ class RollKernel auto* in = context.Input("X"); auto* out = context.Output("Out"); std::vector shifts = context.Attr>("shifts"); + if (context.HasInput("ShiftsTensor")) { + const auto* shifts_tensor = + context.Input("ShiftsTensor"); + PADDLE_ENFORCE_EQ( + shifts_tensor->dims().size(), 1, + platform::errors::InvalidArgument( + "The rank of ShiftsTensor is expected to be 1, got %s", + shifts_tensor->dims().size())); + shifts = GetDataFromTensor(shifts_tensor); + } std::vector dims = context.Attr>("axis"); auto* in_data = in->data(); @@ -134,6 +144,16 @@ class RollGradKernel auto* in = context.Input(framework::GradVarName("Out")); auto* out = context.Output(framework::GradVarName("X")); std::vector shifts = context.Attr>("shifts"); + if (context.HasInput("ShiftsTensor")) { + const auto* shifts_tensor = + context.Input("ShiftsTensor"); + PADDLE_ENFORCE_EQ( + shifts_tensor->dims().size(), 1, + platform::errors::InvalidArgument( + "The rank of ShiftsTensor is expected to be 1, got %s", + shifts_tensor->dims().size())); + shifts = GetDataFromTensor(shifts_tensor); + } std::vector dims = context.Attr>("axis"); auto* in_data = in->data(); diff --git a/paddle/fluid/operators/roll_op.h b/paddle/fluid/operators/roll_op.h index e58ff521d8df77..affb5f226ed555 100644 --- a/paddle/fluid/operators/roll_op.h +++ b/paddle/fluid/operators/roll_op.h @@ -16,6 +16,8 @@ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/utils.h" 
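// The two headers added here support the new ShiftsTensor handling below:
// GetDataFromTensor (utils.h) reads the shift values out of the optional 1-D
// "ShiftsTensor" input at run time, and PADDLE_ENFORCE_EQ (enforce.h) validates its
// rank. When that input is provided it takes precedence over the "shifts" attribute,
// matching the CUDA kernels above.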
+#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { @@ -85,6 +87,16 @@ class RollKernel : public framework::OpKernel { auto& input = input_var->Get(); auto* output = output_var->GetMutable(); std::vector shifts = context.Attr>("shifts"); + if (context.HasInput("ShiftsTensor")) { + const auto* shifts_tensor = + context.Input("ShiftsTensor"); + PADDLE_ENFORCE_EQ( + shifts_tensor->dims().size(), 1, + platform::errors::InvalidArgument( + "The rank of ShiftsTensor is expected to be 1, got %s", + shifts_tensor->dims().size())); + shifts = GetDataFromTensor(shifts_tensor); + } std::vector dims = context.Attr>("axis"); std::vector out_vec; @@ -123,6 +135,11 @@ class RollGradKernel : public framework::OpKernel { auto& input = input_var->Get(); auto* output = output_var->GetMutable(); std::vector shifts = context.Attr>("shifts"); + if (context.HasInput("ShiftsTensor")) { + const auto* shifts_tensor = + context.Input("ShiftsTensor"); + shifts = GetDataFromTensor(shifts_tensor); + } std::vector dims = context.Attr>("axis"); std::vector out_vec; diff --git a/paddle/fluid/operators/run_program_op.h b/paddle/fluid/operators/run_program_op.h index ac352876e7871d..04e4dc62b039b1 100644 --- a/paddle/fluid/operators/run_program_op.h +++ b/paddle/fluid/operators/run_program_op.h @@ -142,10 +142,15 @@ static void ShareVarsIntoScope(const std::vector &vars, static void ShareVarsFromScope(const std::vector &vars, const std::vector &var_names, + const BlockDesc &global_block, framework::Scope *scope) { for (size_t i = 0; i < vars.size(); ++i) { + // NOTE: In case of setting out_tmp.stop_gradient = True in model code, all + // parameters before generating out_tmp have no @GRAD, it will raise error + // because we can't findthem in scope. So we skip sharing these vars or + // var@GRAD if they don't appear in global block. if (var_names[i] == framework::kEmptyVarName || - var_names[i] == "Fake_var") { + var_names[i] == "Fake_var" || !global_block.HasVar(var_names[i])) { VLOG(2) << "find variable name is " << var_names[i] << ", skip it!"; continue; } @@ -214,8 +219,10 @@ class RunProgramOpKernel : public framework::OpKernel { details::ShareVarsIntoScope(input_vars, input_var_names, &scope); details::ShareVarsIntoScope(param_vars, param_names, &scope); + auto *global_block = ctx.Attr("global_block"); + if (end_op_index > start_op_index) { - auto *program = ctx.Attr("global_block")->Program(); + auto *program = global_block->Program(); auto cache_info = framework::GetExecutorInfoFromCache( *program, ctx.GetPlace(), start_op_index, end_op_index, /*is_grad=*/false, program_id, &scope); @@ -240,8 +247,10 @@ class RunProgramOpKernel : public framework::OpKernel { parallel_executor->RunWithoutFetch(skip_eager_delete_vars); } // Step 4. Get Output - details::ShareVarsFromScope(output_vars, output_var_names, &scope); - details::ShareVarsFromScope(dout_vars, dout_var_names, &scope); + details::ShareVarsFromScope(output_vars, output_var_names, *global_block, + &scope); + details::ShareVarsFromScope(dout_vars, dout_var_names, *global_block, + &scope); // Debug info: scope info when run end VLOG(3) << framework::GenScopeTreeDebugInfo(out_scope_vec->front()); @@ -307,10 +316,11 @@ class RunProgramGradOpKernel : public framework::OpKernel { "least one sub scope.")); auto &scope = *(global_inner_scope->kids().front()); + auto *global_block = ctx.Attr("global_block"); if (end_op_index > start_op_index) { // Step 2. 
prepare executor and scope - auto *program = ctx.Attr("global_block")->Program(); + auto *program = global_block->Program(); auto cache_info = framework::GetExecutorInfoFromCache( *program, ctx.GetPlace(), start_op_index, end_op_index, /*is_grad*/ true, program_id, &scope); @@ -341,8 +351,10 @@ class RunProgramGradOpKernel : public framework::OpKernel { } // Step 4. get outputs - details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, &scope); - details::ShareVarsFromScope(param_grad_vars, param_grad_names, &scope); + details::ShareVarsFromScope(input_grad_vars, input_grad_var_names, + *global_block, &scope); + details::ShareVarsFromScope(param_grad_vars, param_grad_names, + *global_block, &scope); // Step5. drop current scope global_inner_scope->DeleteScope(&scope); diff --git a/paddle/fluid/operators/save_combine_op.h b/paddle/fluid/operators/save_combine_op.h index 939768693a2431..6e6c826a22892d 100644 --- a/paddle/fluid/operators/save_combine_op.h +++ b/paddle/fluid/operators/save_combine_op.h @@ -19,11 +19,13 @@ limitations under the License. */ #include #include #include +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/port.h" @@ -66,34 +68,48 @@ class SaveCombineOpKernel : public framework::OpKernel { inp_vars[i], platform::errors::InvalidArgument("Cannot find variable %s to save.", inp_var_names[i])); - PADDLE_ENFORCE_EQ(inp_vars[i]->IsType(), true, + PADDLE_ENFORCE_EQ(inp_vars[i]->IsType() || + inp_vars[i]->IsType(), + true, platform::errors::InvalidArgument( "SaveCombine operator only supports saving " - "LoDTensor variable, %s has wrong type.", + "LoDTensor or Vocab variable, %s has wrong type.", inp_var_names[i])); - auto &tensor = inp_vars[i]->Get(); - PADDLE_ENFORCE_EQ( - tensor.IsInitialized(), true, - platform::errors::InvalidArgument( - "The Tensor of Variable(%s) to be saved is not initialized.", - inp_var_names[i])); - // Serialize tensors one by one - // Check types to see if a fp16 transformation is required - auto in_dtype = tensor.type(); - auto out_dtype = - save_as_fp16 ? framework::proto::VarType::FP16 : in_dtype; + if (inp_vars[i]->IsType()) { + auto &tensor = inp_vars[i]->Get(); + PADDLE_ENFORCE_EQ( + tensor.IsInitialized(), true, + platform::errors::InvalidArgument( + "The Tensor of Variable(%s) to be saved is not initialized.", + inp_var_names[i])); + // Serialize tensors one by one + // Check types to see if a fp16 transformation is required + auto in_dtype = tensor.type(); + auto out_dtype = + save_as_fp16 ? 
framework::proto::VarType::FP16 : in_dtype; - if (in_dtype != out_dtype) { - auto in_kernel_type = framework::OpKernelType(in_dtype, place); - auto out_kernel_type = framework::OpKernelType(out_dtype, place); - framework::LoDTensor out; - // copy LoD info to the new tensor - out.set_lod(tensor.lod()); - framework::TransDataType(in_kernel_type, out_kernel_type, tensor, &out); - framework::SerializeToStream(ss, out, dev_ctx); + if (in_dtype != out_dtype) { + auto in_kernel_type = framework::OpKernelType(in_dtype, place); + auto out_kernel_type = framework::OpKernelType(out_dtype, place); + framework::LoDTensor out; + // copy LoD info to the new tensor + out.set_lod(tensor.lod()); + framework::TransDataType(in_kernel_type, out_kernel_type, tensor, + &out); + framework::SerializeToStream(ss, out, dev_ctx); + } else { + framework::SerializeToStream(ss, tensor, dev_ctx); + } } else { - framework::SerializeToStream(ss, tensor, dev_ctx); + auto &tensor = inp_vars[i]->Get(); + std::unordered_map data; + for (auto it = tensor.begin(); it != tensor.end(); ++it) { + std::string t; + framework::ConvertWstrToStr(it->first, &t); + data.emplace(t, it->second); + } + framework::StringMapToStream(ss, data); } } if (save_to_memory) { diff --git a/paddle/fluid/operators/scale_op_npu.cc b/paddle/fluid/operators/scale_op_npu.cc index 23817190208693..744a9b137f622e 100644 --- a/paddle/fluid/operators/scale_op_npu.cc +++ b/paddle/fluid/operators/scale_op_npu.cc @@ -12,11 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include -#include - -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/scale_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/scale_op_xpu.cc b/paddle/fluid/operators/scale_op_xpu.cc index e0dfad91570ad6..d3943e09b6d0b1 100644 --- a/paddle/fluid/operators/scale_op_xpu.cc +++ b/paddle/fluid/operators/scale_op_xpu.cc @@ -22,12 +22,14 @@ namespace paddle { namespace operators { template class ScaleXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: virtual void Compute(const framework::ExecutionContext& ctx) const { auto* in_var = ctx.InputVar("X"); auto* in = framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_var); - auto scale = static_cast(ctx.Attr("scale")); - auto bias = static_cast(ctx.Attr("bias")); + auto scale = static_cast(ctx.Attr("scale")); + auto bias = static_cast(ctx.Attr("bias")); auto bias_after_scale = ctx.Attr("bias_after_scale"); auto* out_var = ctx.OutputVar("Out"); if (in_var->IsType() && in_var != out_var) { @@ -46,9 +48,10 @@ class ScaleXPUKernel : public framework::OpKernel { in->dims().to_str().c_str(), out->dims().to_str().c_str())); auto& dev_ctx = ctx.template device_context(); - int r = - xpu::scale(dev_ctx.x_context(), in->data(), out->data(), - in->numel(), bias_after_scale, scale, bias); + int r = xpu::scale(dev_ctx.x_context(), + reinterpret_cast(in->data()), + reinterpret_cast(out->data()), in->numel(), + bias_after_scale, scale, bias); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU scale kernel return wrong value[%d %s]", @@ -60,7 +63,11 @@ class ScaleXPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; + REGISTER_OP_XPU_KERNEL( - scale, ops::ScaleXPUKernel); + scale, ops::ScaleXPUKernel, + 
ops::ScaleXPUKernel, + ops::ScaleXPUKernel); #endif diff --git a/paddle/fluid/operators/seed_op.cc b/paddle/fluid/operators/seed_op.cc index 2f3e4c9ba88c39..837ccae0284f5e 100644 --- a/paddle/fluid/operators/seed_op.cc +++ b/paddle/fluid/operators/seed_op.cc @@ -39,6 +39,23 @@ class SeedOpMaker : public framework::OpProtoAndCheckerMaker { void Make() override { AddOutput("Out", "The output of seed op."); AddAttr("seed", "Dropout random seed.").SetDefault(0); + AddAttr("deterministic", + "(bool, default false) Whether to use deterministic " + "RandomSeedGenerator which " + "generate by `set_random_seed_generator`") + .SetDefault(false) + .AsExtra(); + AddAttr( + "rng_name", + "use deterministic RandomSeedGenerator which name is `rng_name`") + .SetDefault("") + .AsExtra(); + AddAttr("force_cpu", + "(bool, default false) Force fill output variable to cpu " + "memory. Otherwise, fill output variable to the running " + "device") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( Seed Operator. )DOC"); @@ -55,3 +72,15 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL( seed, ops::CPUSeedKernel); + +/* ========================== register checkpoint ===========================*/ +REGISTER_OP_VERSION(seed) + .AddCheckpoint( + R"ROC( + Upgrade seed add a new attribute [force_cpu])ROC", + paddle::framework::compatible::OpVersionDesc().NewAttr( + "force_cpu", + "If true, Force fill output variable to cpu." + "memory. Otherwise, fill output variable to the running " + "device", + false)); diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index c84407ba52dfd6..4ca75bcf76e513 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -12,6 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
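// Summary of the upgraded seed selection implemented in seed_op.h below:
//   deterministic == true  -> draw from the named generator registered through
//                             set_random_seed_generator(rng_name), retrying until
//                             a non-zero value is produced;
//   deterministic == false -> use the "seed" attribute when it is non-zero,
//                             otherwise fall back to std::random_device.
// force_cpu only decides where the GPU kernel writes the result: directly into
// CPU memory, or copied onto the running CUDA device.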
+#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/seed_op.h" namespace paddle { @@ -20,22 +21,28 @@ namespace operators { template class GPUSeedKernel : public framework::OpKernel { public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); - int user_seed = context.Attr("seed"); - std::random_device rnd; - int seed; - if (user_seed != 0) { - seed = user_seed; + void Compute(const framework::ExecutionContext &context) const override { + auto *out = context.Output("Out"); + int seed = get_seed(context); + + auto force_cpu = context.Attr("force_cpu"); + bool cpu_place = force_cpu || context.GetPlace() == platform::CPUPlace(); + if (cpu_place) { + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(context.GetPlace()); + out->mutable_data(platform::CPUPlace()); + math::SetConstant functor; + functor(reinterpret_cast(dev_ctx), + out, static_cast(seed)); } else { - seed = rnd(); + auto *out_data = out->mutable_data(context.GetPlace()); + auto target_gpu_place = + BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); + auto stream = context.cuda_device_context().stream(); + memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, + sizeof(int), stream); } - auto target_gpu_place = - BOOST_GET_CONST(platform::CUDAPlace, context.GetPlace()); - auto stream = context.cuda_device_context().stream(); - memory::Copy(target_gpu_place, out_data, platform::CPUPlace(), &seed, - sizeof(int), stream); } }; diff --git a/paddle/fluid/operators/seed_op.h b/paddle/fluid/operators/seed_op.h index f8b513fca4824c..202f25e0b4cd12 100644 --- a/paddle/fluid/operators/seed_op.h +++ b/paddle/fluid/operators/seed_op.h @@ -13,30 +13,45 @@ // limitations under the License. #pragma once +#include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -class CPUSeedKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* out = context.Output("Out"); - auto* out_data = out->mutable_data(context.GetPlace()); - int user_seed = context.Attr("seed"); +static int get_seed(const framework::ExecutionContext& context) { + int user_seed = context.Attr("seed"); + bool deterministic = context.Attr("deterministic"); + int seed = 0; + if (!deterministic) { // NOTE: fixed seed should only be used in unittest or for debug. // Guarantee to use random seed in training. 
- std::random_device rnd; - int seed; if (user_seed != 0) { seed = user_seed; } else { + std::random_device rnd; seed = rnd(); } - out_data[0] = seed; + } else { + std::string name = context.Attr("rng_name"); + auto rng = framework::GetRandomSeedGenerator(name); + do { // NOTE(wangxi): cpu dropout will use random seed if seed == 0 + seed = static_cast(rng->Random64()); + } while (seed == 0); + } + return seed; +} + +template +class CPUSeedKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* out = context.Output("Out"); + auto* out_data = out->mutable_data(context.GetPlace()); + out_data[0] = get_seed(context); } }; diff --git a/paddle/fluid/operators/set_value_op_npu.cc b/paddle/fluid/operators/set_value_op_npu.cc index 3a8d81920f262c..e7b124d5bddd64 100644 --- a/paddle/fluid/operators/set_value_op_npu.cc +++ b/paddle/fluid/operators/set_value_op_npu.cc @@ -1,8 +1,11 @@ /* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -10,291 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/set_value_op.h" -#include "paddle/fluid/operators/assign_value_op.h" #include "paddle/fluid/operators/npu_op_runner.h" -#include "paddle/fluid/operators/slice_utils.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { -template -class SetValueNPUKernel : public framework::OpKernel { - private: - using Vector_Int64 = std::vector; - void GetNPUStartEndSteps(const Vector_Int64& start, const Vector_Int64& end, - const Vector_Int64& steps, const Vector_Int64& axes, - const framework::DDim& in_dim, - std::vector>& output) const { - int rank = in_dim.size(); - for (int i = 0; i < rank; ++i) { - int axis_size = in_dim[i]; - auto iter = find(axes.begin(), axes.end(), i); - if (iter != axes.end()) { - int idx = iter - axes.begin(); - output[0].push_back(start[idx]); // set as the same as raw input - output[1].push_back(end[idx]); - output[2].push_back(steps[idx]); - } else { - output[0].push_back(0); // begin 0 - output[1].push_back(axis_size); // end = last one - output[2].push_back(1); // step = 1 - } - } - } - - inline std::vector MininumPadNumberMakeSureLastDimGT8( - const std::vector>& npu_slice) const { - int rank = npu_slice[0].size(); - int last_dim_start = npu_slice[0][rank - 1]; - int last_dim_end = npu_slice[1][rank - 1]; - int last_dim_step = npu_slice[2][rank - 1]; - int min_end = last_dim_start + last_dim_step * min_last_dim_value_; - int raw_last_dim_len = (last_dim_end - last_dim_start) / last_dim_step; - return std::vector({std::max(0, min_end - last_dim_end), - min_last_dim_value_ - raw_last_dim_len}); - } - - inline void TileTensor(const framework::ExecutionContext* ctx, - const Tensor* input, Tensor* output) const { - VLOG(4) << "start to tile tensor function, which calls the npu operator " - "TileWithAxis"; - // UNSQUEEZE last dim + TILE last dim * min_last_dim_value_ - Tensor reshape_tensor; - auto reshape_dims = 
framework::vectorize(input->dims()); - reshape_dims.push_back(1); - reshape_tensor.ShareDataWith(*input); - reshape_tensor.Resize(framework::make_ddim(reshape_dims)); - - auto output_dims = framework::vectorize(input->dims()); - output_dims.push_back(min_last_dim_value_); - output->mutable_data(framework::make_ddim(output_dims), ctx->GetPlace()); - - framework::NPUAttributeMap attr; - attr["axis"] = static_cast(reshape_dims.size() - 1); - attr["tiles"] = min_last_dim_value_; - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("TileWithAxis", {reshape_tensor}, {*output}, attr).Run(stream); - } - - inline void BroadcastToD(const framework::ExecutionContext* ctx, - const Tensor* input, - const std::vector* shape, - Tensor* output) const { - VLOG(4) << "Start BroadCast To"; - auto new_shape = std::vector(shape->begin(), shape->end()); - output->mutable_data(framework::make_ddim(new_shape), ctx->GetPlace()); - framework::NPUAttributeMap attr; - attr["shape"] = new_shape; - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("BroadcastToD", {*input}, {*output}, attr).Run(stream); - } - - inline void CropTensor(const framework::ExecutionContext* ctx, - const Tensor* input, Tensor* output) const { - auto out_dims = output->dims(); - auto in_dims = input->dims(); - int rank = in_dims.size(); - in_dims[rank - 1] = 1; - output->Resize(in_dims); // unsqueeze output -> [..., 1] - framework::NPUAttributeMap attr; - attr["axis"] = 0; - attr["offsets"] = std::vector(rank, 0); - auto stream = - ctx->template device_context() - .stream(); - NpuOpRunner("Crop", {*input, *output}, {*output}, attr).Run(stream); - output->Resize(out_dims); // restore it - } - - void SliceAssignNPU(const framework::ExecutionContext* ctx, - const Tensor* value_tensor, Vector_Int64& start, - Vector_Int64& end, Vector_Int64& steps, - Vector_Int64& axes, Tensor* assigned_tensor) const { - // must ensure assigned_tensor and value_tensor have the same shape - // not support steps < 0 - // output is also the assigned_tensor. 
- VLOG(4) << "start function SliceAssignND"; - auto stream = - ctx->template device_context() - .stream(); - for (size_t i = 0; i < steps.size(); ++i) { - PADDLE_ENFORCE_GT(steps[i], 0, - platform::errors::InvalidArgument( - "Currently NPU set_value operator doesn't support " - "negative steps, but got %d as step", - steps[i])); - } - std::vector> npu_slice(3); - GetNPUStartEndSteps(start, end, steps, axes, assigned_tensor->dims(), - npu_slice); - auto tile_numbers = MininumPadNumberMakeSureLastDimGT8(npu_slice); - int assigned_tensor_tile_number = tile_numbers[0]; - int value_tensor_tile_number = tile_numbers[1]; - VLOG(4) << "tile number is : " << assigned_tensor_tile_number << " " - << value_tensor_tile_number; - - Tensor tiled_assigned_tns, tiled_value_tns; - if (assigned_tensor_tile_number > 0) { - TileTensor(ctx, assigned_tensor, &tiled_assigned_tns); - TileTensor(ctx, value_tensor, &tiled_value_tns); - // output have different shape, so use a tmp variable before_crop_output; - // add last dim = min_last_dim_value_ in slice - npu_slice[0].push_back(0); - npu_slice[1].push_back(min_last_dim_value_); - npu_slice[2].push_back(1); - } - - framework::NPUAttributeMap attr_input; - attr_input["begin"] = - std::vector(npu_slice[0].begin(), npu_slice[0].end()); - attr_input["end"] = - std::vector(npu_slice[1].begin(), npu_slice[1].end()); - attr_input["strides"] = - std::vector(npu_slice[2].begin(), npu_slice[2].end()); - attr_input["begin_mask"] = 0; - attr_input["end_mask"] = 0; - attr_input["ellipsis_mask"] = 0; - attr_input["new_axis_mask"] = 0; - attr_input["shrink_axis_mask"] = 0; - if (assigned_tensor_tile_number > 0) { - NpuOpRunner("StridedSliceAssignD", {tiled_assigned_tns, tiled_value_tns}, - {tiled_assigned_tns}, attr_input) - .Run(stream); // Remember, set output = input, and this op will - // change the input value. - } else { - NpuOpRunner("StridedSliceAssignD", {*assigned_tensor, *value_tensor}, - {*assigned_tensor}, attr_input) - .Run(stream); - } - if (assigned_tensor_tile_number > 0) { - CropTensor(ctx, &tiled_assigned_tns /*initialzied*/, - assigned_tensor /*initalized*/); - } - } - - void ModifyAxesAccordingNoneAxes(const Vector_Int64& none_axes, - Vector_Int64& axes_to_modify) const { - if (none_axes.empty()) return; - auto none_axes_copy = none_axes; - sort(none_axes_copy.begin(), none_axes_copy.end()); - for (size_t i = 0; i < axes_to_modify.size(); ++i) { - int axis = axes_to_modify[i]; - auto upper = - upper_bound(none_axes_copy.begin(), none_axes_copy.end(), axis); - // Example: none_axes = [1,3,4,5,7] - // axis = 4 - // find the element number less or equal than 4, which is - // 3(1,3,4) - // axis becomes 4 + 3 = 7 ; - axes_to_modify[i] = axis + (upper - none_axes_copy.begin()); - } - } - - void UnsqueezeAccordingNoneAxes(const Vector_Int64& none_axes, - Vector_Int64& slice_dims) const { - // note : axes will change, because new axes inserted. - // sum array to modify the axes. 
because more simply - if (none_axes.empty()) return; - Vector_Int64 slice_dims_with_none; - size_t none_axes_cur = 0; - for (size_t i = 0; i < slice_dims.size(); ++i) { - while (none_axes_cur < none_axes.size() && - none_axes[none_axes_cur] <= static_cast(i)) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - slice_dims_with_none.push_back(slice_dims[i]); - } - // if the none_axes.size() > slice_dims.size(), append 1 after last dim - while (none_axes_cur < none_axes.size()) { - slice_dims_with_none.push_back(1); - none_axes_cur++; - } - slice_dims = slice_dims_with_none; - } +using NPUDeviceContext = platform::NPUDeviceContext; - void ModiftyDimsAccordingNoneAndDecrease(Vector_Int64& slice_dim, - Vector_Int64& value_dim, - Vector_Int64& axes, - Vector_Int64& none_axes, - Vector_Int64& dec_axes) const { - // change the value of slice_dim, value_dim, start, end, steps, axes by none - // and decrease axes - // after change, this values can be passed to SliceAssignNPU() directly. - - // Modity Slice Dim - UnsqueezeAccordingNoneAxes(none_axes, slice_dim); - ModifyAxesAccordingNoneAxes(none_axes, dec_axes); - ModifyAxesAccordingNoneAxes(none_axes, axes); - // Modity Value Dim by new slice dim - auto slice_dim_reverse = slice_dim; - auto value_dim_reverse = value_dim; - std::reverse(slice_dim_reverse.begin(), slice_dim_reverse.end()); - std::reverse(value_dim_reverse.begin(), value_dim_reverse.end()); - - Vector_Int64 new_value_dim; - PADDLE_ENFORCE_GE( - slice_dim.size(), value_dim.size(), - platform::errors::InvalidArgument("The size of expanded slice_dim(%d) " - "must greater than the value_dim(%d)", - slice_dim.size(), value_dim.size())); +template +class SetValueNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const { + auto* in = ctx.Input("Input"); + auto* value_tensor = ctx.Input("ValueTensor"); + auto* out = ctx.Output("Out"); - size_t value_cur = 0; - size_t rank = slice_dim.size(); - for (size_t i = 0; i < rank; ++i) { - auto& xsize = slice_dim_reverse[i]; - if (value_cur >= value_dim_reverse.size()) { - new_value_dim.push_back(1); - continue; - } - auto& vsize = value_dim_reverse[value_cur]; - auto it = find(dec_axes.begin(), dec_axes.end(), rank - 1 - i); - if (it != dec_axes.end()) { - // found, insert one dim ; - PADDLE_ENFORCE_EQ(xsize, 1, platform::errors::InvalidArgument( - "The dims refered by decrease axes is " - "not equal to 1, some wrongs happen")); - new_value_dim.push_back(1); - continue; - } - if (xsize == vsize || vsize == 1) { - new_value_dim.push_back(vsize); - ++value_cur; - continue; - } - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of value_tensor can't be broadcast to value tensor, " - "please check input")); - } - for (; value_cur < value_dim_reverse.size(); ++value_cur) { - if (value_dim_reverse[value_cur] != 1) { - PADDLE_THROW(platform::errors::InvalidArgument( - "The shape of value_tensor can't be broadcast to value tensor, " - "please check input")); - } - } - std::reverse(new_value_dim.begin(), new_value_dim.end()); - value_dim = new_value_dim; - return; - } + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + auto steps_tensor_list = ctx.MultiInput("StepsTensorList"); - public: - void Compute(const framework::ExecutionContext& ctx) const override { - VLOG(2) << "Start Set Value Npu Kernel"; - auto* in = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - auto* value_tensor = 
ctx.Input("ValueTensor"); - auto starts_tensor_list = - ctx.MultiInput("StartsTensorList"); - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - auto steps_tensor_list = - ctx.MultiInput("StepsTensorList"); auto axes = ctx.Attr>("axes"); auto starts = ctx.Attr>("starts"); auto ends = ctx.Attr>("ends"); @@ -302,17 +39,6 @@ class SetValueNPUKernel : public framework::OpKernel { auto shape = ctx.Attr>("shape"); auto decrease_axes = ctx.Attr>("decrease_axes"); auto none_axes = ctx.Attr>("none_axes"); - auto dtype = in->type(); - - if (dtype == framework::proto::VarType::FP64 || - dtype == framework::proto::VarType::INT64 || - dtype == framework::proto::VarType::BOOL) { - auto value_type_name = GetValueName(dtype); - PADDLE_THROW(platform::errors::InvalidArgument( - "The NPU setvalue kernel currently only support FLOAT32 and INT32, " - "but got type: %s", - value_type_name.data())); - } if (!starts_tensor_list.empty()) { starts = GetDataFromTensorList(starts_tensor_list); @@ -327,65 +53,137 @@ class SetValueNPUKernel : public framework::OpKernel { auto in_dims = in->dims(); CheckAndUpdateSliceAttrs(in_dims, axes, &starts, &ends, &steps); auto slice_dims = GetSliceDims(in_dims, axes, starts, ends, &steps); - auto place = ctx.GetPlace(); + auto decrease_slice_dims = GetDecreasedDims(slice_dims, decrease_axes); + + auto slice_dims_for_assign = decrease_slice_dims; + if (!none_axes.empty()) { + std::vector slice_dims_with_none; + + size_t none_axes_cur = 0, decrease_axes_cur = 0; + for (int i = 0; i < slice_dims.size(); ++i) { + while (none_axes_cur < none_axes.size() && + none_axes[none_axes_cur] <= i) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } + if (decrease_axes_cur < decrease_axes.size() && + decrease_axes[decrease_axes_cur] == i) { + decrease_axes_cur++; + } else { + slice_dims_with_none.push_back(slice_dims[i]); + } + } + while (none_axes_cur < none_axes.size()) { + slice_dims_with_none.push_back(1); + none_axes_cur++; + } - // aforementioned code is copyed directly from CPU kernel. - // (@xiongkun03) the following is redesigned by xiongkun. because NPU can do - // step slice assignment. so we deal with all none_axes and decrease_axes - // here. - // 1. we insert 1 into assigned_tensor_shape according to none_axes; - // 2. we insert 1 into value_tensor_shape(value tensor) according to - // decrease_axes; - // 3. we reshape back the assigned_tensor. and return it. - // note : we use a tmp_value_tensor as value_tns. it shares data with - // value_tensor; - // I believe the logic is more simple than cpu logic. 
+ slice_dims_for_assign = framework::make_ddim(slice_dims_with_none); + } + + TensorCopy(*in, ctx.GetPlace(), out); + + auto starts_indices = std::vector(in_dims.size(), 0); + auto ends_indices = std::vector(in_dims.size(), 0); + auto strides_indices = std::vector(in_dims.size(), 0); + + for (int i = 0; i < in_dims.size(); ++i) { + starts_indices[i] = 0; + ends_indices[i] = slice_dims[i]; + strides_indices[i] = 1; + } + for (size_t i = 0; i < axes.size(); i++) { + int axis_index = axes[i]; + starts_indices[axis_index] = starts[i]; + ends_indices[axis_index] = ends[i]; + strides_indices[axis_index] = steps[i]; + } + + int64_t stride_step = framework::product(in_dims); + std::vector index_indices(1, 0); + for (size_t i = 0; i < strides_indices.size(); ++i) { + auto index_size = index_indices.size(); + stride_step /= in_dims[i]; + for (size_t j = 0; j < index_size; ++j) { + auto start_index = *index_indices.begin(); + if (strides_indices[i] > 0) { + for (int64_t k = starts_indices[i]; k < ends_indices[i]; + k += strides_indices[i]) { + index_indices.push_back(start_index + k * stride_step); + } + } else { + for (int64_t k = starts_indices[i]; k > ends_indices[i]; + k += strides_indices[i]) { + index_indices.push_back(start_index + k * stride_step); + } + } + index_indices.erase(index_indices.begin()); + } + } - TensorCopy(*in, place, out); - Tensor value_t(dtype); + PADDLE_ENFORCE_EQ( + static_cast(index_indices.size()), + framework::product(slice_dims_for_assign), + platform::errors::InvalidArgument( + "OP(set_value) error index indices and value update not match ")); - if (value_tensor == nullptr) { + Tensor value_t(in->type()); + if (value_tensor != nullptr) { + value_t.ShareDataWith(*value_tensor); + } else { auto value_dims = framework::make_ddim(shape); - value_t.mutable_data(value_dims, place); - auto value_name = GetValueName(dtype); + CheckIsDimsMatch(slice_dims_for_assign, value_dims); + + value_t.mutable_data(value_dims, ctx.GetPlace()); + auto value_name = GetValueName(in->type()); CopyVecotorToTensor(value_name.c_str(), &value_t, ctx); value_t.Resize(value_dims); } - const Tensor* value_tensor_ptr = - (value_tensor == nullptr) ? 
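// Illustrative aside: a minimal plain-C++ sketch of the index enumeration the
// rewritten set_value NPU kernel performs before ScatterUpdate. For each axis
// it expands the current set of flat offsets by the selected positions along
// that axis, yielding the row-major offsets to update. Positive strides only;
// the function and variable names below are invented for illustration.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int64_t> StridedSliceOffsets(const std::vector<int64_t>& dims,
                                         const std::vector<int64_t>& starts,
                                         const std::vector<int64_t>& ends,
                                         const std::vector<int64_t>& strides) {
  int64_t stride_step = 1;
  for (int64_t d : dims) stride_step *= d;  // product of all dims
  std::vector<int64_t> offsets(1, 0);
  for (size_t i = 0; i < dims.size(); ++i) {
    stride_step /= dims[i];  // row-major step of axis i
    std::vector<int64_t> next;
    for (int64_t base : offsets) {
      for (int64_t k = starts[i]; k < ends[i]; k += strides[i]) {
        next.push_back(base + k * stride_step);
      }
    }
    offsets.swap(next);
  }
  return offsets;
}

int main() {
  // slice [0:4:2, 1:3:1] of a 4x3 tensor touches flat offsets 1, 2, 7, 8
  for (int64_t off : StridedSliceOffsets({4, 3}, {0, 1}, {4, 3}, {2, 1})) {
    std::cout << off << " ";
  }
  std::cout << "\n";
  return 0;
}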
&value_t : value_tensor; - auto value_dims_vec = framework::vectorize(value_tensor_ptr->dims()); - auto slice_dims_vec = framework::vectorize(slice_dims); - auto in_dims_vec = framework::vectorize(in_dims); - - UnsqueezeAccordingNoneAxes(none_axes, in_dims_vec); - ModiftyDimsAccordingNoneAndDecrease(slice_dims_vec, value_dims_vec, axes, - none_axes, - decrease_axes); // Modify and Check + auto stream = ctx.template device_context().stream(); - Tensor reshaped_value_tensor, broadcast_value_tensor; - reshaped_value_tensor.ShareDataWith(*value_tensor_ptr); - reshaped_value_tensor.Resize(framework::make_ddim(value_dims_vec)); - - BroadcastToD(&ctx, &reshaped_value_tensor, &slice_dims_vec, - &broadcast_value_tensor /*inner function initialized*/); + Tensor value_temp(in->type()); + if (slice_dims_for_assign == value_t.dims()) { + value_temp.ShareDataWith(value_t); + } else { + value_temp.Resize(slice_dims_for_assign); + value_temp.mutable_data(ctx.GetPlace()); + NpuOpRunner runner_brd; + runner_brd.SetType("BroadcastTo") + .AddInput(value_t) + .AddInput(framework::vectorize(slice_dims_for_assign)) + .AddOutput(value_temp) + .Run(stream); + } - out->Resize(framework::make_ddim(in_dims_vec)); - SliceAssignNPU(&ctx, &broadcast_value_tensor, starts, ends, steps, axes, - out); - out->Resize(in_dims); // Reshape Back + int64_t input_numel = framework::product(in_dims); + int64_t index_numel = index_indices.size(); + + Tensor in_temp, out_temp, val_temp; + in_temp.ShareDataWith(*in); + out_temp.ShareDataWith(*out); + val_temp.ShareDataWith(value_temp); + in_temp.Resize(framework::make_ddim({input_numel})); + out_temp.Resize(framework::make_ddim({input_numel})); + val_temp.Resize(framework::make_ddim({index_numel})); + + NpuOpRunner runner; + runner.SetType("ScatterUpdate") + .AddInput(in_temp) + .AddInput(std::move(index_indices)) + .AddInput(val_temp) + .AddOutput(out_temp) + .Run(stream); } - - private: - const int min_last_dim_value_ = - 32 / sizeof(T); // 16 for float16 , 8 for float32 }; } // namespace operators } // namespace paddle namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL( - set_value, ops::SetValueNPUKernel, - ops::SetValueNPUKernel) + +REGISTER_OP_NPU_KERNEL(set_value, ops::SetValueNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::SetValueNPUKernel, +#endif + ops::SetValueNPUKernel) diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc index 6f3b40dbbf3942..400a09330a3483 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/ #include "paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/slice_op_npu.cc b/paddle/fluid/operators/slice_op_npu.cc index 1084eadc55c5bc..a9092d7e2abbce 100644 --- a/paddle/fluid/operators/slice_op_npu.cc +++ b/paddle/fluid/operators/slice_op_npu.cc @@ -10,20 +10,16 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. */ -#include -#include - -#include "paddle/fluid/framework/ddim.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/npu_op_runner.h" #include "paddle/fluid/operators/slice_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, const std::vector starts, const std::vector ends, @@ -54,7 +50,7 @@ void UpdateAttr(const framework::DDim& in_dims, const std::vector axes, } } -template +template class SliceNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -128,17 +124,14 @@ class SliceNPUKernel : public framework::OpKernel { UpdateAttr(in_dims, axes, starts, ends, &offsets, &size); + auto stream = ctx.template device_context().stream(); const auto& runner = NpuOpRunner("SliceD", {*input}, {*out}, {{"offsets", offsets}, {"size", size}}); - - auto stream = - ctx.template device_context() - .stream(); runner.Run(stream); } }; -template +template class SliceGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -181,12 +174,37 @@ class SliceGradNPUKernel : public framework::OpKernel { paddings[i][1] = static_cast(in_dims[i] - size[i] - offsets[i]); } + Tensor tmp_dout; + tmp_dout.ShareDataWith(*dout); + auto out_dims = dout->dims(); + auto decrease_axis = ctx.Attr>("decrease_axis"); + auto decrease_size = decrease_axis.size(); + if (decrease_size > 0) { + if (decrease_size == static_cast(in_dims.size())) { + out_dims = framework::make_ddim(std::vector(decrease_size, 1)); + } else { + std::vector origin_out_shape(out_dims.size() + decrease_size, -1); + for (size_t i = 0; i < decrease_size; ++i) { + origin_out_shape[decrease_axis[i]] = 1; + } + int index = 0; + for (size_t i = 0; i < origin_out_shape.size(); ++i) { + if (origin_out_shape[i] == -1) { + origin_out_shape[i] = out_dims[index]; + ++index; + } + } + out_dims = framework::make_ddim(origin_out_shape); + } + tmp_dout.Resize(out_dims); + } + dinput->mutable_data(ctx.GetPlace()); auto stream = ctx.template device_context() .stream(); const auto& runner = - NpuOpRunner("PadD", {*dout}, {*dinput}, {{"paddings", paddings}}); + NpuOpRunner("PadD", {tmp_dout}, {*dinput}, {{"paddings", paddings}}); runner.Run(stream); } }; @@ -196,15 +214,13 @@ class SliceGradNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - slice, ops::SliceNPUKernel, - ops::SliceNPUKernel, - ops::SliceNPUKernel); - -REGISTER_OP_NPU_KERNEL( - slice_grad, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel, - ops::SliceGradNPUKernel); 
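// Illustrative aside: what the new decrease_axis handling in the NPU
// slice_grad kernel amounts to, as a standalone C++ helper (names invented).
// When the forward slice squeezed axes away, the gradient gets size-1 dims
// re-inserted at those positions so PadD can pad it back to the input shape.
// (The kernel additionally special-cases decreasing every axis.)
#include <iostream>
#include <vector>

std::vector<int> RestoreDecreasedDims(const std::vector<int>& squeezed_dims,
                                      const std::vector<int>& decrease_axis,
                                      int input_rank) {
  std::vector<int> restored(input_rank, -1);
  for (int axis : decrease_axis) restored[axis] = 1;  // dropped axes become 1
  size_t src = 0;
  for (int& d : restored) {
    if (d == -1) d = squeezed_dims[src++];  // remaining dims keep their size
  }
  return restored;
}

int main() {
  // forward sliced a rank-3 input and decreased axis 1, so dout is {2, 5}
  for (int d : RestoreDecreasedDims({2, 5}, {1}, 3)) std::cout << d << " ";
  std::cout << "\n";  // prints: 2 1 5
  return 0;
}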
+REGISTER_OP_NPU_KERNEL(slice, ops::SliceNPUKernel, + ops::SliceNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::SliceNPUKernel, +#endif + ops::SliceNPUKernel); + +REGISTER_OP_NPU_KERNEL(slice_grad, ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel, + ops::SliceGradNPUKernel); diff --git a/paddle/fluid/operators/slice_op_xpu.cc b/paddle/fluid/operators/slice_op_xpu.cc index 5f98efe8e91466..6ac1027b0ce195 100644 --- a/paddle/fluid/operators/slice_op_xpu.cc +++ b/paddle/fluid/operators/slice_op_xpu.cc @@ -27,6 +27,8 @@ using Tensor = framework::Tensor; template class SliceXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { auto in = ctx.Input("Input"); @@ -83,114 +85,93 @@ class SliceXPUKernel : public framework::OpKernel { } auto& dev_ctx = ctx.template device_context(); - auto* in_data = in->data(); - auto* out_data = out->mutable_data(ctx.GetPlace()); - int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape, - starts_extension, ends_extension); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("XPU slice kernel error!")); + const XPUType* in_data = reinterpret_cast(in->data()); + XPUType* out_data = + reinterpret_cast(out->mutable_data(ctx.GetPlace())); + int r = xpu::slice(dev_ctx.x_context(), in_data, out_data, shape, + starts_extension, ends_extension); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU slice kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; template class SliceGradXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_out = ctx.Input(framework::GradVarName("Out")); - auto* d_in = ctx.Output(framework::GradVarName("Input")); - d_in->mutable_data(ctx.GetPlace()); - - auto in_dims = d_in->dims(); - auto axes = ctx.Attr>("axes"); - auto starts = ctx.Attr>("starts"); - auto ends = ctx.Attr>("ends"); + auto* input = ctx.Input("Input"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dinput = ctx.Output(framework::GradVarName("Input")); + + auto axes_int = ctx.Attr>("axes"); + auto starts_int = ctx.Attr>("starts"); + auto ends_int = ctx.Attr>("ends"); + std::vector axes(axes_int.begin(), axes_int.end()); + std::vector starts(starts_int.begin(), starts_int.end()); + std::vector ends(ends_int.begin(), ends_int.end()); + + // Get the accurate attribute value of starts and ends + auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); + if (ctx.HasInput("StartsTensor")) { + starts = GetDataFromTensor(ctx.Input("StartsTensor")); + } else if (starts_tensor_list.size() > 0) { + starts = GetDataFromTensorList(starts_tensor_list); + } - // prepare starts, ends on XPU - int dim_value = 0, start = 0, end = 0; - // If a negative value is passed for any of the start or end indices, - // it represents number of elements before the end of that dimension. - // If the value passed to start or end is larger than the n - // (the number of elements in this dimension), it represents n. - for (size_t i = 0; i < axes.size(); ++i) { - dim_value = in_dims[axes[i]]; - start = starts[i]; - end = ends[i]; - start = start < 0 ? (start + dim_value) : start; - end = end < 0 ? 
(end + dim_value) : end; - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim_value); - PADDLE_ENFORCE_GT(end, start, platform::errors::InvalidArgument( - "end should greater than start")); - starts[i] = start; - ends[i] = end; + auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); + if (ctx.HasInput("EndsTensor")) { + ends = GetDataFromTensor(ctx.Input("EndsTensor")); + } else if (ends_tensor_list.size() > 0) { + ends = GetDataFromTensorList(ends_tensor_list); } - size_t shape_size = in_dims.size(); - // the slice XPU kernel require that the length of `start`, `end` must be - // equal - // to the dims size of input tensor, therefore, if shape_size > axes.size(), - // the `starts_extension` and `ends_extension` is necessary. - std::vector starts_extension(shape_size, 0); - std::vector ends_extension(shape_size, 0); - if (shape_size > axes.size()) { - for (size_t i = 0; i < shape_size; ++i) { - ends_extension[i] = in_dims[i]; - } - for (size_t i = 0; i < axes.size(); ++i) { - starts_extension[axes[i]] = starts[i]; - ends_extension[axes[i]] = ends[i]; + + const auto& in_dims = input->dims(); + int rank = in_dims.size(); + + std::vector pad_left(rank); + std::vector out_dims(rank); + std::vector pad_right(rank); + int cnt = 0; + for (int i = 0; i < in_dims.size(); ++i) { + int start = 0; + int end = in_dims[i]; + int axis = cnt < static_cast(axes.size()) ? axes[cnt] : -1; + if (axis == i) { + start = starts[cnt]; + if (start < 0) { + start = (start + in_dims[i]); + } + start = std::max(start, static_cast(0)); + end = ends[cnt]; + if (end < 0) { + end = (end + in_dims[i]); + } + end = std::min(end, static_cast(in_dims[i])); + cnt++; } - } - int* starts_device = nullptr; - int* ends_device = nullptr; - int* starts_host = - shape_size > axes.size() ? starts_extension.data() : starts.data(); - int* ends_host = - shape_size > axes.size() ? 
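// Illustrative aside: the padding bookkeeping behind the rewritten XPU
// slice_grad kernel, as a standalone sketch (helper name invented). Each
// sliced axis contributes a left pad of `start` and a right pad of
// `dim - (end - start) - start`, so xpu::pad can place dout back into a
// zero-filled tensor of the input shape.
#include <algorithm>
#include <iostream>
#include <vector>

void SliceGradPads(const std::vector<int>& in_dims,
                   const std::vector<int>& axes, const std::vector<int>& starts,
                   const std::vector<int>& ends, std::vector<int>* pad_left,
                   std::vector<int>* pad_right) {
  int rank = static_cast<int>(in_dims.size());
  pad_left->assign(rank, 0);
  pad_right->assign(rank, 0);
  size_t cnt = 0;
  for (int i = 0; i < rank; ++i) {
    int start = 0, end = in_dims[i];
    if (cnt < axes.size() && axes[cnt] == i) {
      start = starts[cnt] < 0 ? starts[cnt] + in_dims[i] : starts[cnt];
      end = ends[cnt] < 0 ? ends[cnt] + in_dims[i] : ends[cnt];
      start = std::max(start, 0);
      end = std::min(end, in_dims[i]);
      ++cnt;
    }
    (*pad_left)[i] = start;
    (*pad_right)[i] = in_dims[i] - (end - start) - start;
  }
}

int main() {
  std::vector<int> left, right;
  SliceGradPads({8, 6}, {1}, {2}, {5}, &left, &right);
  // axis 1 sliced to [2, 5): pad 2 on the left, 1 on the right
  std::cout << left[1] << " " << right[1] << "\n";  // prints: 2 1
  return 0;
}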
ends_extension.data() : ends.data(); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&starts_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&ends_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - starts_device, platform::CPUPlace(), starts_host, - shape_size * sizeof(int)); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - ends_device, platform::CPUPlace(), ends_host, - shape_size * sizeof(int)); - // prepare shape on XPU - std::vector shape(shape_size, 0); - for (size_t i = 0; i < shape_size; ++i) { - shape[i] = in_dims[i]; + pad_left[i] = start; + out_dims[i] = end - start; + pad_right[i] = in_dims[i] - out_dims[i] - pad_left[i]; } - int* shape_device = nullptr; - PADDLE_ENFORCE_EQ(xpu_malloc(reinterpret_cast(&shape_device), - shape_size * sizeof(int)), - XPU_SUCCESS, - platform::errors::External("XPU has no enough memory")); - memory::Copy(BOOST_GET_CONST(platform::XPUPlace, ctx.GetPlace()), - shape_device, platform::CPUPlace(), shape.data(), - shape_size * sizeof(int)); auto& dev_ctx = ctx.template device_context(); - int r = - xpu::slice_backward(dev_ctx.x_context(), shape_device, starts_device, - ends_device, shape_size, d_out->data(), - d_in->data(), d_in->numel(), d_out->numel()); - PADDLE_ENFORCE_EQ(r, XPU_SUCCESS, - platform::errors::External("xpu slice kernel error")); - dev_ctx.Wait(); - // free device data - xpu_free(shape_device); - xpu_free(starts_device); - xpu_free(ends_device); + const XPUType* dout_data = + reinterpret_cast(dout->data()); + XPUType* din_data = + reinterpret_cast(dinput->mutable_data(ctx.GetPlace())); + int r = xpu::pad(dev_ctx.x_context(), dout_data, din_data, + out_dims, pad_left, pad_right, XPUType(0)); + PADDLE_ENFORCE_EQ( + r, XPU_SUCCESS, + platform::errors::External("XPU pad kernel return wrong value[%d %s]", + r, XPUAPIErrorMsg[r])); } }; - } // namespace operators } // namespace paddle @@ -198,8 +179,13 @@ namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL( slice, ops::SliceXPUKernel, - ops::SliceXPUKernel); + ops::SliceXPUKernel, + ops::SliceXPUKernel); REGISTER_OP_XPU_KERNEL( slice_grad, - ops::SliceGradXPUKernel); + ops::SliceGradXPUKernel, + ops::SliceGradXPUKernel, + ops::SliceGradXPUKernel); #endif diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index 0c2d39e7519ef4..78e813edda930c 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -13,10 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" -#include -#include -#include -#include #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -54,8 +50,7 @@ class SoftmaxWithCrossEntropyOpMaker "exp(logits -max_logits) / sum(exp(logits - max_logits)) - labels, " "where labels is ont-hot." "Currently, the tensor is generated and used in npu kernel only. 
") - .AsIntermediate() - .AsDispensable(); + .AsIntermediate(); #endif AddOutput("Loss", "(Tensor, default: Tensor), A tensor in same shape with " @@ -136,6 +131,11 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasOutput("Softmax"), true, platform::errors::InvalidArgument( "Output(Softmax) should be not null.")); +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_EQ(ctx->HasOutput("Backprop"), true, + platform::errors::InvalidArgument( + "Output(Backprop) should be not null.")); +#endif PADDLE_ENFORCE_EQ( ctx->HasOutput("Loss"), true, platform::errors::InvalidArgument("Output(Loss) should be not null.")); @@ -225,6 +225,11 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, platform::errors::InvalidArgument( "Input(Softmax) should be not null.")); +#ifdef PADDLE_WITH_ASCEND_CL + PADDLE_ENFORCE_EQ(ctx->HasInput("Backprop"), true, + platform::errors::InvalidArgument( + "Input(Backprop) should be not null.")); +#endif PADDLE_ENFORCE_EQ( ctx->HasInput("Label"), true, platform::errors::InvalidArgument("Input(Label) should be not null.")); diff --git a/paddle/fluid/operators/sparse_attention_op.cc b/paddle/fluid/operators/sparse_attention_op.cc new file mode 100644 index 00000000000000..9b6bc1b6290451 --- /dev/null +++ b/paddle/fluid/operators/sparse_attention_op.cc @@ -0,0 +1,193 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class SparseAttentionOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Q", + "(Tensor), The input tensor of query in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput( + "K", + "(Tensor), The input tensor of key in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput( + "V", + "(Tensor), The input tensor of value in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddInput("Offset", + "(Tensor, default: Tensor), The input tensor of offset in " + "CSR sparse format, " + "whose dimension : `[batch_size, num_heads, target_len + 1]`."); + AddInput("Columns", + "(Tensor, default: Tensor), The input tensor of columns in " + "CSR sparse format, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_num]`."); + AddOutput( + "Out", + "(Tensor), The output tensor of result in attention, " + "whose dimension : `[batch_size, num_heads, target_len, head_dim]`."); + AddOutput("SparseDotSdd", + "(Tensor), The output tensor of result in SparseDotSdd step, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_dim]`.") + .AsIntermediate(); + AddOutput("Softmax", + "(Tensor), The output tensor of result in Softmax step, " + "whose dimension : `[batch_size, num_heads, sparse_nnz_dim]`.") + .AsIntermediate(); + AddComment(R"DOC( + Compute the value of the sparse attention module. Its input value includes five tensors. + Q, K, and V represent query, key, and value in the Attention module, respectively. + The CSR format is used to represent the sparsity feature in the Attention module. + The CSR format contains two tensors, offset and columns. 
+ )DOC"); + } +}; + +class SparseAttentionOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("K"), "Input", "K", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasInput("Columns"), "Input", "Columns", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("SparseDotSdd"), "Output", "SparseDotSdd", + "sparse_attention"); + OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax", + "sparse_attention"); + + auto dims_q = ctx->GetInputDim("Q"); + auto dims_k = ctx->GetInputDim("K"); + auto dims_v = ctx->GetInputDim("V"); + auto dims_columns = ctx->GetInputDim("Columns"); + + PADDLE_ENFORCE_EQ(dims_q.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in query' shapes should be 4.")); + PADDLE_ENFORCE_EQ(dims_k.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in key' shapes should be 4.")); + PADDLE_ENFORCE_EQ(dims_v.size(), static_cast(4), + platform::errors::InvalidArgument( + "Dimension in value' shapes should be 4.")); + + auto batch_size = dims_q[0]; + auto num_heads = dims_q[1]; + auto M = dims_q[2]; + auto N = dims_q[3]; + auto sparse_nnz = dims_columns[2]; + ctx->SetOutputDim("Out", {batch_size, num_heads, M, N}); + ctx->SetOutputDim("SparseDotSdd", {batch_size, num_heads, sparse_nnz}); + ctx->SetOutputDim("Softmax", {batch_size, num_heads, sparse_nnz}); + ctx->ShareLoD("Q", "Out"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto input_data_type = + OperatorWithKernel::IndicateOrPromoteVarDataTypes(ctx, "Q", "K"); + return framework::OpKernelType(input_data_type, ctx.GetPlace()); + } +}; + +class SparseAttentionOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Q"), "Input", "Q", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("K"), "Input", "K", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("V"), "Input", "V", "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Offset"), "Input", "Offset", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Columns"), "Input", "Columns", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("SparseDotSdd"), "Input", "SparseDotSdd", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput("Softmax"), "Input", "Softmax", + "sparse_attention_grad"); + OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), "Input", + "Out@GRAD", "sparse_attention_grad"); + + auto x_grad_name = framework::GradVarName("Q"); + auto y_grad_name = framework::GradVarName("K"); + auto z_grad_name = framework::GradVarName("V"); + + if (ctx->HasOutput(x_grad_name)) { + ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("Q")); + } + if (ctx->HasOutput(y_grad_name)) { + ctx->SetOutputDim(y_grad_name, ctx->GetInputDim("K")); + } + if (ctx->HasOutput(z_grad_name)) { + ctx->SetOutputDim(z_grad_name, ctx->GetInputDim("V")); + } + } + + framework::OpKernelType 
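// Illustrative aside: the CSR layout that Offset/Columns are documented to
// carry, shown for a single (batch, head) pair in plain C++ (helper name
// invented). Offset is a row pointer of length target_len + 1 and Columns
// stores the column index of every non-zero, so row r owns the entries
// Columns[Offset[r] .. Offset[r+1]).
#include <cstdio>
#include <vector>

void DenseMaskToCsr(const std::vector<std::vector<int>>& mask,
                    std::vector<int>* offset, std::vector<int>* columns) {
  offset->assign(1, 0);
  columns->clear();
  for (const auto& row : mask) {
    for (int c = 0; c < static_cast<int>(row.size()); ++c) {
      if (row[c] != 0) columns->push_back(c);
    }
    offset->push_back(static_cast<int>(columns->size()));
  }
}

int main() {
  std::vector<int> offset, columns;
  DenseMaskToCsr({{1, 1, 0}, {1, 1, 1}, {0, 1, 1}}, &offset, &columns);
  for (int v : offset) std::printf("%d ", v);   // 0 2 5 7
  std::printf("| ");
  for (int v : columns) std::printf("%d ", v);  // 0 1 0 1 2 1 2
  std::printf("\n");
  return 0;
}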
GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.GetPlace()); + } +}; + +template +class SparseAttentionGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("sparse_attention_grad"); + op->SetInput("Q", this->Input("Q")); + op->SetInput("K", this->Input("K")); + op->SetInput("V", this->Input("V")); + op->SetInput("Offset", this->Input("Offset")); + op->SetInput("Columns", this->Input("Columns")); + op->SetInput("SparseDotSdd", this->Output("SparseDotSdd")); + op->SetInput("Softmax", this->Output("Softmax")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetOutput(framework::GradVarName("Q"), this->InputGrad("Q")); + op->SetOutput(framework::GradVarName("K"), this->InputGrad("K")); + op->SetOutput(framework::GradVarName("V"), this->InputGrad("V")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(sparse_attention, ops::SparseAttentionOp, + ops::SparseAttentionOpMaker, + ops::SparseAttentionGradOpMaker, + ops::SparseAttentionGradOpMaker); + +REGISTER_OPERATOR(sparse_attention_grad, ops::SparseAttentionOpGrad); diff --git a/paddle/fluid/operators/sparse_attention_op.cu b/paddle/fluid/operators/sparse_attention_op.cu new file mode 100644 index 00000000000000..88ee8999c5f4af --- /dev/null +++ b/paddle/fluid/operators/sparse_attention_op.cu @@ -0,0 +1,537 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/op_registry.h" +#if defined(PADDLE_WITH_CUDA) +#include "paddle/fluid/platform/dynload/cusparse.h" +#endif + +namespace ops = paddle::operators; +namespace plf = paddle::platform; + +namespace paddle { +namespace operators { + +template +__forceinline__ __device__ T CudaShuffleXorSync(unsigned mask, T val, + int width = warpSize) { + return __shfl_xor_sync(mask, val, width); +} + +template +__device__ __forceinline__ void WarpReduceSum(T* sum) { +#pragma unroll + for (int offset = warp_size / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < batch_size; ++i) { + T sum_val = CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + sum[i] = sum[i] + sum_val; + } + } +} + +template +__device__ __forceinline__ void WarpReduceMax(T* sum) { +#pragma unroll + for (int offset = warp_size / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < batch_size; ++i) { + T max_val = CudaShuffleXorSync(0xFFFFFFFF, sum[i], offset); + sum[i] = max(sum[i], max_val); + } + } +} + +template +__global__ void BlockSparseSoftmaxForward(T* softmax, const T* src, T scale, + const T* kp_mask, const T* attn_mask, + const int* layout_rowptr, + const int* layout_colindex, + int num_rows) { + // current thread related info + const int WarpSize = 32; + const int cur_row = blockIdx.x * blockDim.y + threadIdx.y; + if (cur_row < num_rows) { + const int cur_block_row = cur_row / BlockSize; + const int cur_block_nnz = + layout_rowptr[cur_block_row + 1] - layout_rowptr[cur_block_row]; + + T srcdata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + T attndata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + + // read kp mask + T cur_kp_mask = (kp_mask == nullptr) ? 0 : kp_mask[cur_row]; + + // read tensor data, attn mask + const int iter = (cur_block_nnz + WarpSize - 1) / WarpSize; + const T* srcptr = src + layout_rowptr[cur_block_row]; + T* attnptr = nullptr; + if (attn_mask != nullptr) { + const T* attnptr = attn_mask + cur_block_row * num_rows; + } + const int* colindex = layout_colindex + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + if ((attnptr != nullptr) && + std::abs(attnptr[colindex[cur_block_col]]) < + std::numeric_limits::epsilon()) { + srcdata[cur_reg_index] = + -std::numeric_limits::infinity() * scale + cur_kp_mask; + } else { + srcdata[cur_reg_index] = scale * srcptr[cur_block_col] + cur_kp_mask; + } + } else { + srcdata[cur_reg_index] = -std::numeric_limits::infinity(); + } + } + + // max value + T max_value = srcdata[0]; + const int kIteration = + (cur_block_nnz * BlockSize + WarpSize - 1) / WarpSize; +#pragma unroll + for (int it = 1; it < kIteration; ++it) { + max_value = (max_value > srcdata[it]) ? 
max_value : srcdata[it]; + } + WarpReduceMax(&max_value); + + // exp sum + T sum = 0; +#pragma unroll + for (int it = 0; it < kIteration; ++it) { + srcdata[it] = std::exp(srcdata[it] - max_value); + sum += srcdata[it]; + } + WarpReduceSum(&sum); + + // compute softmax and write out + T* softmaxptr = softmax + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + softmaxptr[cur_block_col] = srcdata[cur_reg_index] / sum; + } + } + } +} + +template +__global__ void BlockSparseSoftmaxBackward(T* dst, const T* grad, const T* src, + T scale, const int* layout_rowptr, + const int* layout_colindex, + int num_rows) { + // current thread related info + const int WarpSize = 32; + const int cur_row = blockIdx.x * blockDim.y + threadIdx.y; + if (cur_row < num_rows) { + const int cur_block_row = cur_row / BlockSize; + const int cur_block_nnz = + layout_rowptr[cur_block_row + 1] - layout_rowptr[cur_block_row]; + + T srcdata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + T graddata[(BlockSize * BlockNnzMax + WarpSize - 1) / WarpSize]; + + // read tensor data, attn mask + const int iter = (cur_block_nnz + WarpSize - 1) / WarpSize; + const T* srcptr = src + layout_rowptr[cur_block_row]; + const T* gradptr = grad + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + srcdata[cur_reg_index] = srcptr[cur_block_col]; + graddata[cur_reg_index] = gradptr[cur_block_col]; + } else { + srcdata[cur_reg_index] = 0; + graddata[cur_reg_index] = 0; + } + } + + T sum = 0; + const int kIteration = + (cur_block_nnz * BlockSize + WarpSize - 1) / WarpSize; +#pragma unroll + for (int it = 0; it < kIteration; ++it) { + sum += srcdata[it] * graddata[it]; + } + WarpReduceSum(&sum); + + // compute softmax and write out + T* dstptr = dst + layout_rowptr[cur_block_row]; + for (int j = 0; j < iter; j++) { + int cur_block_col = j * WarpSize + threadIdx.x; + int cur_reg_index = j; + if (cur_block_col < cur_block_nnz) { + dstptr[cur_block_col] = + scale * srcdata[cur_reg_index] * (graddata[cur_reg_index] - sum); + } + } + } +} + +using Tensor = framework::Tensor; +/* +input: sparse C in CSR format (num_rows,num_rows) +output: sparse C after softmax operation +*/ +template +void SparseSoftmaxForward(const platform::CUDADeviceContext& ctx, + const Tensor* offset, const Tensor* columns, + Tensor* input, Tensor* output, const int blocksize, + const int num_rows, const int num_cols) { + const int* offset_data = offset->data(); + const int* columns_data = columns->data(); + T* input_data = input->data(); + T* output_data = output->data(); + + const int block_size = 1; + dim3 blocks(32, 4, 1); + int grid = (num_rows * block_size + 3) / 4; + T scaling = static_cast(1.0) / sqrt(static_cast(num_cols)); + + const int block_nnz_max = 256; + BlockSparseSoftmaxForward<<>>( + output_data, input_data, scaling, nullptr, nullptr, offset_data, + columns_data, num_rows); +} + +template +void SparseSoftmaxBackward(const platform::CUDADeviceContext& ctx, + const Tensor* offset, const Tensor* columns, + Tensor* dx, const Tensor* dout, const Tensor* out, + const int blocksize, const int num_rows, + const int num_cols) { + const int* offset_data = offset->data(); + const int* columns_data = columns->data(); + T* dx_data = dx->data(); + const T* dout_data = dout->data(); + const T* out_data = 
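// Illustrative aside: the per-row math that BlockSparseSoftmaxForward/Backward
// implement over the CSR values, written as plain C++ (masks omitted, names
// invented). Forward is a numerically stable softmax of the scaled scores of
// one row; backward is the softmax JVP, dx_j = scale * y_j * (dy_j - sum_k
// y_k * dy_k). In the op, scale is 1 / sqrt(head_dim).
#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

void RowSoftmaxForward(std::vector<float>* vals, float scale) {
  float max_v = -std::numeric_limits<float>::infinity();
  for (float v : *vals) max_v = std::max(max_v, scale * v);
  float sum = 0.f;
  for (float& v : *vals) {
    v = std::exp(scale * v - max_v);  // subtract the max for stability
    sum += v;
  }
  for (float& v : *vals) v /= sum;
}

std::vector<float> RowSoftmaxBackward(const std::vector<float>& y,
                                      const std::vector<float>& dy,
                                      float scale) {
  float dot = 0.f;
  for (size_t i = 0; i < y.size(); ++i) dot += y[i] * dy[i];
  std::vector<float> dx(y.size());
  for (size_t i = 0; i < y.size(); ++i) dx[i] = scale * y[i] * (dy[i] - dot);
  return dx;
}

int main() {
  std::vector<float> row = {1.f, 2.f, 3.f};
  RowSoftmaxForward(&row, 0.5f);
  std::vector<float> grad = RowSoftmaxBackward(row, {0.f, 0.f, 1.f}, 0.5f);
  return grad.size() == 3 ? 0 : 1;
}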
out->data(); + + const int block_size = 1; + dim3 blocks(32, 4, 1); + int grid = (num_rows * block_size + 3) / 4; + T scaling = static_cast(1.0) / sqrt(static_cast(num_cols)); + + const int block_nnz_max = 256; + BlockSparseSoftmaxBackward<<>>( + dx_data, dout_data, out_data, scaling, offset_data, columns_data, + num_rows); +} + +using VarType = framework::proto::VarType; +inline cudaDataType_t GetGpuType(const VarType::Type data_type) { + if (data_type == VarType::FP32) { + return CUDA_R_32F; + } else if (data_type == VarType::FP64) { + return CUDA_R_64F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Not support tensor type in sparse_attention OP: %s", + framework::DataTypeToString(data_type))); + } +} + +inline cusparseOperation_t GetTransposeOperation(const bool transpose) { + if (transpose) { + return CUSPARSE_OPERATION_TRANSPOSE; + } else { + return CUSPARSE_OPERATION_NON_TRANSPOSE; + } +} + +void CusparseDestroy(cusparseDnMatDescr_t* dn_mat_first, + cusparseDnMatDescr_t* dn_mat_second, + cusparseSpMatDescr_t* sp_mat) { + platform::dynload::cusparseDestroyDnMat(*dn_mat_first); + platform::dynload::cusparseDestroyDnMat(*dn_mat_second); + platform::dynload::cusparseDestroySpMat(*sp_mat); +} + +/* +input: dense A (num_rows,num_cols), dense B (num_rows,num_cols) +output: sparse C in CSR format (num_rows,num_rows) +*/ +template +void DotSdd(const platform::CUDADeviceContext& ctx, const Tensor* a, + const Tensor* b, const Tensor* c_offset, const Tensor* c_columns, + Tensor* c_value, const int num_rows, const int num_cols, + const bool a_transpose, const bool b_transpose) { + const T* a_data = a->data(); + const T* b_data = b->data(); + const int* c_offset_data = c_offset->data(); + const int* c_columns_data = c_columns->data(); + T* c_value_data = c_value->data(); + + cudaDataType_t gpu_type = GetGpuType(c_value->type()); + cusparseHandle_t handle = nullptr; + cusparseDnMatDescr_t mat_a, mat_b; + cusparseSpMatDescr_t mat_c; + platform::dynload::cusparseCreate(&handle); + + // Create dense matrix A + platform::dynload::cusparseCreateDnMat(&mat_a, num_rows, num_cols, num_cols, + const_cast(a_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create dense matrix B + platform::dynload::cusparseCreateDnMat(&mat_b, num_rows, num_cols, num_cols, + const_cast(b_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create sparse matrix C in CSR format + int c_nnz = c_columns->dims()[1]; + platform::dynload::cusparseCreateCsr( + &mat_c, num_rows, num_rows, c_nnz, const_cast(c_offset_data), + const_cast(c_columns_data), c_value_data, CUSPARSE_INDEX_32I, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, gpu_type); + + T alpha = 1; + T beta = 0; + + size_t buffer_size = 0; + platform::dynload::cusparseSDDMM_bufferSize( + handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, mat_a, mat_b, &beta, mat_c, + gpu_type, CUSPARSE_SDDMM_ALG_DEFAULT, &buffer_size); + auto d_buffer_ptr = paddle::memory::Alloc(ctx, buffer_size); + void* d_buffer = static_cast(d_buffer_ptr->ptr()); + + platform::dynload::cusparseSDDMM(handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, + mat_a, mat_b, &beta, mat_c, gpu_type, + CUSPARSE_SDDMM_ALG_DEFAULT, d_buffer); + + CusparseDestroy(&mat_a, &mat_b, &mat_c); + platform::dynload::cusparseDestroy(handle); +} + +/* +input: sparse A in CSR format (num_rows,num_rows), dense B (num_rows,num_cols) +output: dense C (num_rows,num_cols) +*/ +template +void DotDsd(const platform::CUDADeviceContext& ctx, const Tensor* 
a_offset, + const Tensor* a_columns, const Tensor* a_value, const Tensor* b, + Tensor* c, const int num_rows, const int num_cols, + const bool a_transpose, const bool b_transpose) { + const int* a_offset_data = a_offset->data(); + const int* a_columns_data = a_columns->data(); + const T* a_value_data = a_value->data(); + const T* b_data = b->data(); + T* c_data = c->data(); + + cudaDataType_t gpu_type = GetGpuType(c->type()); + cusparseHandle_t handle = nullptr; + cusparseSpMatDescr_t mat_a; + cusparseDnMatDescr_t mat_b, mat_c; + platform::dynload::cusparseCreate(&handle); + + // Create sparse matrix A in CSR format + int a_nnz = a_columns->dims()[1]; + platform::dynload::cusparseCreateCsr( + &mat_a, num_rows, num_rows, a_nnz, const_cast(a_offset_data), + const_cast(a_columns_data), const_cast(a_value_data), + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, + gpu_type); + + // Create dense matrix B + platform::dynload::cusparseCreateDnMat(&mat_b, num_rows, num_cols, num_cols, + const_cast(b_data), gpu_type, + CUSPARSE_ORDER_ROW); + // Create dense matrix C + platform::dynload::cusparseCreateDnMat(&mat_c, num_rows, num_cols, num_cols, + c_data, gpu_type, CUSPARSE_ORDER_ROW); + + T alpha = 1; + T beta = 0; + + size_t buffer_size = 0; + // allocate an external buffer if needed + platform::dynload::cusparseSpMM_bufferSize( + handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, mat_a, mat_b, &beta, mat_c, + gpu_type, CUSPARSE_SPMM_ALG_DEFAULT, &buffer_size); + auto d_buffer_ptr = paddle::memory::Alloc(ctx, buffer_size); + void* d_buffer = static_cast(d_buffer_ptr->ptr()); + + platform::dynload::cusparseSpMM(handle, GetTransposeOperation(a_transpose), + GetTransposeOperation(b_transpose), &alpha, + mat_a, mat_b, &beta, mat_c, gpu_type, + CUSPARSE_SPMM_ALG_DEFAULT, d_buffer); + + CusparseDestroy(&mat_b, &mat_c, &mat_a); + platform::dynload::cusparseDestroy(handle); +} + +std::vector GetSplitTensor(Tensor* input) { + auto dims = input->dims(); + int batch_size = dims[0]; + int num_heads = dims[1]; + std::vector new_dims(dims.size() - 1); + new_dims[0] = batch_size * num_heads; + for (int i = 1; i < new_dims.size(); i++) { + new_dims[i] = dims[i + 1]; + } + input->Resize(framework::make_ddim(new_dims)); + return input->Split(1, 0); +} + +template +class SparseAttentionCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto query = *ctx.Input("Q"); + auto key = *ctx.Input("K"); + auto value = *ctx.Input("V"); + auto offset = *ctx.Input("Offset"); + auto columns = *ctx.Input("Columns"); + auto output_ptr = ctx.Output("Out"); + output_ptr->mutable_data(ctx.GetPlace()); + auto sparse_dot_sdd_ptr = ctx.Output("SparseDotSdd"); + sparse_dot_sdd_ptr->mutable_data(ctx.GetPlace()); + auto softmax_ptr = ctx.Output("Softmax"); + softmax_ptr->mutable_data(ctx.GetPlace()); + + auto output = *output_ptr; + auto result_sdd = *sparse_dot_sdd_ptr; + auto result_softmax = *softmax_ptr; + + auto query_dims = query.dims(); + int batch_size = query_dims[0]; + int num_heads = query_dims[1]; + int M = query_dims[2]; + int N = query_dims[3]; + + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector result_sdd_lists = GetSplitTensor(&result_sdd); + std::vector result_softmax_lists 
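// Illustrative aside: plain-C++ reference versions (not the cuSPARSE path) of
// the two products the forward kernel chains per (batch, head). DotSdd is an
// SDDMM, evaluating Q * K^T only at the CSR non-zeros, and DotDsd is an SpMM
// that multiplies the resulting CSR matrix by the dense V. Here M stands for
// target_len and N for head_dim; the function names are invented.
#include <vector>

// C_vals[idx] = (Q * K^T) sampled at the CSR pattern (offset, columns).
void SddmmRef(const std::vector<float>& Q, const std::vector<float>& K,
              const std::vector<int>& offset, const std::vector<int>& columns,
              std::vector<float>* C_vals, int M, int N) {
  C_vals->assign(columns.size(), 0.f);
  for (int r = 0; r < M; ++r) {
    for (int idx = offset[r]; idx < offset[r + 1]; ++idx) {
      int c = columns[idx];
      for (int k = 0; k < N; ++k) {
        (*C_vals)[idx] += Q[r * N + k] * K[c * N + k];  // row c of K is col c of K^T
      }
    }
  }
}

// Out (M x N, dense) = CSR(A) * V.
void SpmmRef(const std::vector<int>& offset, const std::vector<int>& columns,
             const std::vector<float>& A_vals, const std::vector<float>& V,
             std::vector<float>* Out, int M, int N) {
  Out->assign(M * N, 0.f);
  for (int r = 0; r < M; ++r) {
    for (int idx = offset[r]; idx < offset[r + 1]; ++idx) {
      for (int k = 0; k < N; ++k) {
        (*Out)[r * N + k] += A_vals[idx] * V[columns[idx] * N + k];
      }
    }
  }
}

int main() {
  // 2x2 example with a fully dense CSR pattern: offset {0,2,4}, columns {0,1,0,1}
  std::vector<float> scores, out;
  SddmmRef({1.f, 0.f, 0.f, 1.f}, {1.f, 2.f, 3.f, 4.f}, {0, 2, 4}, {0, 1, 0, 1},
           &scores, 2, 2);
  SpmmRef({0, 2, 4}, {0, 1, 0, 1}, scores, {1.f, 0.f, 0.f, 1.f}, &out, 2, 2);
  return out.size() == 4 ? 0 : 1;
}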
= GetSplitTensor(&result_softmax); + std::vector output_lists = GetSplitTensor(&output); + + const auto& dev_ctx = ctx.cuda_device_context(); + const int iter_num = batch_size * num_heads; + for (int i = 0; i < iter_num; i++) { + DotSdd(dev_ctx, &query_lists[i], &key_lists[i], + &offset_lists[i], &columns_lists[i], + &result_sdd_lists[i], M, N, false, true); + + SparseSoftmaxForward( + dev_ctx, &offset_lists[i], &columns_lists[i], &result_sdd_lists[i], + &result_softmax_lists[i], 1, M, N); + + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &result_softmax_lists[i], &value_lists[i], + &output_lists[i], M, N, false, false); + } + } +}; + +template +class SparseAttentionGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto query = *ctx.Input("Q"); + auto key = *ctx.Input("K"); + auto value = *ctx.Input("V"); + auto offset = *ctx.Input("Offset"); + auto columns = *ctx.Input("Columns"); + auto sparse_dot_sdd = *ctx.Input("SparseDotSdd"); + auto softmax = *ctx.Input("Softmax"); + auto dout = *ctx.Input(framework::GradVarName("Out")); + auto* dquery_ptr = ctx.Output(framework::GradVarName("Q")); + auto* dkey_ptr = ctx.Output(framework::GradVarName("K")); + auto* dvalue_ptr = ctx.Output(framework::GradVarName("V")); + dquery_ptr->mutable_data(ctx.GetPlace()); + dkey_ptr->mutable_data(ctx.GetPlace()); + dvalue_ptr->mutable_data(ctx.GetPlace()); + auto dquery = *dquery_ptr; + auto dkey = *dkey_ptr; + auto dvalue = *dvalue_ptr; + + auto query_dims = query.dims(); + int batch_size = query_dims[0]; + int num_heads = query_dims[1]; + int M = query_dims[2]; + int N = query_dims[3]; + + std::vector query_lists = GetSplitTensor(&query); + std::vector key_lists = GetSplitTensor(&key); + std::vector value_lists = GetSplitTensor(&value); + std::vector offset_lists = GetSplitTensor(&offset); + std::vector columns_lists = GetSplitTensor(&columns); + std::vector sparse_dot_sdd_lists = GetSplitTensor(&sparse_dot_sdd); + std::vector softmax_lists = GetSplitTensor(&softmax); + std::vector dout_lists = GetSplitTensor(&dout); + std::vector dquery_lists = GetSplitTensor(&dquery); + std::vector dkey_lists = GetSplitTensor(&dkey); + std::vector dvalue_lists = GetSplitTensor(&dvalue); + + const int iter_num = batch_size * num_heads; + const auto& dev_ctx = ctx.cuda_device_context(); + for (int i = 0; i < iter_num; i++) { + // dValue = transpose(result_softmax) * dOut + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &softmax_lists[i], &dout_lists[i], + &dvalue_lists[i], M, N, true, false); + + // dSoftmax = dOut * transpose(Value) + int nnz_num = columns.dims()[0]; + Tensor dsoftmax; + dsoftmax.Resize({nnz_num}); + dsoftmax.mutable_data(ctx.GetPlace()); + DotSdd(dev_ctx, &dout_lists[i], &value_lists[i], + &offset_lists[i], &columns_lists[i], &dsoftmax, + M, N, false, true); + + // dSparseDotSdd = dSoftmax * softmax'(SparseDotSdd) + Tensor dsparse_dot_sdd; + dsparse_dot_sdd.Resize({nnz_num}); + dsparse_dot_sdd.mutable_data(ctx.GetPlace()); + SparseSoftmaxBackward( + dev_ctx, &offset_lists[i], &columns_lists[i], &dsparse_dot_sdd, + &dsoftmax, &softmax_lists[i], 1, M, N); + + // dQuery = dSparseDotSdd * Key + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &dsparse_dot_sdd, &key_lists[i], + &dquery_lists[i], M, N, false, false); + + // dKey = transpose(dSparseDotSdd) * Query + DotDsd(dev_ctx, &offset_lists[i], &columns_lists[i], + &dsparse_dot_sdd, &query_lists[i], + &dkey_lists[i], M, N, true, false); + } + } +}; + +} // 
namespace operators +} // namespace paddle +REGISTER_OP_CUDA_KERNEL( + sparse_attention, + ops::SparseAttentionCUDAKernel, + ops::SparseAttentionCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + sparse_attention_grad, + ops::SparseAttentionGradCUDAKernel, + ops::SparseAttentionGradCUDAKernel); diff --git a/paddle/fluid/operators/spectral_helper.h b/paddle/fluid/operators/spectral_helper.h new file mode 100644 index 00000000000000..9c34d500eac92a --- /dev/null +++ b/paddle/fluid/operators/spectral_helper.h @@ -0,0 +1,261 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/operators/spectral_op.h" + +#ifdef PADDLE_WITH_HIP +#include "paddle/fluid/platform/dynload/hipfft.h" +#endif + +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/dynload/cufft.h" +#endif + +namespace paddle { +namespace operators { +using ScalarType = framework::proto::VarType::Type; +const int64_t kMaxCUFFTNdim = 3; +const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1; +// This struct is used to easily compute hashes of the +// parameters. It will be the **key** to the plan cache. +struct PlanKey { + // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3 + int64_t signal_ndim_; + // These include additional batch dimension as well. + int64_t sizes_[kMaxDataNdim]; + int64_t input_shape_[kMaxDataNdim]; + int64_t output_shape_[kMaxDataNdim]; + FFTTransformType fft_type_; + ScalarType value_type_; + + PlanKey() = default; + + PlanKey(const std::vector& in_shape, + const std::vector& out_shape, + const std::vector& signal_size, FFTTransformType fft_type, + ScalarType value_type) { + // Padding bits must be zeroed for hashing + memset(this, 0, sizeof(*this)); + signal_ndim_ = signal_size.size() - 1; + fft_type_ = fft_type; + value_type_ = value_type; + + std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); + std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); + std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); + } +}; + +#if defined(PADDLE_WITH_CUDA) +// An RAII encapsulation of cuFFTHandle +class CuFFTHandle { + ::cufftHandle handle_; + + public: + CuFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftCreate(&handle_)); + } + + ::cufftHandle& get() { return handle_; } + const ::cufftHandle& get() const { return handle_; } + + ~CuFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftDestroy(handle_)); + } +}; + +using plan_size_type = long long int; // NOLINT +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class CuFFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
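// Illustrative aside on the plan-cache key: PlanKey zeroes itself with memset
// before filling its fields because the cache hashes and compares keys as raw
// bytes, so padding bytes must be deterministic. A toy standalone version
// (the struct and helper names below are invented):
#include <cstdint>
#include <cstring>
#include <functional>
#include <string_view>

struct ToyPlanKey {
  int signal_ndim;   // 4 bytes, typically followed by 4 padding bytes
  int64_t sizes[3];
  ToyPlanKey(int ndim, const int64_t* s) {
    std::memset(this, 0, sizeof(*this));  // make the padding bytes deterministic
    signal_ndim = ndim;
    for (int i = 0; i < ndim; ++i) sizes[i] = s[i];
  }
};

// Byte-wise hash; well defined only because every byte was zeroed above.
inline size_t HashKey(const ToyPlanKey& key) {
  return std::hash<std::string_view>{}(
      std::string_view(reinterpret_cast<const char*>(&key), sizeof(key)));
}

int main() {
  const int64_t sizes[] = {8, 224, 224};
  ToyPlanKey a(3, sizes), b(3, sizes);
  return HashKey(a) == HashKey(b) ? 0 : 1;  // logically equal keys hash equally
}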
+ explicit CuFFTConfig(const PlanKey& plan_key) + : CuFFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + CuFFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + cudaDataType itype, otype, exec_type; + const auto complex_input = has_complex_input(fft_type); + const auto complex_output = has_complex_output(fft_type); + if (dtype == framework::proto::VarType::FP32) { + itype = complex_input ? CUDA_C_32F : CUDA_R_32F; + otype = complex_output ? CUDA_C_32F : CUDA_R_32F; + exec_type = CUDA_C_32F; + } else if (dtype == framework::proto::VarType::FP64) { + itype = complex_input ? CUDA_C_64F : CUDA_R_64F; + otype = complex_output ? CUDA_C_64F : CUDA_R_64F; + exec_type = CUDA_C_64F; + } else if (dtype == framework::proto::VarType::FP16) { + itype = complex_input ? CUDA_C_16F : CUDA_R_16F; + otype = complex_output ? CUDA_C_16F : CUDA_R_16F; + exec_type = CUDA_C_16F; + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "cuFFT only support transforms of type float16, float32 and " + "float64")); + } + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, + batch, &ws_size_t, exec_type)); + + ws_size = ws_size_t; + } + + const cufftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + CuFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; + +#elif defined(PADDLE_WITH_HIP) +// An RAII encapsulation of cuFFTHandle +class HIPFFTHandle { + ::hipfftHandle handle_; + + public: + HIPFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftCreate(&handle_)); + } + + ::hipfftHandle& get() { return handle_; } + const ::hipfftHandle& get() const { return handle_; } + + ~HIPFFTHandle() { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftDestroy(handle_)); + } +}; +using plan_size_type = int; +// This class contains all the information needed to execute a cuFFT plan: +// 1. the plan +// 2. the workspace size needed +class HIPFFTConfig { + public: + // Only move semantics is enought for this class. Although we already use + // unique_ptr for the plan, still remove copy constructor and assignment op so + // we don't accidentally copy and take perf hit. 
+ explicit HIPFFTConfig(const PlanKey& plan_key) + : HIPFFTConfig( + std::vector(plan_key.sizes_, + plan_key.sizes_ + plan_key.signal_ndim_ + 1), + plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} + + // sizes are full signal, including batch size and always two-sided + HIPFFTConfig(const std::vector& sizes, const int64_t signal_ndim, + FFTTransformType fft_type, ScalarType dtype) + : fft_type_(fft_type), value_type_(dtype) { + // signal sizes (excluding batch dim) + std::vector signal_sizes(sizes.begin() + 1, sizes.end()); + + // input batch size + const auto batch = static_cast(sizes[0]); + // const int64_t signal_ndim = sizes.size() - 1; + PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, + platform::errors::InvalidArgument( + "The signal_ndim must be equal to sizes.size() - 1," + "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", + signal_ndim, sizes.size() - 1)); + + hipfftType exec_type = [&] { + if (dtype == framework::proto::VarType::FP32) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_C2C; + case FFTTransformType::R2C: + return HIPFFT_R2C; + case FFTTransformType::C2R: + return HIPFFT_C2R; + } + } else if (dtype == framework::proto::VarType::FP64) { + switch (fft_type) { + case FFTTransformType::C2C: + return HIPFFT_Z2Z; + case FFTTransformType::R2C: + return HIPFFT_D2Z; + case FFTTransformType::C2R: + return HIPFFT_Z2D; + } + } + PADDLE_THROW(platform::errors::InvalidArgument( + "hipFFT only support transforms of type float32 and float64")); + }(); + + // disable auto allocation of workspace to use allocator from the framework + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetAutoAllocation( + plan(), /* autoAllocate */ 0)); + + size_t ws_size_t; + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftMakePlanMany( + plan(), signal_ndim, signal_sizes.data(), + /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, + /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, + batch, &ws_size_t)); + + ws_size = ws_size_t; + } + + const hipfftHandle& plan() const { return plan_ptr.get(); } + + FFTTransformType transform_type() const { return fft_type_; } + ScalarType data_type() const { return value_type_; } + size_t workspace_size() const { return ws_size; } + + private: + HIPFFTHandle plan_ptr; + size_t ws_size; + FFTTransformType fft_type_; + ScalarType value_type_; +}; +#endif +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/spectral_op.cc b/paddle/fluid/operators/spectral_op.cc index fb50702233b3ba..b5edc1dda533b0 100644 --- a/paddle/fluid/operators/spectral_op.cc +++ b/paddle/fluid/operators/spectral_op.cc @@ -27,7 +27,7 @@ #include "paddle/fluid/platform/complex.h" #if defined(PADDLE_WITH_ONEMKL) -#include +#include "paddle/fluid/platform/dynload/mklrt.h" #elif defined(PADDLE_WITH_POCKETFFT) #include "extern_pocketfft/pocketfft_hdronly.h" #endif @@ -357,46 +357,45 @@ FFTNormMode get_norm_from_string(const std::string& norm, bool forward) { // FFT Functors #if defined(PADDLE_WITH_ONEMKL) +#define MKL_DFTI_CHECK(expr) \ + do { \ + MKL_LONG status = (expr); \ + if (!platform::dynload::DftiErrorClass(status, DFTI_NO_ERROR)) \ + PADDLE_THROW(platform::errors::External( \ + platform::dynload::DftiErrorMessage(status))); \ + } while (0); + namespace { -static inline void MKL_DFTI_CHECK(MKL_INT status) { - if (status && !DftiErrorClass(status, DFTI_NO_ERROR)) { - PADDLE_THROW(platform::errors::External(DftiErrorMessage(status))); - } -} struct DftiDescriptorDeleter { void 
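// Illustrative aside: the ownership pattern behind DftiDescriptorDeleter, as a
// generic standalone sketch (all names below are invented). A std::unique_ptr
// with a custom deleter owns a C-style handle, so the matching destroy call
// runs exactly once even on early returns.
#include <memory>

struct FakeHandle { int id; };
FakeHandle* CreateHandle() { return new FakeHandle{42}; }
void DestroyHandle(FakeHandle* h) { delete h; }

struct HandleDeleter {
  void operator()(FakeHandle* h) const {
    if (h != nullptr) DestroyHandle(h);  // mirrors the null check in the deleter
  }
};

class Descriptor {
 public:
  void init() { handle_.reset(CreateHandle()); }
  FakeHandle* get() const { return handle_.get(); }

 private:
  std::unique_ptr<FakeHandle, HandleDeleter> handle_;
};

int main() {
  Descriptor d;
  d.init();                          // acquire
  return d.get()->id == 42 ? 0 : 1;  // released when d goes out of scope
}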
operator()(DFTI_DESCRIPTOR_HANDLE handle) { if (handle != nullptr) { - MKL_DFTI_CHECK(DftiFreeDescriptor(&handle)); + MKL_DFTI_CHECK(platform::dynload::DftiFreeDescriptor(&handle)); } } }; +// A RAII wrapper for MKL_DESCRIPTOR* class DftiDescriptor { public: void init(DFTI_CONFIG_VALUE precision, DFTI_CONFIG_VALUE signal_type, MKL_LONG signal_ndim, MKL_LONG* sizes) { - if (desc_ != nullptr) { - PADDLE_THROW(platform::errors::AlreadyExists( - "DFT DESCRIPTOR can only be initialized once.")); - } + PADDLE_ENFORCE_EQ(desc_.get(), nullptr, + platform::errors::AlreadyExists( + "DftiDescriptor has already been initialized.")); + DFTI_DESCRIPTOR* raw_desc; - if (signal_ndim == 1) { - MKL_DFTI_CHECK( - DftiCreateDescriptor(&raw_desc, precision, signal_type, 1, sizes[0])); - } else { - MKL_DFTI_CHECK(DftiCreateDescriptor(&raw_desc, precision, signal_type, - signal_ndim, sizes)); - } + MKL_DFTI_CHECK(platform::dynload::DftiCreateDescriptorX( + &raw_desc, precision, signal_type, signal_ndim, sizes)); desc_.reset(raw_desc); } DFTI_DESCRIPTOR* get() const { - if (desc_ == nullptr) { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "DFTI DESCRIPTOR has not been initialized.")); - } - return desc_.get(); + DFTI_DESCRIPTOR* raw_desc = desc_.get(); + PADDLE_ENFORCE_NOT_NULL(raw_desc, + platform::errors::PreconditionNotMet( + "DFTI DESCRIPTOR has not been initialized.")); + return raw_desc; } private: @@ -421,7 +420,9 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, return DFTI_DOUBLE; default: PADDLE_THROW(platform::errors::InvalidArgument( - "Input data type should be FP32, FP64, COMPLEX64 or COMPLEX128.")); + "Invalid input datatype (%s), input data type should be FP32, " + "FP64, COMPLEX64 or COMPLEX128.", + framework::DataTypeToString(in_dtype))); } }(); @@ -430,35 +431,27 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, const DFTI_CONFIG_VALUE domain = (fft_type == FFTTransformType::C2C) ? DFTI_COMPLEX : DFTI_REAL; - // const bool complex_input = framework::IsComplexType(in_dtype); - // const bool complex_output = framework::IsComplexType(out_dtype); - // const DFTI_CONFIG_VALUE domain = [&] { - // if (forward) { - // return complex_input ? DFTI_COMPLEX : DFTI_REAL; - // } else { - // return complex_output ? 
DFTI_COMPLEX : DFTI_REAL; - // } - // }(); - DftiDescriptor descriptor; std::vector fft_sizes(signal_sizes.cbegin(), signal_sizes.cend()); const MKL_LONG signal_ndim = fft_sizes.size() - 1; descriptor.init(precision, domain, signal_ndim, fft_sizes.data() + 1); // placement inplace or not inplace - MKL_DFTI_CHECK( - DftiSetValue(descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_PLACEMENT, DFTI_NOT_INPLACE)); // number of transformations const MKL_LONG batch_size = fft_sizes[0]; - MKL_DFTI_CHECK( - DftiSetValue(descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_NUMBER_OF_TRANSFORMS, batch_size)); // input & output distance const MKL_LONG idist = in_strides[0]; const MKL_LONG odist = out_strides[0]; - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_INPUT_DISTANCE, idist)); - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_DISTANCE, odist)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), + DFTI_INPUT_DISTANCE, idist)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), + DFTI_OUTPUT_DISTANCE, odist)); // input & output stride std::vector mkl_in_stride(1 + signal_ndim, 0); @@ -467,15 +460,15 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, mkl_in_stride[i] = in_strides[i]; mkl_out_stride[i] = out_strides[i]; } - MKL_DFTI_CHECK( - DftiSetValue(descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_OUTPUT_STRIDES, - mkl_out_stride.data())); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_INPUT_STRIDES, mkl_in_stride.data())); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_OUTPUT_STRIDES, mkl_out_stride.data())); // conjugate even storage if (!(fft_type == FFTTransformType::C2C)) { - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, - DFTI_COMPLEX_COMPLEX)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue( + descriptor.get(), DFTI_CONJUGATE_EVEN_STORAGE, DFTI_COMPLEX_COMPLEX)); } MKL_LONG signal_numel = @@ -496,11 +489,12 @@ DftiDescriptor _plan_mkl_fft(const framework::proto::VarType::Type& in_dtype, return DFTI_BACKWARD_SCALE; } }(); - MKL_DFTI_CHECK(DftiSetValue(descriptor.get(), scale_direction, scale)); + MKL_DFTI_CHECK(platform::dynload::DftiSetValue(descriptor.get(), + scale_direction, scale)); } // commit the descriptor - MKL_DFTI_CHECK(DftiCommitDescriptor(descriptor.get())); + MKL_DFTI_CHECK(platform::dynload::DftiCommitDescriptor(descriptor.get())); return descriptor; } @@ -592,15 +586,16 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, collapsed_input.numel(), collapsed_input_conj.data()); for_range(functor); - MKL_DFTI_CHECK(DftiComputeBackward(desc.get(), - collapsed_input_conj.data(), - collapsed_output.data())); + MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + desc.get(), collapsed_input_conj.data(), + collapsed_output.data())); } else if (fft_type == FFTTransformType::R2C && !forward) { framework::Tensor collapsed_output_conj(collapsed_output.type()); collapsed_output_conj.mutable_data(collapsed_output.dims(), ctx.GetPlace()); - MKL_DFTI_CHECK(DftiComputeForward(desc.get(), collapsed_input.data(), - collapsed_output_conj.data())); + MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), + collapsed_output_conj.data())); // conjugate 
the output platform::ForRange for_range(ctx, collapsed_output.numel()); math::ConjFunctor functor(collapsed_output_conj.data(), @@ -609,13 +604,13 @@ void exec_fft(const DeviceContext& ctx, const Tensor* x, Tensor* out, for_range(functor); } else { if (forward) { - MKL_DFTI_CHECK(DftiComputeForward(desc.get(), - collapsed_input.data(), - collapsed_output.data())); + MKL_DFTI_CHECK(platform::dynload::DftiComputeForward( + desc.get(), collapsed_input.data(), + collapsed_output.data())); } else { - MKL_DFTI_CHECK(DftiComputeBackward(desc.get(), - collapsed_input.data(), - collapsed_output.data())); + MKL_DFTI_CHECK(platform::dynload::DftiComputeBackward( + desc.get(), collapsed_input.data(), + collapsed_output.data())); } } diff --git a/paddle/fluid/operators/spectral_op.cu b/paddle/fluid/operators/spectral_op.cu index 9aa5ca39d737e0..e8a4fac2915d7c 100644 --- a/paddle/fluid/operators/spectral_op.cu +++ b/paddle/fluid/operators/spectral_op.cu @@ -8,10 +8,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ - -#include -#include - #include #include #include @@ -24,313 +20,246 @@ #include #include "paddle/fluid/operators/conj_op.h" +#include "paddle/fluid/operators/spectral_helper.h" #include "paddle/fluid/operators/spectral_op.h" #include "paddle/fluid/operators/transpose_op.h" -#include "paddle/fluid/platform/dynload/cufft.h" +#include "paddle/fluid/platform/enforce.h" namespace paddle { namespace operators { namespace { -using ScalarType = framework::proto::VarType::Type; -const int64_t kMaxCUFFTNdim = 3; -const int64_t kMaxDataNdim = kMaxCUFFTNdim + 1; - -static inline std::string get_cufft_error_info(cufftResult error) { - switch (error) { - case CUFFT_SUCCESS: - return "CUFFT_SUCCESS"; - case CUFFT_INVALID_PLAN: - return "CUFFT_INVALID_PLAN"; - case CUFFT_ALLOC_FAILED: - return "CUFFT_ALLOC_FAILED"; - case CUFFT_INVALID_TYPE: - return "CUFFT_INVALID_TYPE"; - case CUFFT_INVALID_VALUE: - return "CUFFT_INVALID_VALUE"; - case CUFFT_INTERNAL_ERROR: - return "CUFFT_INTERNAL_ERROR"; - case CUFFT_EXEC_FAILED: - return "CUFFT_EXEC_FAILED"; - case CUFFT_SETUP_FAILED: - return "CUFFT_SETUP_FAILED"; - case CUFFT_INVALID_SIZE: - return "CUFFT_INVALID_SIZE"; - case CUFFT_UNALIGNED_DATA: - return "CUFFT_UNALIGNED_DATA"; - case CUFFT_INCOMPLETE_PARAMETER_LIST: - return "CUFFT_INCOMPLETE_PARAMETER_LIST"; - case CUFFT_INVALID_DEVICE: - return "CUFFT_INVALID_DEVICE"; - case CUFFT_PARSE_ERROR: - return "CUFFT_PARSE_ERROR"; - case CUFFT_NO_WORKSPACE: - return "CUFFT_NO_WORKSPACE"; - case CUFFT_NOT_IMPLEMENTED: - return "CUFFT_NOT_IMPLEMENTED"; -#ifndef __HIPCC__ - case CUFFT_LICENSE_ERROR: - return "CUFFT_LICENSE_ERROR"; -#endif - case CUFFT_NOT_SUPPORTED: - return "CUFFT_NOT_SUPPORTED"; - default: - std::ostringstream ss; - ss << "unknown error " << error; - return ss.str(); +// Calculates the normalization constant +double fft_normalization_scale(FFTNormMode normalization, + const std::vector& sizes, + const std::vector& dims) { + // auto norm = static_cast(normalization); + if (normalization == FFTNormMode::none) { + return static_cast(1.0); } -} -static inline void CUFFT_CHECK(cufftResult error) { - if (error != CUFFT_SUCCESS) { - PADDLE_THROW(platform::errors::External(get_cufft_error_info(error))); + int64_t signal_numel = 1; + for (auto dim : dims) { + signal_numel *= sizes[dim]; } + const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) + ? 
std::sqrt(signal_numel) + : static_cast(signal_numel); + return static_cast(1.0 / scale_denom); } -// This struct is used to easily compute hashes of the -// parameters. It will be the **key** to the plan cache. -struct PlanKey { - // between 1 and kMaxCUFFTNdim, i.e., 1 <= signal_ndim <= 3 - int64_t signal_ndim_; - // These include additional batch dimension as well. - int64_t sizes_[kMaxDataNdim]; - int64_t input_shape_[kMaxDataNdim]; - int64_t output_shape_[kMaxDataNdim]; - FFTTransformType fft_type_; - ScalarType value_type_; - - PlanKey() = default; - - PlanKey(const std::vector& in_shape, - const std::vector& out_shape, - const std::vector& signal_size, FFTTransformType fft_type, - ScalarType value_type) { - // Padding bits must be zeroed for hashing - memset(this, 0, sizeof(*this)); - signal_ndim_ = signal_size.size() - 1; - fft_type_ = fft_type; - value_type_ = value_type; - - std::copy(signal_size.cbegin(), signal_size.cend(), sizes_); - std::copy(in_shape.cbegin(), in_shape.cend(), input_shape_); - std::copy(out_shape.cbegin(), out_shape.cend(), output_shape_); +template +void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, + FFTNormMode normalization, + const std::vector& sizes, + const std::vector& axes) { + double scale = fft_normalization_scale(normalization, sizes, axes); + if (scale != 1.0) { + auto eigen_out = framework::EigenVector::Flatten(*out); + auto eigen_in = framework::EigenVector::Flatten(*in); + auto dev = ctx.eigen_device(); + EigenScale::Eval(*dev, eigen_out, eigen_in, + static_cast(scale), + static_cast(0), false); + } else { + framework::TensorCopy(*in, ctx.GetPlace(), out); } -}; - -// An RAII encapsulation of cuFFTHandle -class CuFFTHandle { - ::cufftHandle handle_; - - public: - CuFFTHandle() { CUFFT_CHECK(platform::dynload::cufftCreate(&handle_)); } +} - ::cufftHandle& get() { return handle_; } - const ::cufftHandle& get() const { return handle_; } +#if defined(PADDLE_WITH_CUDA) +CuFFTConfig create_cufft_config(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = framework::IsComplexType(input.type()) + ? framework::ToRealType(input.type()) + : input.type(); + auto fft_type = GetFFTTransformType(input.type(), output.type()); + // signal sizes + std::vector signal_size(signal_ndim + 1); - ~CuFFTHandle() { -// Not using fftDestroy() for rocFFT to work around double freeing of handles -#ifndef __HIPCC__ - CUFFT_CHECK(platform::dynload::cufftDestroy(handle_)); -#endif + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); } -}; + PlanKey key(framework::vectorize(input.dims()), + framework::vectorize(output.dims()), signal_size, fft_type, + value_type); -#ifdef __HIPCC__ -using plan_size_type = int; -#else -using plan_size_type = long long int; // NOLINT -#endif + return CuFFTConfig(key); +} -// This class contains all the information needed to execute a cuFFT plan: -// 1. the plan -// 2. the workspace size needed -class CuFFTConfig { - public: - // Only move semantics is enought for this class. Although we already use - // unique_ptr for the plan, still remove copy constructor and assignment op so - // we don't accidentally copy and take perf hit. 
- CuFFTConfig(const CuFFTConfig&) = delete; - CuFFTConfig& operator=(CuFFTConfig const&) = delete; - - explicit CuFFTConfig(const PlanKey& plan_key) - : CuFFTConfig( - std::vector(plan_key.sizes_, - plan_key.sizes_ + plan_key.signal_ndim_ + 1), - plan_key.signal_ndim_, plan_key.fft_type_, plan_key.value_type_) {} - - // sizes are full signal, including batch size and always two-sided - CuFFTConfig(const std::vector& sizes, const int64_t signal_ndim, - FFTTransformType fft_type, ScalarType dtype) - : fft_type_(fft_type), value_type_(dtype) { - // signal sizes (excluding batch dim) - std::vector signal_sizes(sizes.begin() + 1, sizes.end()); - - // input batch size - const auto batch = static_cast(sizes[0]); - // const int64_t signal_ndim = sizes.size() - 1; - PADDLE_ENFORCE_EQ(signal_ndim, sizes.size() - 1, - platform::errors::InvalidArgument( - "The signal_ndim must be equal to sizes.size() - 1," - "But signal_ndim is: [%d], sizes.size() - 1 is: [%d]", - signal_ndim, sizes.size() - 1)); - -#ifdef __HIPCC__ - hipfftType exec_type = [&] { - if (dtype == framework::proto::VarType::FP32) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_C2C; - case FFTTransformType::R2C: - return HIPFFT_R2C; - case FFTTransformType::C2R: - return HIPFFT_C2R; - } - } else if (dtype == framework::proto::VarType::FP64) { - switch (fft_type) { - case FFTTransformType::C2C: - return HIPFFT_Z2Z; - case FFTTransformType::R2C: - return HIPFFT_D2Z; - case FFTTransformType::C2R: - return HIPFFT_Z2D; - } - } - PADDLE_THROW(platform::errors::InvalidArgument( - "hipFFT only support transforms of type float32 and float64")); - }(); -#else - cudaDataType itype, otype, exec_type; - const auto complex_input = has_complex_input(fft_type); - const auto complex_output = has_complex_output(fft_type); - if (dtype == framework::proto::VarType::FP32) { - itype = complex_input ? CUDA_C_32F : CUDA_R_32F; - otype = complex_output ? CUDA_C_32F : CUDA_R_32F; - exec_type = CUDA_C_32F; - } else if (dtype == framework::proto::VarType::FP64) { - itype = complex_input ? CUDA_C_64F : CUDA_R_64F; - otype = complex_output ? CUDA_C_64F : CUDA_R_64F; - exec_type = CUDA_C_64F; - } else if (dtype == framework::proto::VarType::FP16) { - itype = complex_input ? CUDA_C_16F : CUDA_R_16F; - otype = complex_output ? CUDA_C_16F : CUDA_R_16F; - exec_type = CUDA_C_16F; - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "cuFFT only support transforms of type float16, float32 and " - "float64")); - } -#endif +// Execute a pre-planned transform +static void exec_cufft_plan_raw(const CuFFTConfig& config, void* in_data, + void* out_data, bool forward) { + auto& plan = config.plan(); - // disable auto allocation of workspace to use allocator from the framework - CUFFT_CHECK(platform::dynload::cufftSetAutoAllocation( - plan(), /* autoAllocate */ 0)); - - size_t ws_size_t; - -// make plan -#ifdef __HIPCC__ - CUFFT_CHECK(hipfftMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, exec_type, - batch, &ws_size_t)); -#else - - CUFFT_CHECK(platform::dynload::cufftXtMakePlanMany( - plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, - batch, &ws_size_t, exec_type)); -#endif + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftXtExec( + plan, in_data, out_data, forward ? 
CUFFT_FORWARD : CUFFT_INVERSE)); +} - ws_size = ws_size_t; +template +void exec_cufft_plan(const DeviceContext& ctx, const CuFFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + // execute transform plan + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + math::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_cufft_plan_raw(config, input_conj.data(), output->data(), + forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_cufft_plan_raw(config, input->data(), out_conj.data(), + forward); + + platform::ForRange for_range(ctx, output->numel()); + math::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_cufft_plan_raw(config, input->data(), output->data(), + forward); } +} - const cufftHandle& plan() const { return plan_ptr.get(); } +#elif defined(PADDLE_WITH_HIP) - FFTTransformType transform_type() const { return fft_type_; } - ScalarType data_type() const { return value_type_; } - size_t workspace_size() const { return ws_size; } +HIPFFTConfig create_hipfft_config(const framework::Tensor& input, + const framework::Tensor& output, + int signal_ndim) { + // Create the transform plan (either from cache or locally) + const auto value_type = framework::IsComplexType(input.type()) + ? framework::ToRealType(input.type()) + : input.type(); + auto fft_type = GetFFTTransformType(input.type(), output.type()); + // signal sizes + std::vector signal_size(signal_ndim + 1); - private: - CuFFTHandle plan_ptr; - size_t ws_size; - FFTTransformType fft_type_; - ScalarType value_type_; -}; + signal_size[0] = input.dims()[0]; + for (int64_t i = 1; i <= signal_ndim; ++i) { + auto in_size = input.dims()[i]; + auto out_size = output.dims()[i]; + signal_size[i] = std::max(in_size, out_size); + } + PlanKey key(framework::vectorize(input.dims()), + framework::vectorize(output.dims()), signal_size, fft_type, + value_type); + + return HIPFFTConfig(key); +} // Execute a pre-planned transform -static void exec_cufft_plan(const CuFFTConfig& config, void* in_data, - void* out_data, bool forward) { +static void exec_hipfft_plan_raw(const HIPFFTConfig& config, void* in_data, + void* out_data, bool forward) { auto& plan = config.plan(); -#ifdef __HIPCC__ + auto value_type = config.data_type(); if (value_type == framework::proto::VarType::FP32) { switch (config.transform_type()) { case FFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecC2C(plan, static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2C( + plan, static_cast(in_data), + static_cast(out_data), + forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecR2C(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecR2C( + plan, static_cast(in_data), + static_cast(out_data))); return; } case FFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecC2R(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecC2R( + plan, static_cast(in_data), + static_cast(out_data))); return; } } } else if (value_type == framework::proto::VarType::FP64) { switch (config.transform_type()) { case FFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecZ2Z(plan, - static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2Z( + plan, static_cast(in_data), + static_cast(out_data), + forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); return; } case FFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecD2Z( + plan, static_cast(in_data), + static_cast(out_data))); return; } case FFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecZ2D(plan, - static_cast(in_data), - static_cast(out_data))); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftExecZ2D( + plan, static_cast(in_data), + static_cast(out_data))); return; } } } PADDLE_THROW(platform::errors::InvalidArgument( "hipFFT only support transforms of type float32 and float64")); -#else - CUFFT_CHECK(platform::dynload::cufftXtExec( - plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -#endif } +template +void exec_hipfft_plan(const DeviceContext& ctx, const HIPFFTConfig& config, + framework::Tensor* input, framework::Tensor* output, + bool forward) { + auto fft_type = config.transform_type(); + if (fft_type == FFTTransformType::C2R && forward) { + forward = false; + framework::Tensor input_conj(input->type()); + input_conj.mutable_data(input->dims(), ctx.GetPlace()); + platform::ForRange for_range(ctx, input->numel()); + math::ConjFunctor functor(input->data(), input->numel(), + input_conj.data()); + for_range(functor); + exec_hipfft_plan_raw(config, input_conj.data(), output->data(), + forward); + } else if (fft_type == FFTTransformType::R2C && !forward) { + forward = true; + framework::Tensor out_conj(output->type()); + out_conj.mutable_data(output->dims(), ctx.GetPlace()); + exec_hipfft_plan_raw(config, input->data(), out_conj.data(), + forward); + + platform::ForRange for_range(ctx, output->numel()); + math::ConjFunctor functor(out_conj.data(), output->numel(), + output->data()); + for_range(functor); + } else { + exec_hipfft_plan_raw(config, input->data(), output->data(), + forward); + } +} + +#endif + // Execute a general unnormalized fft operation (can be c2c, onesided r2c or // onesided c2r) template void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, const std::vector& dim, bool forward) { const auto x_dims = framework::vectorize(X->dims()); - const auto out_dims = framework::vectorize(out->dims()); const int64_t ndim = static_cast(X->dims().size()); - const int64_t signal_ndim = static_cast(dim.size()); - const int64_t batch_dims = ndim - signal_ndim; auto tensor_place = ctx.GetPlace(); - // Transpose batch dimensions first, then with transforming dims + // make a dim permutation std::vector dim_permute(ndim); - std::vector reverse_dim_permute(ndim); - std::vector 
trans_dims(ndim); std::iota(dim_permute.begin(), dim_permute.end(), int{0}); std::vector is_transformed_dim(ndim); for (const auto& d : dim) { @@ -342,159 +271,89 @@ void exec_fft(const DeviceContext& ctx, const Tensor* X, Tensor* out, std::sort(dim_permute.begin(), batch_end); std::copy(dim.cbegin(), dim.cend(), batch_end); - for (size_t i = 0; i < ndim; i++) { - trans_dims[i] = x_dims[dim_permute[i]]; // shape of input transpose - reverse_dim_permute[dim_permute[i]] = - static_cast(i); // reverse of dim permute - } - framework::Tensor input; - input.Resize(framework::make_ddim(trans_dims)); - input.mutable_data(tensor_place); - /* - auto in_ret = TransposeSimple::run(ctx, *X, dim_permute, input); - if (!in_ret) { - TransCompute(ndim, ctx, *X, input, dim_permute); - } - */ - TransCompute(ndim, ctx, *X, &input, dim_permute); + // transpose input according to dim permutation + auto transposed_input_shape = X->dims().transpose(dim_permute); + framework::Tensor transposed_input; + transposed_input.Resize(transposed_input_shape); + transposed_input.mutable_data(tensor_place); + TransCompute(ndim, ctx, *X, &transposed_input, + dim_permute); // Reshape batch dimensions into a single dimension - std::vector batched_sizes(signal_ndim + 1); + const int64_t signal_ndim = static_cast(dim.size()); + std::vector collapsed_input_shape(signal_ndim + 1); + + auto transposed_input_shape_ = framework::vectorize(transposed_input_shape); + const int64_t batch_dims = ndim - signal_ndim; auto batch_size = - std::accumulate(trans_dims.begin(), trans_dims.begin() + batch_dims, + std::accumulate(transposed_input_shape_.begin(), + transposed_input_shape_.begin() + batch_dims, static_cast(1), std::multiplies()); - batched_sizes[0] = batch_size; - std::copy(trans_dims.begin() + batch_dims, trans_dims.end(), - batched_sizes.begin() + 1); - input.Resize(framework::make_ddim(batched_sizes)); + collapsed_input_shape[0] = batch_size; - // Check the shape of transforming dims with input and output - std::vector signal_size(signal_ndim + 1); - signal_size[0] = batch_size; - for (int64_t i = 0; i < signal_ndim; ++i) { - auto in_size = input.dims()[i + 1]; - auto out_size = out_dims[dim[i]]; - signal_size[i + 1] = std::max(in_size, out_size); - PADDLE_ENFORCE_EQ( - (in_size == signal_size[i + 1] || - in_size == (signal_size[i + 1] / 2) + 1), - true, - platform::errors::InvalidArgument( - "The dimension[%d] of Input size: [%d] must be equal or half to " - "The dimension[%d] of Output size: [%d]", - dim[i], in_size, dim[i], out_size)); - PADDLE_ENFORCE_EQ( - (out_size == signal_size[i + 1] || - out_size == (signal_size[i + 1] / 2) + 1), - true, - platform::errors::InvalidArgument( - "The dimension[%d] of Output size: [%d] must be equal or half to " - "The dimension[%d] of Input size: [%d]", - dim[i], out_size, dim[i], in_size)); - } + std::copy(transposed_input_shape_.begin() + batch_dims, + transposed_input_shape_.end(), collapsed_input_shape.begin() + 1); - std::vector reshape_out_sizes(ndim); - for (size_t i = 0; i < ndim; ++i) { - reshape_out_sizes[i] = out_dims[dim_permute[i]]; - } - std::vector batched_out_sizes(batched_sizes.begin(), - batched_sizes.end()); + framework::Tensor& collapsed_input = transposed_input; + collapsed_input.Resize(framework::make_ddim(collapsed_input_shape)); + + // make a collpased output + const auto out_dims = framework::vectorize(out->dims()); + std::vector collapsed_output_shape(1 + signal_ndim); + collapsed_output_shape[0] = batch_size; for (size_t i = 0; i < dim.size(); ++i) { - 
batched_out_sizes[i + 1] = out_dims[dim[i]]; + collapsed_output_shape[i + 1] = out_dims[dim[i]]; } - - // output - framework::Tensor output; - output.Resize(framework::make_ddim(batched_out_sizes)); - output.mutable_data(tensor_place); - - // Create the transform plan (either from cache or locally) - const auto value_type = framework::IsComplexType(input.type()) - ? framework::ToRealType(input.type()) - : input.type(); - auto fft_type = GetFFTTransformType(input.type(), output.type()); - PlanKey Key(framework::vectorize(input.dims()), - framework::vectorize(output.dims()), signal_size, fft_type, - value_type); - CuFFTConfig uncached_plan(Key); - CuFFTConfig* config = &uncached_plan; - auto& plan = config->plan(); - + framework::Tensor collapsed_output; + collapsed_output.Resize(framework::make_ddim(collapsed_output_shape)); + collapsed_output.mutable_data(tensor_place); + +#if defined(PADDLE_WITH_CUDA) + // create plan + CuFFTConfig config = + create_cufft_config(collapsed_input, collapsed_output, signal_ndim); // prepare cufft for execution - CUFFT_CHECK(platform::dynload::cufftSetStream(plan, ctx.stream())); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cufftSetStream(config.plan(), ctx.stream())); framework::Tensor workspace_tensor; - workspace_tensor.mutable_data(tensor_place, config->workspace_size()); - CUFFT_CHECK( - platform::dynload::cufftSetWorkArea(plan, workspace_tensor.data())); + workspace_tensor.mutable_data(tensor_place, config.workspace_size()); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cufftSetWorkArea( + config.plan(), workspace_tensor.data())); + // execute transform plan + exec_cufft_plan(ctx, config, &collapsed_input, + &collapsed_output, forward); +#elif defined(PADDLE_WITH_HIP) + // create plan + HIPFFTConfig config = + create_hipfft_config(collapsed_input, collapsed_output, signal_ndim); + // prepare cufft for execution + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::hipfftSetStream(config.plan(), ctx.stream())); + framework::Tensor workspace_tensor; + workspace_tensor.mutable_data(tensor_place, config.workspace_size()); + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::hipfftSetWorkArea( + config.plan(), workspace_tensor.data())); // execute transform plan - if (fft_type == FFTTransformType::C2R && forward) { - forward = false; - framework::Tensor input_conj(input.type()); - input_conj.mutable_data(input.dims(), ctx.GetPlace()); - platform::ForRange for_range(ctx, input.numel()); - math::ConjFunctor functor(input.data(), input.numel(), - input_conj.data()); - for_range(functor); - exec_cufft_plan(*config, input_conj.data(), output.data(), - forward); - } else if (fft_type == FFTTransformType::R2C && !forward) { - forward = true; - framework::Tensor out_conj(output.type()); - out_conj.mutable_data(output.dims(), ctx.GetPlace()); - exec_cufft_plan(*config, input.data(), out_conj.data(), - forward); - - platform::ForRange for_range(ctx, output.numel()); - math::ConjFunctor functor(out_conj.data(), output.numel(), - output.data()); - for_range(functor); - } else { - exec_cufft_plan(*config, input.data(), output.data(), forward); - } + exec_hipfft_plan(ctx, config, &collapsed_input, + &collapsed_output, forward); +#endif // Inverting output by reshape and transpose to original batch and dimension - output.Resize(framework::make_ddim(reshape_out_sizes)); - out->Resize(framework::make_ddim(out_dims)); - TransCompute(ndim, ctx, output, out, reverse_dim_permute); -} + auto transposed_out_shape = out->dims().transpose(dim_permute); -// Calculates the 
normalization constant -double fft_normalization_scale(FFTNormMode normalization, - const std::vector& sizes, - const std::vector& dims) { - // auto norm = static_cast(normalization); - if (normalization == FFTNormMode::none) { - return static_cast(1.0); - } + collapsed_output.Resize(transposed_out_shape); + auto& transposed_output = collapsed_output; - int64_t signal_numel = 1; - for (auto dim : dims) { - signal_numel *= sizes[dim]; + std::vector reverse_dim_permute(ndim); + for (size_t i = 0; i < ndim; i++) { + reverse_dim_permute[dim_permute[i]] = i; } - const double scale_denom = (normalization == FFTNormMode::by_sqrt_n) - ? std::sqrt(signal_numel) - : static_cast(signal_numel); - return static_cast(1.0 / scale_denom); -} -template -void exec_normalization(const DeviceContext& ctx, const Tensor* in, Tensor* out, - FFTNormMode normalization, - const std::vector& sizes, - const std::vector& axes) { - double scale = fft_normalization_scale(normalization, sizes, axes); - if (scale != 1.0) { - auto eigen_out = framework::EigenVector::Flatten(*out); - auto eigen_in = framework::EigenVector::Flatten(*in); - auto dev = ctx.eigen_device(); - EigenScale::Eval(*dev, eigen_out, eigen_in, - static_cast(scale), - static_cast(0), false); - } else { - framework::TensorCopy(*in, ctx.GetPlace(), out); - } + TransCompute(ndim, ctx, transposed_output, out, + reverse_dim_permute); } + } // anonymous namespace // Use the optimized path to perform single R2C or C2R if transformation dim is diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc index 8894ca650de034..de30eab25f3cf2 100644 --- a/paddle/fluid/operators/squeeze_op.cc +++ b/paddle/fluid/operators/squeeze_op.cc @@ -113,13 +113,13 @@ class SqueezeOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -140,13 +140,13 @@ class SqueezeGradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -241,13 +241,13 @@ class Squeeze2Op : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "X"); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// 
framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; @@ -287,13 +287,13 @@ class Squeeze2GradOp : public framework::OperatorWithKernel { auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); -#ifdef PADDLE_WITH_MKLDNN -// if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { -// return framework::OpKernelType(input_data_type, ctx.GetPlace(), -// framework::DataLayout::kMKLDNN, -// framework::LibraryType::kMKLDNN); -// } -#endif + //#ifdef PADDLE_WITH_MKLDNN + // if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + // return framework::OpKernelType(input_data_type, ctx.GetPlace(), + // framework::DataLayout::kMKLDNN, + // framework::LibraryType::kMKLDNN); + // } + //#endif return framework::OpKernelType(input_data_type, ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/stack_op_xpu.cc b/paddle/fluid/operators/stack_op_xpu.cc index 9929df6e309d98..01ec4a2b16b4a4 100644 --- a/paddle/fluid/operators/stack_op_xpu.cc +++ b/paddle/fluid/operators/stack_op_xpu.cc @@ -66,5 +66,7 @@ namespace plat = paddle::platform; namespace ops = paddle::operators; REGISTER_OP_XPU_KERNEL(stack, + ops::StackXPUKernel, + ops::StackXPUKernel, ops::StackXPUKernel); #endif diff --git a/paddle/fluid/operators/string/CMakeLists.txt b/paddle/fluid/operators/string/CMakeLists.txt new file mode 100644 index 00000000000000..1da2e8e455da0c --- /dev/null +++ b/paddle/fluid/operators/string/CMakeLists.txt @@ -0,0 +1,6 @@ +include(operators) +if(WITH_UNITY_BUILD) + # Load Unity Build rules for operators in paddle/fluid/operators/sequence_ops. + include(unity_build_rule.cmake) +endif() +register_operators(DEPS op_version_registry utf8proc string_array) diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.cc b/paddle/fluid/operators/string/faster_tokenizer_op.cc new file mode 100644 index 00000000000000..42047021b408a8 --- /dev/null +++ b/paddle/fluid/operators/string/faster_tokenizer_op.cc @@ -0,0 +1,528 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/fluid/framework/string_array.h" +#include "paddle/fluid/operators/string/faster_tokenizer_op.h" + +namespace paddle { +namespace operators { + +using std::bad_cast; +using std::codecvt_utf8; +using std::endl; +using std::exception; +using std::ifstream; +using std::int64_t; +using std::min; +using std::runtime_error; +using std::unordered_map; +using std::unordered_set; +using std::shared_ptr; +using std::size_t; +using std::int64_t; +using std::string; +using std::vector; +using std::wstring; + +const wstring kStripChars = L" \t\n\r\v\f"; + +inline bool IsControl(const wchar_t& ch) { + if (ch == L'\t' || ch == L'\n' || ch == L'\r') return false; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_CC || cat == UTF8PROC_CATEGORY_CF) return true; + return false; +} + +inline bool IsChineseChar(const wchar_t& ch) { + if ((ch >= 0x4E00 && ch <= 0x9FFF) || (ch >= 0x3400 && ch <= 0x4DBF) || + (ch >= 0x20000 && ch <= 0x2A6DF) || (ch >= 0x2A700 && ch <= 0x2B73F) || + (ch >= 0x2B740 && ch <= 0x2B81F) || (ch >= 0x2B820 && ch <= 0x2CEAF) || + (ch >= 0xF900 && ch <= 0xFAFF) || (ch >= 0x2F800 && ch <= 0x2FA1F)) + return true; + return false; +} + +inline bool IsWhiteSpace(const wchar_t& ch) { + if (ch == L' ' || ch == L'\t' || ch == L'\n' || ch == L'\r') return true; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_ZS) return true; + return false; +} + +inline bool IsPunctuation(const wchar_t& ch) { + if ((ch >= 33 && ch <= 47) || (ch >= 58 && ch <= 64) || + (ch >= 91 && ch <= 96) || (ch >= 123 && ch <= 126)) + return true; + auto cat = utf8proc_category(ch); + if (cat == UTF8PROC_CATEGORY_PD || cat == UTF8PROC_CATEGORY_PS || + cat == UTF8PROC_CATEGORY_PE || cat == UTF8PROC_CATEGORY_PC || + cat == UTF8PROC_CATEGORY_PO // sometimes ¶ belong SO + || cat == UTF8PROC_CATEGORY_PI || cat == UTF8PROC_CATEGORY_PF) + return true; + return false; +} + +BasicTokenizer::BasicTokenizer(bool do_lower_case /* = true */) + : do_lower_case_(do_lower_case) {} + +wchar_t BasicTokenizer::do_lower_case(wchar_t ch) const { + wchar_t new_ch = utf8proc_tolower(ch); + return new_ch; +} + +void BasicTokenizer::Tokenize(const string& text, vector* res) const { + std::wstring unicode_text; + bool status = framework::ConvertStrToWstr(text, &unicode_text); + if (!status) { + // String is converted into wstring failedly. 
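+ // *res is left untouched in that case, so callers (e.g.
+ // BertTokenizer::Tokenize and BertTokenizer::Encode) see an empty
+ // token list and treat the input as untokenizable.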
+ return; + } + std::wstring cache_text = L""; + auto PushCacheText = [&]() { + if (cache_text != L"") { + res->emplace_back(cache_text); + cache_text = L""; + } + }; + for (auto& ch : unicode_text) { + if (ch == 0 || ch == 0xfffd || IsControl(ch)) { + continue; + } + if (do_lower_case_) { + ch = do_lower_case(ch); + } + if (IsChineseChar(ch) || IsPunctuation(ch)) { + PushCacheText(); + res->emplace_back(std::wstring{ch}); + } else if (IsWhiteSpace(ch)) { + PushCacheText(); + } else { + cache_text += ch; + } + } + PushCacheText(); +} + +WordPieceTokenizer::WordPieceTokenizer( + const framework::Vocab* vocab, const wstring& unk_token /* = L"[UNK]"*/, + const size_t max_input_chars_per_word /* = 100 */) + : vocab_(vocab), + unk_token_(unk_token), + max_input_chars_per_word_(max_input_chars_per_word) { + unk_token_id_ = vocab_->at(unk_token_); +} + +void WordPieceTokenizer::Tokenize(const wstring& text, + vector* token_ids) const { + size_t len = text.size(); + if (len > max_input_chars_per_word_) { + token_ids->emplace_back(std::move(unk_token_id_)); + return; + } + + auto it = vocab_->find(text); + if (it != vocab_->end()) { + token_ids->emplace_back(std::move(it->second)); + return; + } + + size_t start = 0; + vector wordpiece_ids; + while (start < len) { + size_t end = len; + std::wstring cur_substr; + int64_t cur_substr_id; + while (start < end) { + std::wstring sub = text.substr(start, end - start); + if (start > 0) { + sub = L"##" + sub; + } + auto it = vocab_->find(sub); + if (it != vocab_->end()) { + cur_substr = sub; + cur_substr_id = it->second; + break; + } + end -= 1; + } + + if (cur_substr.empty()) { + token_ids->emplace_back(std::move(unk_token_id_)); + return; + } else { + start = end; + wordpiece_ids.emplace_back(std::move(cur_substr_id)); + } + } + for (auto& token_id : wordpiece_ids) { + token_ids->emplace_back(std::move(token_id)); + } +} + +BertTokenizer::BertTokenizer(const framework::Vocab* vocab, + bool do_lower_case /* = false */, + const wstring& unk_token /* = L"[UNK]" */, + const wstring& pad_token /* = L"[PAD]" */, + const wstring& cls_token /* = L"[CLS]" */, + const wstring& mask_token /* = L"[MASK]" */, + const wstring& sep_token /* = L"[SEP]" */, + const string& padding_site /* = "right" */) + : do_lower_case_(do_lower_case), + unk_token_(unk_token), + pad_token_(pad_token), + cls_token_(cls_token), + mask_token_(mask_token), + sep_token_(sep_token), + padding_site_(padding_site), + vocab_(vocab), + basic_tokenizer_(do_lower_case_), + word_piece_tokenizer_(vocab_, unk_token) { + unk_token_id_ = vocab_->at(unk_token_); + pad_token_id_ = vocab_->at(pad_token_); + cls_token_id_ = vocab_->at(cls_token_); + mask_token_id_ = vocab_->at(mask_token_); + sep_token_id_ = vocab_->at(sep_token_); + + all_special_tokens_ = vector( + {unk_token_, pad_token_, cls_token_, mask_token_, sep_token_}); + all_special_token_ids_ = + unordered_set({unk_token_id_, pad_token_id_, cls_token_id_, + mask_token_id_, sep_token_id_}); +} + +void BertTokenizer::Tokenize(const string& text, + vector* split_token_ids) const { + std::vector tmp_tokens; + basic_tokenizer_.Tokenize(text, &tmp_tokens); + if (tmp_tokens.empty()) return; + split_token_ids->reserve(tmp_tokens.size()); + for (auto& w_token : tmp_tokens) { + const auto& vec_size = w_token.size(); + if (vec_size == 1) { + if (IsChineseChar(w_token[0])) { + auto vocab_it = vocab_->find(w_token); + if (vocab_it != vocab_->end()) { + split_token_ids->emplace_back(std::move(vocab_it->second)); + } else { + 
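+ // A single Chinese character missing from the vocab falls back to
+ // the [UNK] token id.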
split_token_ids->emplace_back(std::move(unk_token_id_)); + } + } else { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + } + } else if (vec_size > 1) { + word_piece_tokenizer_.Tokenize(w_token, split_token_ids); + } else { + continue; + } + } +} + +void BertTokenizer::BuildInputsWithSpecialTokens( + vector* inputs, const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.size() == 0) { + inputs->clear(); + inputs->resize(token_ids_0.size() + 2); + inputs->at(0) = std::move(cls_token_id_); + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + } else { + inputs->clear(); + inputs->resize(token_ids_0.size() + token_ids_1.size() + 3); + inputs->at(0) = std::move(cls_token_id_); + size_t i = 1; + for (auto& token_id : token_ids_0) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + ++i; + for (auto& token_id : token_ids_1) { + inputs->at(i) = std::move(token_id); + ++i; + } + inputs->at(i) = std::move(sep_token_id_); + } +} + +int64_t BertTokenizer::GetNumSpecialTokensToAdd(const bool pair) const { + if (pair) { + return 3; + } else { + return 2; + } +} + +void BertTokenizer::CreateTokenTypeIdsFromSequences( + vector* token_type_ids, const vector& token_ids_0, + const vector& token_ids_1 /* = vector() */) const { + if (token_ids_1.size() == 0) { + vector tmp(token_ids_0.size() + 2, 0); + token_type_ids->swap(tmp); + } else { + vector tmp(token_ids_0.size() + token_ids_1.size() + 3, 0); + for (size_t i = token_ids_0.size() + 2; i < tmp.size(); i++) { + tmp[i] = 1; + } + token_type_ids->swap(tmp); + } +} + +void BertTokenizer::TruncateSequence( + vector* ids, vector* pair_ids, + const size_t num_tokens_to_remove /* = 0 */, + const size_t stride /* = 0 */) const { + for (size_t i = 0; i < num_tokens_to_remove; i++) { + if ((pair_ids->size() == 0) || (ids->size() > pair_ids->size())) { + ids->pop_back(); + } else { + pair_ids->pop_back(); + } + } +} + +int64_t BertTokenizer::GetPadTokenID() const { return pad_token_id_; } + +int BertTokenizer::Encode( + unordered_map>* encoded_inputs, const string& text, + const string& text_pair /* = "" */, bool is_split_into_words /* = false */, + const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + vector ids; + vector pair_ids; + if (!is_split_into_words) { + Tokenize(text, &ids); + if (ids.empty()) return 0; + if (text_pair != "") { + Tokenize(text_pair, &pair_ids); + if (pair_ids.empty()) return 0; + } + } else { + std::wstring unicode_text; + bool status_a = framework::ConvertStrToWstr(text, &unicode_text); + if (!status_a) { + return 0; + } + for (size_t i = 0; i < unicode_text.size(); i++) { + wstring token = unicode_text.substr(i, 1); + auto it = vocab_->find(token); + if (it != vocab_->end()) { + ids.emplace_back(std::move(it->second)); + } else { + ids.emplace_back(std::move(unk_token_id_)); + } + } + } + + bool pair = false; + if (pair_ids.size() != 0) { + pair = true; + } + + size_t len_ids = ids.size(); + size_t len_pair_ids = pair_ids.size(); + + // Truncation: Handle max sequence length + // If max_seq_len == 0, then do nothing and keep the real length. + // If max_seq_len > 0 and + // all the input sequence len is over the max_seq_len, + // then we truncate it. 
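+ // For example, with max_seq_len = 128, ids.size() = 126 and
+ // pair_ids.size() = 130, GetNumSpecialTokensToAdd(true) returns 3, so
+ // total_len = 259 and TruncateSequence() pops 131 tokens, one at a
+ // time, always from the currently longer of the two sequences
+ // (from pair_ids when they tie).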
+ size_t total_len = len_ids + len_pair_ids + GetNumSpecialTokensToAdd(pair); + if (max_seq_len > 0 && total_len > max_seq_len) { + TruncateSequence(&ids, &pair_ids, total_len - max_seq_len); + } + + // Add special tokens + vector sequence; + BuildInputsWithSpecialTokens(&sequence, ids, pair_ids); + size_t seq_len = sequence.size(); + vector token_type_ids; + CreateTokenTypeIdsFromSequences(&token_type_ids, ids, pair_ids); + + // Build output dictionnary + encoded_inputs->emplace("input_ids", sequence); + encoded_inputs->emplace("token_type_ids", token_type_ids); + // Check lengths + if (max_seq_len > 0 && seq_len > max_seq_len) { + VLOG(3) << "There is something wrong with the input sequence length." + " Please check it."; + // Failed. + return 0; + } + + // Padding + bool needs_to_be_padded = false; + if (pad_to_max_seq_len && max_seq_len > 0 && (seq_len < max_seq_len)) { + needs_to_be_padded = true; + } + + if (needs_to_be_padded) { + int64_t difference = max_seq_len - seq_len; + size_t pad_start = max_seq_len - 1 - difference; + encoded_inputs->at("token_type_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("token_type_ids")[i] = pad_token_id_; + } + + encoded_inputs->at("input_ids").resize(max_seq_len); + for (size_t i = max_seq_len - 1; i > pad_start; i--) { + encoded_inputs->at("input_ids")[i] = pad_token_id_; + } + } + return 1; +} + +void BertTokenizer::BatchEncode( + vector>>* batch_encode_inputs, + const vector& batch_text, + const vector& batch_text_pair /* = vector() */, + bool is_split_into_words /* = false */, const size_t max_seq_len /* = 0 */, + bool pad_to_max_seq_len /* = false */) const { + bool has_text_pair = false; + if (batch_text_pair.size() != 0) { + has_text_pair = true; + } + + size_t batch_size = batch_text.size(); +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (size_t i = 0; i < batch_size; i++) { + unordered_map> res; + if (has_text_pair) { + auto status = + Encode(&res, batch_text[i], batch_text_pair[i], is_split_into_words, + max_seq_len, pad_to_max_seq_len); + if (!status) { + res["input_ids"] = + std::vector{cls_token_id_, sep_token_id_, cls_token_id_}; + res["token_type_ids"] = std::vector{0, 0, 1}; + } + } else { + auto status = Encode(&res, batch_text[i], {}, is_split_into_words, + max_seq_len, pad_to_max_seq_len); + + if (!status) { + res["input_ids"] = std::vector{cls_token_id_, sep_token_id_}; + res["token_type_ids"] = std::vector{0, 0}; + } + } + batch_encode_inputs->at(i) = std::move(res); + } +} + +class FasterTokenizerOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Text"), "Input", "Text", "Tokenizer"); + OP_INOUT_CHECK(ctx->HasInput("Vocab"), "Input", "Vocab", "Tokenizer"); + OP_INOUT_CHECK(ctx->HasOutput("InputIds"), "Output", "InputIds", + "Tokenizer"); + OP_INOUT_CHECK(ctx->HasOutput("SegmentIds"), "Output", "SegmentIds", + "Tokenizer"); + + ctx->SetOutputDim("InputIds", {-1, -1}); + ctx->SetOutputDim("SegmentIds", {-1, -1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::INT64, + paddle::platform::CPUPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& 
expected_kernel_type) const override { + return framework::OpKernelType(expected_kernel_type.data_type_, + expected_kernel_type.place_, + tensor.layout()); + } +}; + +class FasterTokenizerOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Vocab", + "(std::map), The vocab to map " + "token string to token id."); + AddInput("Text", + "(std::vector), The sequence to be processed. " + "One sequence is a string, a list of strings, " + "or a list of integers depending on whether it " + "has been pretokenized and converted to ids. "); + AddInput("TextPair", + "(std::vector), Same as `text` argument, " + "while it represents for the latter sequence of the " + "sequence pair.") + .AsDispensable(); + AddOutput("InputIds", "(Tensor), The token ids of the input text."); + AddOutput("SegmentIds", "(Tensor), The segments ids of the input text."); + AddAttr( + "do_lower_case", + "(bool), Whether or not to lowercase the input when tokenizing.") + .SetDefault(false); + AddAttr( + "is_split_into_words", + "(bool), Whether or not the input is already pre-tokenized " + "(e.g., split into words). If set to True, the tokenizer " + "assumes the input is already split into words (for instance, " + "by splitting it on whitespace) which it will tokenize. This " + "is useful for NER or token classification.") + .SetDefault(false); + AddAttr("max_seq_len", + "(int), If set to a positive number, will limit the " + "total sequence returned so that it has a maximum length." + " If there are overflowing tokens, those overflowing " + "tokens will be added to the returned dictionary when " + "`return_overflowing_tokens` is `True`.") + .SetDefault(0); + AddAttr("pad_to_max_seq_len", + "(bool), If set to `True`, the returned sequences would be" + " padded up to `max_seq_len` specified length according to" + " padding side and padding token id.") + .SetDefault(false); + AddComment(R"DOC(Performs tokenization and uses the tokenized tokens to " + "prepare model inputs. It supports sequence or sequence pair as input, " + "and batch input is not allowed.)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(faster_tokenizer, ops::FasterTokenizerOp, + ops::FasterTokenizerOpMaker); + +REGISTER_OP_CPU_KERNEL(faster_tokenizer, ops::FasterTokenizerKernel); diff --git a/paddle/fluid/operators/string/faster_tokenizer_op.h b/paddle/fluid/operators/string/faster_tokenizer_op.h new file mode 100644 index 00000000000000..5218b7c2eaa51d --- /dev/null +++ b/paddle/fluid/operators/string/faster_tokenizer_op.h @@ -0,0 +1,195 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/string_array.h" + +namespace paddle { +namespace operators { + +using std::endl; +using std::int64_t; +using std::size_t; +using std::string; +using std::shared_ptr; +using std::vector; +using std::unordered_map; +using std::unordered_set; +using std::vector; +using std::wstring; +using std::wcout; + +inline bool IsControl(const wchar_t& ch); +inline bool IsChineseChar(const wchar_t& ch); +inline bool IsWhiteSpace(const wchar_t& ch); + +using Vocab = unordered_map; +using InvVocab = unordered_map; + +class BasicTokenizer { + public: + explicit BasicTokenizer(bool do_lower_case = true); + void Tokenize(const string& text, vector* res) const; + + private: + wchar_t do_lower_case(wchar_t ch) const; + + bool do_lower_case_; +}; + +class WordPieceTokenizer { + public: + explicit WordPieceTokenizer(const framework::Vocab* vocab, + const wstring& unk_token = L"[UNK]", + const size_t max_input_chars_per_word = 100); + void Tokenize(const wstring& text, vector* output) const; + + private: + const framework::Vocab* vocab_; + wstring unk_token_{L"[UNK]"}; + int64_t unk_token_id_; + size_t max_input_chars_per_word_; +}; + +class BertTokenizer { + public: + explicit BertTokenizer(const framework::Vocab* vocab, + bool do_lower_case = false, + const wstring& unk_token = L"[UNK]", + const wstring& pad_token = L"[PAD]", + const wstring& cls_token = L"[CLS]", + const wstring& mask_token = L"[MASK]", + const wstring& sep_token = L"[SEP]", + const string& padding_site = "right"); + + void Tokenize(const string& text, vector* split_tokens) const; + void BuildInputsWithSpecialTokens( + vector* res, const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void CreateTokenTypeIdsFromSequences( + vector* token_type_ids, const vector& token_ids_0, + const vector& token_ids_1 = vector()) const; + void TruncateSequence(vector* ids, vector* pair_ids, + const size_t num_tokens_to_remove = 0, + const size_t stride = 0) const; + int64_t GetNumSpecialTokensToAdd(const bool pair = false) const; + int Encode(unordered_map>* encoded_inputs, + const string& text, const string& text_pair = "", + bool is_split_into_words = false, const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + void BatchEncode( + vector>>* batch_encode_inputs, + const vector& batch_text, + const vector& batch_text_pair = vector(), + bool is_split_into_words = false, const size_t max_seq_len = 0, + bool pad_to_max_seq_len = false) const; + + int64_t GetPadTokenID() const; + + private: + bool do_lower_case_; + wstring unk_token_, pad_token_, cls_token_, mask_token_, sep_token_; + string padding_site_; + const framework::Vocab* vocab_; + BasicTokenizer basic_tokenizer_; + WordPieceTokenizer word_piece_tokenizer_; + int64_t unk_token_id_, cls_token_id_, mask_token_id_, pad_token_id_, + sep_token_id_; + vector all_special_tokens_; + unordered_set all_special_token_ids_; + InvVocab inv_vocab_; +}; + +template +class FasterTokenizerKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* text = ctx.Input("Text"); + auto* vocab = ctx.Input("Vocab"); + + auto* input_ids = ctx.Output("InputIds"); + auto* seg_ids = ctx.Output("SegmentIds"); + + auto do_lower_case = static_cast(ctx.Attr("do_lower_case")); + auto is_split_into_words = + static_cast(ctx.Attr("is_split_into_words")); + auto max_seq_len = 
static_cast(ctx.Attr("max_seq_len")); + auto pad_to_max_seq_len = + static_cast(ctx.Attr("pad_to_max_seq_len")); + + auto* text_pair = ctx.Input("TextPair"); + if (text_pair && text->size() != text_pair->size()) { + VLOG(3) << "The input text(list[str]) and text pair (list[str]) must" + << "be the same number of text sequence. Please check the input!"; + return; + } + + BertTokenizer tokenizer(vocab, do_lower_case); + size_t batch_max_seq_len = 0; + size_t batch_size = text->size(); + + vector>> batch_encode_inputs( + batch_size); + if (text_pair) { + tokenizer.BatchEncode(&batch_encode_inputs, *text, *text_pair, + is_split_into_words, max_seq_len, + pad_to_max_seq_len); + } else { + tokenizer.BatchEncode(&batch_encode_inputs, *text, vector(), + is_split_into_words, max_seq_len, + pad_to_max_seq_len); + } + + for (size_t i = 0; i < batch_size; ++i) { + size_t seq_len = batch_encode_inputs[i]["input_ids"].size(); + if (seq_len > batch_max_seq_len) { + batch_max_seq_len = seq_len; + } + } + + input_ids->Resize( + framework::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* input_ids_data = input_ids->mutable_data(ctx.GetPlace()); + seg_ids->Resize( + framework::make_ddim({static_cast(batch_size), + static_cast(batch_max_seq_len)})); + auto* seg_ids_data = seg_ids->mutable_data(ctx.GetPlace()); + + auto pad_token_id = tokenizer.GetPadTokenID(); + for (size_t i = 0; i < batch_size; i++) { + auto& encoder_input_ids = batch_encode_inputs[i]["input_ids"]; + auto& encoder_seg_ids = batch_encode_inputs[i]["token_type_ids"]; + const size_t& seq_len = encoder_input_ids.size(); + // Copy the memory + std::memcpy(input_ids_data + i * batch_max_seq_len, + encoder_input_ids.data(), seq_len * sizeof(T)); + std::memcpy(seg_ids_data + i * batch_max_seq_len, encoder_seg_ids.data(), + seq_len * sizeof(T)); + std::memset(input_ids_data + i * batch_max_seq_len + seq_len, + pad_token_id, (batch_max_seq_len - seq_len) * sizeof(T)); + std::memset(seg_ids_data + i * batch_max_seq_len + seq_len, pad_token_id, + (batch_max_seq_len - seq_len) * sizeof(T)); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/string/unity_build_rule.cmake b/paddle/fluid/operators/string/unity_build_rule.cmake new file mode 100644 index 00000000000000..a4b209d2df13e6 --- /dev/null +++ b/paddle/fluid/operators/string/unity_build_rule.cmake @@ -0,0 +1,8 @@ +# This file records the Unity Build compilation rules. +# The source files in a `register_unity_group` called are compiled in a unity +# file. +# Generally, the combination rules in this file do not need to be modified. +# If there are some redefined error in compiling with the source file which +# in combination rule, you can remove the source file from the following rules. 
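+# For example, the group below compiles faster_tokenizer_op.cc on its own;
+# further string ops can be appended to the same register_unity_group(cc ...)
+# call to share one unity file.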
+register_unity_group(cc + faster_tokenizer_op.cc) \ No newline at end of file diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index d592c62d499b35..6b2584682277e5 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -96,6 +96,20 @@ struct PowFunctor { float exp_; }; +template +struct RealMulComplexFunctor { + // x: complex number (a+bj) + // y: complex number (c+0j) pretending to be a real number + // out: complex number (ac+bcj) + inline HOSTDEVICE T operator()(T x, T y) { + PADDLE_ENFORCE_LT(y.imag, 1e-6, platform::errors::InvalidArgument( + "The imaginary part of y must be 0, " + "but got [%d]", + y.imag)); + return platform::complex>(x.real * y.real, x.imag * y.real); + } +}; + static std::vector GetBroadcastShape(InTensors ins) { PADDLE_ENFORCE_EQ(ins.size(), 2, platform::errors::InvalidArgument( "GetBroadcastShape Receive 2 tensors" @@ -286,6 +300,45 @@ struct DeviceIndependenceTensorOperations { for_range(DiagFunctor(x.data(), x.numel(), output)); return ret; } + + // batch_diag for CPU only + Tensor BatchDiag(const Tensor& x, int batch) { + Tensor out; + auto* x_data = x.data>(); + auto numel = x.numel(); + auto* out_data = out.mutable_data>( + x.dims(), context.GetPlace(), + static_cast(numel * sizeof(math::Real))); + + auto x_dims = x.dims(); + int num_dims = x_dims.size(); + std::vector out_shape; + + for (int i = 0; i < num_dims - 1; ++i) { + out_shape.push_back(x.dims()[i]); + } + out.Resize(framework::make_ddim(out_shape)); + int order = x.dims()[num_dims - 1]; + int stride_out = order * order; + int stride_in = order + 1; + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < order; ++j) { + out_data[i * order + j] = x_data[stride_out * i + stride_in * j]; + } + } + return out; + } + + // a complex number x times a real number y, which is represented as (c+0j) + Tensor RealMulComplex(const Tensor& x, const Tensor& y) { + framework::Tensor ret; + std::vector out_shape = GetBroadcastShape({&x, &y}); + ret.Resize(framework::make_ddim(out_shape)); + ElementwiseComputeEx, DeviceContext, T>( + context, &x, &y, -1, RealMulComplexFunctor(), &ret); + return ret; + } + framework::Tensor Div(const framework::Tensor& x, const framework::Tensor& y) { framework::Tensor ret; @@ -449,6 +502,19 @@ struct DeviceIndependenceTensorOperations { return ret; } + framework::Tensor TrilTriu(const framework::Tensor& x, int diagonal, + bool lower) { + framework::AttributeMap attrs; + attrs["diagonal"] = diagonal; + attrs["lower"] = lower; + NameInTensorMap inputs({{"X", {&x}}}); + int x_rank = x.dims().size(); + PADDLE_ENFORCE_GE(x_rank, 2, platform::errors::InvalidArgument( + "Rank must be at least 2.")); + std::vector out_shape = framework::vectorize(x.dims()); + return CreateOpRunAndReturnTensor("tril_triu", inputs, attrs, out_shape); + } + Tensor Conj(const Tensor& x) { Tensor out; auto* out_data = out.mutable_data(x.dims(), context.GetPlace()); @@ -459,6 +525,19 @@ struct DeviceIndependenceTensorOperations { return out; } + Tensor Real(const Tensor& x) { + Tensor out; + auto numel = x.numel(); + auto* out_data = out.mutable_data>( + x.dims(), context.GetPlace(), + static_cast(numel * sizeof(math::Real))); + auto* x_data = x.data(); + auto for_range = GetForRange(numel); + math::RealFunctor functor(x_data, out_data, numel); + for_range(functor); + return out; + } + Tensor DiagFill(const int m, const int n, const int num_lower_diags, const int num_upper_diags, const Tensor& scale, const Tensor& input) { diff --git 
a/paddle/fluid/operators/sync_batch_norm_op_npu.cc b/paddle/fluid/operators/sync_batch_norm_op_npu.cc new file mode 100644 index 00000000000000..31289b1c2396b8 --- /dev/null +++ b/paddle/fluid/operators/sync_batch_norm_op_npu.cc @@ -0,0 +1,995 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +void training_or_inference( + const framework::ExecutionContext &ctx, const aclrtStream &stream, + const platform::Place &place, const DataLayout &layout, + const bool &test_mode, const int &N, const int &C, const int &H, + const int &W, const float epsilon, const float &momentum, + const Tensor *common_mean, const Tensor *common_var, const Tensor *x, + const Tensor *scale, const Tensor *bias, const Tensor *mean, + const Tensor *variance, Tensor *mean_out, Tensor *variance_out, + Tensor *saved_mean, Tensor *saved_variance, Tensor *y) { + std::vector axes; + if (layout == framework::DataLayout::kNCHW) { + axes = {0, 2, 3}; + } else if (layout == framework::DataLayout::kNHWC) { + axes = {0, 1, 2}; + } + + std::vector multiples; + if (layout == framework::DataLayout::kNCHW) + multiples = {N, 1, H, W}; + else if (layout == framework::DataLayout::kNHWC) + multiples = {N, H, W, 1}; + + Tensor common_mean_tile_1; + { + common_mean_tile_1.Resize({C}); + common_mean_tile_1.mutable_data(place); + TensorCopySync(*common_mean, place, &common_mean_tile_1); + if (layout == framework::DataLayout::kNCHW) + common_mean_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + common_mean_tile_1.Resize({1, 1, 1, C}); + } + + Tensor common_mean_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + common_mean_tile.Resize(x->dims()); + common_mean_tile.mutable_data(place); + const auto &runner = NpuOpRunner("TileD", {common_mean_tile_1}, + {common_mean_tile}, attr_input); + runner.Run(stream); + } + + Tensor common_var_tile_1; + { + common_var_tile_1.Resize({C}); + common_var_tile_1.mutable_data(place); + TensorCopySync(*common_var, place, &common_var_tile_1); + if (layout == framework::DataLayout::kNCHW) + common_var_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + common_var_tile_1.Resize({1, 1, 1, C}); + } + + Tensor common_var_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + common_var_tile.Resize(x->dims()); + common_var_tile.mutable_data(place); + const auto &runner = NpuOpRunner("TileD", {common_var_tile_1}, + {common_var_tile}, attr_input); + runner.Run(stream); + } + + Tensor common_var_tile_add_epsilon; + { + framework::NPUAttributeMap attr_input = {{"value", epsilon}}; + common_var_tile_add_epsilon.Resize(x->dims()); + 
common_var_tile_add_epsilon.mutable_data(place); + const auto &runner = NpuOpRunner("Adds", {common_var_tile}, + {common_var_tile_add_epsilon}, attr_input); + runner.Run(stream); + } + + Tensor common_var_tile_add_epsilon_sqrt; + { + common_var_tile_add_epsilon_sqrt.Resize(x->dims()); + common_var_tile_add_epsilon_sqrt.mutable_data(place); + const auto &runner = NpuOpRunner("Sqrt", {common_var_tile_add_epsilon}, + {common_var_tile_add_epsilon_sqrt}, {}); + runner.Run(stream); + } + + Tensor x_sub_common_mean; + { + x_sub_common_mean.Resize(x->dims()); + x_sub_common_mean.mutable_data(place); + const auto &runner = + NpuOpRunner("Sub", {*x, common_mean_tile}, {x_sub_common_mean}, {}); + runner.Run(stream); + } + + Tensor normalized; + { + normalized.Resize(x->dims()); + normalized.mutable_data(place); + const auto &runner = NpuOpRunner( + "Div", {x_sub_common_mean, common_var_tile_add_epsilon_sqrt}, + {normalized}, {}); + runner.Run(stream); + } + + Tensor scale_tile_1; + { + scale_tile_1.Resize({C}); + scale_tile_1.mutable_data(place); + TensorCopySync(*scale, place, &scale_tile_1); + if (layout == framework::DataLayout::kNCHW) + scale_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + scale_tile_1.Resize({1, 1, 1, C}); + } + + Tensor scale_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + scale_tile.Resize(x->dims()); + scale_tile.mutable_data(place); + const auto &runner = + NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); + runner.Run(stream); + } + + Tensor normalized_mul_scale; + { + normalized_mul_scale.Resize(x->dims()); + normalized_mul_scale.mutable_data(place); + const auto &runner = NpuOpRunner("Mul", {normalized, scale_tile}, + {normalized_mul_scale}, {}); + runner.Run(stream); + } + + Tensor bias_tile_1; + { + bias_tile_1.Resize({C}); + bias_tile_1.mutable_data(place); + TensorCopySync(*bias, place, &bias_tile_1); + if (layout == framework::DataLayout::kNCHW) + bias_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + bias_tile_1.Resize({1, 1, 1, C}); + } + + Tensor bias_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + bias_tile.Resize(x->dims()); + bias_tile.mutable_data(place); + const auto &runner = + NpuOpRunner("TileD", {bias_tile_1}, {bias_tile}, attr_input); + runner.Run(stream); + } + + // calculate y + { + y->mutable_data(place); + const auto &runner = + NpuOpRunner("Add", {normalized_mul_scale, bias_tile}, {*y}, {}); + runner.Run(stream); + } + + if (!test_mode) { + Tensor ones; + { + ones.Resize({C}); + ones.mutable_data(place); + FillNpuTensorWithConstant(&ones, 1); + } + + // cacl mean_out + { + Tensor common_mean_mul_1_sub_momentum; + { + framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; + common_mean_mul_1_sub_momentum.Resize({C}); + common_mean_mul_1_sub_momentum.mutable_data(place); + const auto &runner = + NpuOpRunner("Muls", {*common_mean}, + {common_mean_mul_1_sub_momentum}, attr_input); + runner.Run(stream); + } + + Tensor mean_mul_momentum; + { + framework::NPUAttributeMap attr_input = {{"value", momentum}}; + mean_mul_momentum.Resize({C}); + mean_mul_momentum.mutable_data(place); + const auto &runner = + NpuOpRunner("Muls", {*mean}, {mean_mul_momentum}, attr_input); + runner.Run(stream); + } + + mean_out->mutable_data(place); + + const auto &runner = NpuOpRunner( + "Add", {common_mean_mul_1_sub_momentum, mean_mul_momentum}, + {*mean_out}, {}); + runner.Run(stream); + } + + // cacl variance_out + { + 
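+      // running-variance update (descriptive note): variance_out = (1 - momentum) * common_var + momentum * variance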
Tensor momentum_mul_var; + { + framework::NPUAttributeMap attr_input = {{"value", momentum}}; + momentum_mul_var.Resize({C}); + momentum_mul_var.mutable_data(place); + const auto &runner = + NpuOpRunner("Muls", {*variance}, {momentum_mul_var}, attr_input); + runner.Run(stream); + } + + Tensor var_ref_mul_1_sub_momentum; + { + framework::NPUAttributeMap attr_input = {{"value", 1 - momentum}}; + var_ref_mul_1_sub_momentum.Resize({C}); + var_ref_mul_1_sub_momentum.mutable_data(place); + const auto &runner = NpuOpRunner( + "Muls", {*common_var}, {var_ref_mul_1_sub_momentum}, attr_input); + runner.Run(stream); + } + + variance_out->mutable_data(place); + + const auto &runner = + NpuOpRunner("Add", {var_ref_mul_1_sub_momentum, momentum_mul_var}, + {*variance_out}, {}); + runner.Run(stream); + } + + // cacl saved_variance + { + Tensor var_ref_add_epsilon; + { + framework::NPUAttributeMap attr_input = {{"value", epsilon}}; + var_ref_add_epsilon.Resize({C}); + var_ref_add_epsilon.mutable_data(place); + const auto &runner = NpuOpRunner("Adds", {*common_var}, + {var_ref_add_epsilon}, attr_input); + runner.Run(stream); + } + + Tensor var_ref_add_epsilon_sqrt; + { + var_ref_add_epsilon_sqrt.Resize({C}); + var_ref_add_epsilon_sqrt.mutable_data(place); + const auto &runner = NpuOpRunner("Sqrt", {var_ref_add_epsilon}, + {var_ref_add_epsilon_sqrt}, {}); + runner.Run(stream); + } + + saved_variance->mutable_data(place); + + const auto &runner = NpuOpRunner("Div", {ones, var_ref_add_epsilon_sqrt}, + {*saved_variance}, {}); + runner.Run(stream); + } + } +} + +template +class SyncBatchNormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const float epsilon = ctx.Attr("epsilon"); + float momentum = ctx.Attr("momentum"); + const bool is_test = ctx.Attr("is_test"); + const std::string layout_str = ctx.Attr("data_layout"); + const DataLayout layout = framework::StringToDataLayout(layout_str); + const bool use_global_stats = ctx.Attr("use_global_stats"); + const bool trainable_stats = ctx.Attr("trainable_statistics"); + + PADDLE_ENFORCE_EQ(use_global_stats, false, + platform::errors::InvalidArgument( + "sync_batch_norm doesn't support " + "to set use_global_stats True. Please use batch_norm " + "in this case.")); + + const auto *x = ctx.Input("X"); + auto *y = ctx.Output("Y"); + const auto *scale = ctx.Input("Scale"); + const auto *bias = ctx.Input("Bias"); + const auto *mean = ctx.Input("Mean"); + const auto *variance = ctx.Input("Variance"); + auto *mean_out = ctx.Output("MeanOut"); + auto *variance_out = ctx.Output("VarianceOut"); + auto *saved_mean = ctx.Output("SavedMean"); + auto *saved_variance = ctx.Output("SavedVariance"); + + const auto &x_dims = x->dims(); + PADDLE_ENFORCE_EQ(x_dims.size(), 4, + platform::errors::InvalidArgument( + "The input tensor X's dimension must equal to 4. 
But " + "received X's shape = [%s], X's dimension = [%d].", + x_dims, x_dims.size())); + + int N, C, H, W, D; + ExtractNCWHD(x_dims, layout, &N, &C, &H, &W, &D); + + int x_numel = x->numel(); + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + std::vector axes; + if (layout == framework::DataLayout::kNCHW) { + axes = {0, 2, 3}; + } else if (layout == framework::DataLayout::kNHWC) { + axes = {0, 1, 2}; + } + + bool test_mode = is_test && (!trainable_stats); + if (test_mode) { // inference + // cacl saved_mean + saved_mean->mutable_data(place); + TensorCopySync(*mean, place, saved_mean); + + // cacl saved_variance + saved_variance->mutable_data(place); + TensorCopySync(*variance, place, saved_variance); + + // cacl y + training_or_inference(ctx, stream, place, layout, test_mode, N, C, H, + W, epsilon, momentum, mean, variance, x, scale, + bias, mean, variance, NULL, NULL, NULL, NULL, y); + + } else { // training + if (ctx.HasInput("MomentumTensor")) { + const auto *mom_tensor = ctx.Input("MomentumTensor"); + Tensor mom_cpu; + TensorCopySync(*mom_tensor, platform::CPUPlace(), &mom_cpu); + momentum = mom_cpu.data()[0]; + } + + // cacl saved_mean and var_ref + Tensor var_ref; + var_ref.Resize({C}); + var_ref.mutable_data(place); + { + Tensor x_sum; + { + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + x_sum.Resize({C}); + x_sum.mutable_data(place); + const auto &runner = + NpuOpRunner("ReduceSumD", {*x}, {x_sum}, attr_input); + runner.Run(stream); + } + + Tensor x_square; + { + x_square.Resize(x->dims()); + x_square.mutable_data(place); + const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); + runner.Run(stream); + } + + Tensor x_square_sum; + { + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + x_square_sum.Resize({C}); + x_square_sum.mutable_data(place); + const auto &runner = + NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); + runner.Run(stream); + } + + auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); + + float device_counts = 0.0; + if (comm) { + HcclDataType dtype = platform::ToHCCLDataType(mean_out->type()); + + Tensor device_count_tensor; + { + device_count_tensor.Resize({1}); + device_count_tensor.mutable_data(place); + FillNpuTensorWithConstant(&device_count_tensor, 1); + } + + // HcclAllReduce device_count_tensor + { + void *sendbuff = reinterpret_cast( + const_cast(device_count_tensor.data())); + void *recvbuff = sendbuff; + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, 1, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); + } + + std::vector device_count_vec(1); + TensorToVector(device_count_tensor, ctx.device_context(), + &device_count_vec); + device_counts = device_count_vec[0]; + + // HcclAllReduce x_sum + { + void *sendbuff = reinterpret_cast( + const_cast(x_sum.data())); + void *recvbuff = sendbuff; + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, C, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); + } + + // HcclAllReduce x_square_sum + { + void *sendbuff = reinterpret_cast( + const_cast(x_square_sum.data())); + void *recvbuff = sendbuff; + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, C, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); + } + } + + // cacl saved_mean + { + framework::NPUAttributeMap attr_input = { + {"value", 1.0f * C / x_numel / 
device_counts}}; + saved_mean->mutable_data(place); + const auto &runner = + NpuOpRunner("Muls", {x_sum}, {*saved_mean}, attr_input); + runner.Run(stream); + } + + // cacl var_ref + { + Tensor saved_mean_square; + { + saved_mean_square.Resize({C}); + saved_mean_square.mutable_data(place); + const auto &runner = + NpuOpRunner("Square", {*saved_mean}, {saved_mean_square}, {}); + runner.Run(stream); + } + + Tensor var_ref_tmp; + var_ref_tmp.Resize({C}); + var_ref_tmp.mutable_data(place); + { + framework::NPUAttributeMap attr_input = { + {"value", 1.0f * C / x_numel / device_counts}}; + const auto &runner = + NpuOpRunner("Muls", {x_square_sum}, {var_ref_tmp}, attr_input); + runner.Run(stream); + } + + // cacl var_ref + { + const auto &runner = NpuOpRunner( + "Sub", {var_ref_tmp, saved_mean_square}, {var_ref}, {}); + runner.Run(stream); + } + } + } + + training_or_inference(ctx, stream, place, layout, test_mode, N, C, H, + W, epsilon, momentum, saved_mean, &var_ref, x, + scale, bias, mean, variance, mean_out, + variance_out, saved_mean, saved_variance, y); + } + } +}; + +template +class SyncBatchNormNPUGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + float epsilon = ctx.Attr("epsilon"); + const std::string layout_str = ctx.Attr("data_layout"); + const DataLayout layout = framework::StringToDataLayout(layout_str); + + const auto *d_y = ctx.Input(framework::GradVarName("Y")); + const auto *scale = ctx.Input("Scale"); + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_scale = ctx.Output(framework::GradVarName("Scale")); + auto *d_bias = ctx.Output(framework::GradVarName("Bias")); + const auto *saved_mean = ctx.Input("SavedMean"); + + const Tensor *x; + if (ctx.HasInput("Y")) { + PADDLE_ENFORCE_EQ(true, false, + platform::errors::InvalidArgument( + "sync_batch_norm_grad doesn't support input Y")); + } else { + x = ctx.Input("X"); + } + + int N, C, H, W, D; + ExtractNCWHD(x->dims(), layout, &N, &C, &H, &W, &D); + + int x_numel = x->numel(); + auto place = ctx.GetPlace(); + auto stream = + ctx.template device_context() + .stream(); + + std::vector axes; + if (layout == framework::DataLayout::kNCHW) { + axes = {0, 2, 3}; + } else if (layout == framework::DataLayout::kNHWC) { + axes = {0, 1, 2}; + } + + std::vector multiples; + if (layout == framework::DataLayout::kNCHW) + multiples = {N, 1, H, W}; + else if (layout == framework::DataLayout::kNHWC) + multiples = {N, H, W, 1}; + + auto comm = paddle::platform::HCCLCommContext::Instance().Get(0, place); + HcclDataType dtype = platform::ToHCCLDataType(scale->type()); + + float device_counts = 0.0; + if (comm) { + Tensor device_count_tensor; + { + device_count_tensor.Resize({1}); + device_count_tensor.mutable_data(place); + FillNpuTensorWithConstant(&device_count_tensor, 1); + } + + // HcclAllReduce device_count_tensor + { + void *sendbuff = reinterpret_cast( + const_cast(device_count_tensor.data())); + void *recvbuff = sendbuff; + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, 1, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); + } + + std::vector device_count_vec(1); + TensorToVector(device_count_tensor, ctx.device_context(), + &device_count_vec); + device_counts = device_count_vec[0]; + PADDLE_ENFORCE_GE(device_counts, 2, platform::errors::PreconditionNotMet( + "device_counts should >= 2.")); + } + + // cacl var_ref + Tensor var_ref; + var_ref.Resize({C}); + var_ref.mutable_data(place); + { + // cacl var_ref + 
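+    // descriptive note: var_ref below is the biased per-channel variance, computed as mean(x * x) - saved_mean^2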
{ + Tensor x_square; + { + x_square.Resize(x->dims()); + x_square.mutable_data(place); + const auto &runner = NpuOpRunner("Square", {*x}, {x_square}, {}); + runner.Run(stream); + } + + Tensor x_square_sum; + { + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + x_square_sum.Resize({C}); + x_square_sum.mutable_data(place); + const auto &runner = + NpuOpRunner("ReduceSumD", {x_square}, {x_square_sum}, attr_input); + runner.Run(stream); + } + + Tensor x_square_sum_mean; + { + framework::NPUAttributeMap attr_input = { + {"value", 1.0f * C / x_numel}}; + x_square_sum_mean.Resize({C}); + x_square_sum_mean.mutable_data(place); + const auto &runner = NpuOpRunner("Muls", {x_square_sum}, + {x_square_sum_mean}, attr_input); + runner.Run(stream); + } + + Tensor mean_square; + { + mean_square.Resize({C}); + mean_square.mutable_data(place); + const auto &runner = + NpuOpRunner("Square", {*saved_mean}, {mean_square}, {}); + runner.Run(stream); + } + + // cacl var_ref + { + const auto &runner = NpuOpRunner( + "Sub", {x_square_sum_mean, mean_square}, {var_ref}, {}); + runner.Run(stream); + } + } + } + + Tensor saved_mean_tile_1; + { + saved_mean_tile_1.Resize({C}); + saved_mean_tile_1.mutable_data(place); + TensorCopySync(*saved_mean, place, &saved_mean_tile_1); + if (layout == framework::DataLayout::kNCHW) + saved_mean_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + saved_mean_tile_1.Resize({1, 1, 1, C}); + } + + Tensor saved_mean_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + saved_mean_tile.Resize(x->dims()); + saved_mean_tile.mutable_data(place); + const auto &runner = NpuOpRunner("TileD", {saved_mean_tile_1}, + {saved_mean_tile}, attr_input); + runner.Run(stream); + } + + Tensor x_sub_saved_mean; + { + x_sub_saved_mean.Resize(x->dims()); + x_sub_saved_mean.mutable_data(place); + const auto &runner = + NpuOpRunner("Sub", {*x, saved_mean_tile}, {x_sub_saved_mean}, {}); + runner.Run(stream); + } + + Tensor var_ref_tile_1; + { + var_ref_tile_1.Resize({C}); + var_ref_tile_1.mutable_data(place); + TensorCopySync(var_ref, place, &var_ref_tile_1); + if (layout == framework::DataLayout::kNCHW) + var_ref_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + var_ref_tile_1.Resize({1, 1, 1, C}); + } + + Tensor var_ref_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + var_ref_tile.Resize(x->dims()); + var_ref_tile.mutable_data(place); + const auto &runner = + NpuOpRunner("TileD", {var_ref_tile_1}, {var_ref_tile}, attr_input); + runner.Run(stream); + } + + Tensor var_ref_tile_add_epsilon; + { + framework::NPUAttributeMap attr_input = {{"value", epsilon}}; + var_ref_tile_add_epsilon.Resize(x->dims()); + var_ref_tile_add_epsilon.mutable_data(place); + const auto &runner = NpuOpRunner("Adds", {var_ref_tile}, + {var_ref_tile_add_epsilon}, attr_input); + runner.Run(stream); + } + + Tensor var_ref_tile_add_epsilon_sqrt; + { + var_ref_tile_add_epsilon_sqrt.Resize(x->dims()); + var_ref_tile_add_epsilon_sqrt.mutable_data(place); + const auto &runner = NpuOpRunner("Sqrt", {var_ref_tile_add_epsilon}, + {var_ref_tile_add_epsilon_sqrt}, {}); + runner.Run(stream); + } + + Tensor dy_mul_x_sub_mean_for_scale; + { + if (d_y->type() == framework::proto::VarType::FP16) { + dy_mul_x_sub_mean_for_scale.Resize(x->dims()); + dy_mul_x_sub_mean_for_scale.mutable_data(place); + const auto &runner = NpuOpRunner("Mul", {*d_y, x_sub_saved_mean}, + {dy_mul_x_sub_mean_for_scale}, {}); 
+ runner.Run(stream); + } else { + dy_mul_x_sub_mean_for_scale.Resize(x->dims()); + dy_mul_x_sub_mean_for_scale.mutable_data(place); + const auto &runner = NpuOpRunner("Mul", {*d_y, x_sub_saved_mean}, + {dy_mul_x_sub_mean_for_scale}, {}); + runner.Run(stream); + } + } + + Tensor dy_mul_x_sub_mean; + { + if (d_y->type() == framework::proto::VarType::FP16) { + dy_mul_x_sub_mean.Resize(x->dims()); + dy_mul_x_sub_mean.mutable_data(place); + const auto &runner = NpuOpRunner("Mul", {*d_y, x_sub_saved_mean}, + {dy_mul_x_sub_mean}, {}); + runner.Run(stream); + } else { + dy_mul_x_sub_mean.Resize(x->dims()); + dy_mul_x_sub_mean.mutable_data(place); + const auto &runner = NpuOpRunner("Mul", {*d_y, x_sub_saved_mean}, + {dy_mul_x_sub_mean}, {}); + runner.Run(stream); + } + } + + // HcclAllReduce dy_mul_x_sub_mean + if (comm) { + { + void *sendbuff = reinterpret_cast( + const_cast(dy_mul_x_sub_mean.data())); + void *recvbuff = sendbuff; + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, C, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); + } + + { + framework::NPUAttributeMap attr_input = { + {"value", 1.0f / device_counts}}; + const auto &runner = NpuOpRunner("Muls", {dy_mul_x_sub_mean}, + {dy_mul_x_sub_mean}, attr_input); + runner.Run(stream); + } + } + + // cacl d_x + if (d_x) { + Tensor dy_mean; + { + if (d_y->type() == framework::proto::VarType::FP16) { + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + dy_mean.Resize({C}); + dy_mean.mutable_data(place); + const auto &runner = + NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); + runner.Run(stream); + } else { + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + dy_mean.Resize({C}); + dy_mean.mutable_data(place); + const auto &runner = + NpuOpRunner("ReduceMeanD", {*d_y}, {dy_mean}, attr_input); + runner.Run(stream); + } + } + + // HcclAllReduce dy_mean + if (comm) { + { + void *sendbuff = reinterpret_cast( + const_cast(dy_mean.data())); + void *recvbuff = sendbuff; + PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclAllReduce( + sendbuff, recvbuff, C, dtype, HCCL_REDUCE_SUM, comm->comm(), + reinterpret_cast(stream))); + } + + { + framework::NPUAttributeMap attr_input = { + {"value", 1.0f / device_counts}}; + const auto &runner = + NpuOpRunner("Muls", {dy_mean}, {dy_mean}, attr_input); + runner.Run(stream); + } + } + + Tensor dy_mean_tile_1; + { + dy_mean_tile_1.Resize({C}); + dy_mean_tile_1.mutable_data(place); + TensorCopySync(dy_mean, place, &dy_mean_tile_1); + if (layout == framework::DataLayout::kNCHW) + dy_mean_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + dy_mean_tile_1.Resize({1, 1, 1, C}); + } + + Tensor dy_mean_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + dy_mean_tile.Resize(x->dims()); + dy_mean_tile.mutable_data(place); + const auto &runner = + NpuOpRunner("TileD", {dy_mean_tile_1}, {dy_mean_tile}, attr_input); + runner.Run(stream); + } + + Tensor dy_sub_dy_mean; + { + if (d_y->type() == framework::proto::VarType::FP16) { + dy_sub_dy_mean.Resize(x->dims()); + dy_sub_dy_mean.mutable_data(place); + const auto &runner = + NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); + runner.Run(stream); + } else { + dy_sub_dy_mean.Resize(x->dims()); + dy_sub_dy_mean.mutable_data(place); + const auto &runner = + NpuOpRunner("Sub", {*d_y, dy_mean_tile}, {dy_sub_dy_mean}, {}); + runner.Run(stream); + } + } + + Tensor 
dy_mul_x_sub_mean_mean; + { + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + dy_mul_x_sub_mean_mean.Resize({C}); + dy_mul_x_sub_mean_mean.mutable_data(place); + const auto &runner = NpuOpRunner("ReduceMeanD", {dy_mul_x_sub_mean}, + {dy_mul_x_sub_mean_mean}, attr_input); + runner.Run(stream); + } + + Tensor dy_mul_x_sub_mean_mean_tile_1; + { + dy_mul_x_sub_mean_mean_tile_1.Resize({C}); + dy_mul_x_sub_mean_mean_tile_1.mutable_data(place); + TensorCopySync(dy_mul_x_sub_mean_mean, place, + &dy_mul_x_sub_mean_mean_tile_1); + if (layout == framework::DataLayout::kNCHW) + dy_mul_x_sub_mean_mean_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + dy_mul_x_sub_mean_mean_tile_1.Resize({1, 1, 1, C}); + } + + Tensor dy_mul_x_sub_mean_mean_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + dy_mul_x_sub_mean_mean_tile.Resize(x->dims()); + dy_mul_x_sub_mean_mean_tile.mutable_data(place); + const auto &runner = + NpuOpRunner("TileD", {dy_mul_x_sub_mean_mean_tile_1}, + {dy_mul_x_sub_mean_mean_tile}, attr_input); + runner.Run(stream); + } + + // (x - mean) * np.mean(dy * (x - mean), axis=axis) + // x_sub_saved_mean * dy_mul_x_sub_mean_mean_tile + Tensor tmp1; + { + tmp1.Resize(x->dims()); + tmp1.mutable_data(place); + const auto &runner = NpuOpRunner( + "Mul", {x_sub_saved_mean, dy_mul_x_sub_mean_mean_tile}, {tmp1}, {}); + runner.Run(stream); + } + + // (x - mean) * np.mean(dy * (x - mean), axis=axis) / (var + epsilon) + // tmp1 / (var + epsilon) + // tmp1 / var_ref_tile_add_epsilon + Tensor tmp2; + { + tmp2.Resize(x->dims()); + tmp2.mutable_data(place); + const auto &runner = + NpuOpRunner("Div", {tmp1, var_ref_tile_add_epsilon}, {tmp2}, {}); + runner.Run(stream); + } + + // dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), axis) / + // (var + epsilon) + // dy_sub_dy_mean - tmp2 + Tensor tmp3; + { + tmp3.Resize(x->dims()); + tmp3.mutable_data(place); + const auto &runner = + NpuOpRunner("Sub", {dy_sub_dy_mean, tmp2}, {tmp3}, {}); + runner.Run(stream); + } + + Tensor scale_tile_1; + { + scale_tile_1.Resize({C}); + scale_tile_1.mutable_data(place); + TensorCopySync(*scale, place, &scale_tile_1); + if (layout == framework::DataLayout::kNCHW) + scale_tile_1.Resize({1, C, 1, 1}); + else if (layout == framework::DataLayout::kNHWC) + scale_tile_1.Resize({1, 1, 1, C}); + } + + Tensor scale_tile; + { + framework::NPUAttributeMap attr_input = {{"multiples", multiples}}; + scale_tile.Resize(x->dims()); + scale_tile.mutable_data(place); + const auto &runner = + NpuOpRunner("TileD", {scale_tile_1}, {scale_tile}, attr_input); + runner.Run(stream); + } + + // scale * (dy - np.mean(dy, axis) - (x - mean) * np.mean(dy * (x - mean), + // axis) / (var + epsilon)) + // scale * tmp3 + Tensor dx_1; + { + dx_1.Resize(x->dims()); + dx_1.mutable_data(place); + + const auto &runner = NpuOpRunner("Mul", {scale_tile, tmp3}, {dx_1}, {}); + runner.Run(stream); + } + + // dx_1 / var_ref_tile_add_epsilon_sqrt + { + d_x->Resize(x->dims()); + d_x->mutable_data(place); + const auto &runner = NpuOpRunner( + "Div", {dx_1, var_ref_tile_add_epsilon_sqrt}, {*d_x}, {}); + runner.Run(stream); + } + } + + // cacl d_scale + if (d_scale) { + Tensor d_scale_2; + { + d_scale_2.Resize(x->dims()); + d_scale_2.mutable_data(place); + const auto &runner = NpuOpRunner( + "Div", {dy_mul_x_sub_mean_for_scale, var_ref_tile_add_epsilon_sqrt}, + {d_scale_2}, {}); + runner.Run(stream); + } + + { + framework::NPUAttributeMap attr_input = {{"keep_dims", 
false}, + {"axes", axes}}; + d_scale->mutable_data(place); + const auto &runner = + NpuOpRunner("ReduceSumD", {d_scale_2}, {*d_scale}, attr_input); + runner.Run(stream); + } + } + + // cacl d_bias + if (d_bias) { + if (d_y->type() == framework::proto::VarType::FP16) { + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + d_bias->mutable_data(place); + const auto &runner = + NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); + runner.Run(stream); + } else { + framework::NPUAttributeMap attr_input = {{"keep_dims", false}, + {"axes", axes}}; + d_bias->mutable_data(place); + const auto &runner = + NpuOpRunner("ReduceSumD", {*d_y}, {*d_bias}, attr_input); + runner.Run(stream); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + sync_batch_norm, + ops::SyncBatchNormNPUKernel); +REGISTER_OP_NPU_KERNEL( + sync_batch_norm_grad, + ops::SyncBatchNormNPUGradKernel); diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc index c85a1cbc671af1..95d7cb9e362c78 100644 --- a/paddle/fluid/operators/tile_op_npu.cc +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -16,7 +16,11 @@ limitations under the License. */ namespace paddle { namespace operators { -template + +using Tensor = framework::Tensor; +using NPUDeviceContext = platform::NPUDeviceContext; + +template class TileNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { @@ -92,18 +96,21 @@ class TileNPUKernel : public framework::OpKernel { std::vector temp(repeat_times.size(), 1); if (repeat_times == temp) { - framework::TensorCopy( - *in0, context.GetPlace(), - context.template device_context(), out0); + framework::TensorCopy(*in0, context.GetPlace(), + context.template device_context(), + out0); return; } - const auto& runner = - NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); - auto stream = - context.template device_context() - .stream(); - runner.Run(stream); + // const auto& runner = + // NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); + auto stream = context.template device_context().stream(); + NpuOpRunner runner; + runner.SetType("Tile") + .AddInput(*in0) + .AddInput(std::move(repeat_times)) + .AddOutput(*out0) + .Run(stream); } }; @@ -111,8 +118,9 @@ class TileNPUKernel : public framework::OpKernel { } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP_NPU_KERNEL( - tile, ops::TileNPUKernel, - ops::TileNPUKernel, - ops::TileNPUKernel); +REGISTER_OP_NPU_KERNEL(tile, ops::TileNPUKernel, ops::TileNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TileNPUKernel, +#endif + ops::TileNPUKernel, + ops::TileNPUKernel); diff --git a/paddle/fluid/operators/top_k_op_npu.cc b/paddle/fluid/operators/top_k_op_npu.cc index ca3a5f957685d9..a7d8fe01edd4cd 100644 --- a/paddle/fluid/operators/top_k_op_npu.cc +++ b/paddle/fluid/operators/top_k_op_npu.cc @@ -51,7 +51,9 @@ class TopkNPUKernel : public framework::OpKernel { indices->mutable_data(ctx.GetPlace()); // prepare assit - auto dim = input->dims().size(); + auto size = input->dims().size(); + // dim is the last dimension of input + auto dim = input->dims()[size - 1]; framework::Tensor assist_seq_tensor; assist_seq_tensor.Resize({2 * dim}); assist_seq_tensor.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/transpose_op_npu.cc b/paddle/fluid/operators/transpose_op_npu.cc index 
035ad5f3f314aa..7cc68e93c5d620 100644 --- a/paddle/fluid/operators/transpose_op_npu.cc +++ b/paddle/fluid/operators/transpose_op_npu.cc @@ -27,9 +27,12 @@ class TransposeNPUKernel : public framework::OpKernel { auto* x = ctx.Input("X"); auto* out = ctx.Output("Out"); std::vector axis = ctx.Attr>("axis"); - framework::NPUAttributeMap attr_input = {{"perm", axis}}; out->mutable_data(ctx.device_context().GetPlace()); - const auto& runner = NpuOpRunner("TransposeD", {*x}, {*out}, attr_input); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(*x) + .AddInput(std::move(axis)) + .AddOutput(*out); auto stream = ctx.template device_context() .stream(); @@ -51,9 +54,11 @@ class TransposeGradNPUKernel : public framework::OpKernel { reversed_axis[axis[i]] = i; } x_grad->mutable_data(ctx.GetPlace()); - framework::NPUAttributeMap attr_input = {{"perm", reversed_axis}}; - const auto& runner = - NpuOpRunner("TransposeD", {*out_grad}, {*x_grad}, attr_input); + NpuOpRunner runner; + runner.SetType("Transpose") + .AddInput(*out_grad) + .AddInput(std::move(reversed_axis)) + .AddOutput(*x_grad); auto stream = ctx.template device_context() .stream(); @@ -72,11 +77,17 @@ REGISTER_OP_NPU_KERNEL( ops::TransposeNPUKernel, ops::TransposeNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TransposeNPUKernel, +#endif ops::TransposeNPUKernel, ops::TransposeNPUKernel); REGISTER_OP_NPU_KERNEL(transpose2_grad, ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel, +#ifdef PADDLE_WITH_ASCEND_INT64 + ops::TransposeGradNPUKernel, +#endif ops::TransposeGradNPUKernel, ops::TransposeGradNPUKernel); diff --git a/paddle/fluid/operators/tril_triu_op_npu.cc b/paddle/fluid/operators/tril_triu_op_npu.cc index cdabc28255b518..6e7e03911370fd 100644 --- a/paddle/fluid/operators/tril_triu_op_npu.cc +++ b/paddle/fluid/operators/tril_triu_op_npu.cc @@ -10,7 +10,7 @@ Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and -limitations under the Licnse. */ +limitations under the License. 
*/ #include "paddle/fluid/operators/tril_triu_op.h" #include "paddle/fluid/operators/npu_op_runner.h" diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 99793ecd244cf2..66b0543771f4d3 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -403,7 +403,10 @@ class UniqueKernel : public framework::OpKernel { bool return_index = context.Attr("return_index"); bool return_inverse = context.Attr("return_inverse"); bool return_counts = context.Attr("return_counts"); - + if (x->numel() == 0) { + out->mutable_data(context.GetPlace()); + return; + } if (axis_vec.empty()) { framework::VisitDataTypeTiny( data_type, diff --git a/paddle/fluid/operators/unstack_op.h b/paddle/fluid/operators/unstack_op.h index 82118b692707fb..cfd4d6bce83643 100644 --- a/paddle/fluid/operators/unstack_op.h +++ b/paddle/fluid/operators/unstack_op.h @@ -149,7 +149,7 @@ class UnStackKernel : public framework::OpKernel { dx_datas[i] = dx[i]->mutable_data(ctx.GetPlace()); } auto dy_data = dy->data(); - + if (dy->numel() == 0) return; int pre = 1; for (int i = 0; i < axis; ++i) pre *= dy->dims()[i]; int total_num = dy->numel(); diff --git a/paddle/fluid/operators/viterbi_decode_op.cc b/paddle/fluid/operators/viterbi_decode_op.cc new file mode 100644 index 00000000000000..bf1cdeed65a842 --- /dev/null +++ b/paddle/fluid/operators/viterbi_decode_op.cc @@ -0,0 +1,109 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/viterbi_decode_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +class ViterbiDecodeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasInput("Transition"), "Input", "Transition", + "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasInput("Length"), "Input", "Length", "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasOutput("Scores"), "Output", "Scores", + "ViterbiDecode"); + OP_INOUT_CHECK(ctx->HasOutput("Path"), "Output", "Path", "ViterbiDecode"); + auto in_dims = ctx->GetInputDim("Input"); + PADDLE_ENFORCE_EQ(in_dims.size(), 3, + platform::errors::InvalidArgument( + "The rank of Input in ViterbiDecode must be 3. But " + "received Input's rank is %d.", + in_dims.size())); + auto length_dims = ctx->GetInputDim("Length"); + PADDLE_ENFORCE_EQ(length_dims.size(), 1, + platform::errors::InvalidArgument( + "The rank of Length in ViterbiDecode must be 1. But " + "received Length's rank is %d.", + length_dims.size())); + auto transition_dims = ctx->GetInputDim("Transition"); + PADDLE_ENFORCE_EQ( + transition_dims.size(), 2, + platform::errors::InvalidArgument( + "The rank of Transition in ViterbiDecode must be 2. 
But " + "received Transition's rank is %d.", + transition_dims.size())); + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_EQ( + in_dims[0], length_dims[0], + platform::errors::InvalidArgument( + "The batch size of Input and Length should be equal.")); + PADDLE_ENFORCE_EQ(in_dims[2], transition_dims[0], + platform::errors::InvalidArgument( + "The number of tags of Input (%d) and Transition " + "(%d) should be equal.", + transition_dims[0], in_dims[2])); + } + ctx->SetOutputDim("Scores", length_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } +}; + +class ViterbiDecodeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "The unary emission tensor. The shape of Input must be (batch_size, " + "sequence_length, num_tags). "); + AddInput("Transition", + "The transition matrix. The shape of Transition must be " + "(num_tags, num_tags). "); + AddInput("Length", + "The input length tensor storing the real length of each sequence " + "for correctness. The shape of Length MUST be (batch_size)."); + AddOutput("Scores", + "The scores tensor containing the score for the Viterbi " + "sequence. The shape of Scores MUST be (batch_size)."); + AddOutput("Path", + "The paths tensor containing the highest scoring tag indices. " + "The shape of Path MUST be (batch_size, sequence_length)."); + AddAttr("include_bos_eos_tag", + "If set to True, the last row and the last column of " + "transitions will be considered as the start tag.") + .SetDefault(true); + AddComment(R"DOC( +Decode the highest scoring sequence of tags with the Viterbi algorithm. +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace platform = paddle::platform; +REGISTER_OP_WITHOUT_GRADIENT(viterbi_decode, ops::ViterbiDecodeOp, + ops::ViterbiDecodeOpMaker); +REGISTER_OP_CPU_KERNEL( + viterbi_decode, ops::ViterbiDecodeKernel, + ops::ViterbiDecodeKernel); diff --git a/paddle/fluid/operators/viterbi_decode_op.cu b/paddle/fluid/operators/viterbi_decode_op.cu new file mode 100644 index 00000000000000..086ff05b084612 --- /dev/null +++ b/paddle/fluid/operators/viterbi_decode_op.cu @@ -0,0 +1,200 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at +http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/elementwise/elementwise_functor.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/viterbi_decode_op.h" + +#ifdef __NVCC__ +#include "cub/cub.cuh" +#endif +#ifdef __HIPCC__ +#include +namespace cub = hipcub; +#endif + +namespace paddle { +namespace operators { + +#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \ + case (1 << (log2_block_dim)): { \ + constexpr auto kBlockDim = (1 << (log2_block_dim)); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM_CASE(...) 
\ + FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__); + +int64_t ComputeBlockSize(int64_t col) { + if (col > 512) + return 1024; + else if (col > 256) + return 512; + else if (col > 128) + return 256; + else if (col > 64) + return 128; + else if (col > 32) + return 64; + else if (col > 16) + return 32; + else if (col > 8) + return 16; + else + return 8; +} + +template