diff --git a/CMakeLists.txt b/CMakeLists.txt index 4f6ed9de30efe4..83191254f1a229 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,7 +130,7 @@ if(WIN32) # NOTE(zhouwei25): GPU compile have too high memory utilization when parallel compiling, # For Visual Studio generators, /MP should be added. # For other generators like Ninja, it is not need to add /MP. - if("${CMAKE_GENERATOR}" STREQUAL "Visual Studio" AND NOT WITH_GPU) + if(CMAKE_GENERATOR MATCHES "Visual Studio" AND NOT WITH_GPU) math(EXPR PROCESS_MAX "${CPU_CORES} * 2 / 3") set(${flag_var} "${${flag_var}} /MP${PROCESS_MAX}") endif() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 8360761de6fb98..0f9739014d52bf 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -41,6 +41,7 @@ ExternalProject_Add( ${SHALLOW_CLONE} "${GFLAGS_DOWNLOAD_CMD}" PREFIX ${GFLAGS_PREFIX_DIR} + UPDATE_COMMAND "" SOURCE_DIR ${GFLAGS_SOURCE_DIR} BUILD_COMMAND ${BUILD_COMMAND} INSTALL_COMMAND ${INSTALL_COMMAND} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index d2bb1e62e83de3..b9dbe90a92e6f4 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -45,6 +45,7 @@ ExternalProject_Add( DEPENDS gflags PREFIX ${GLOG_PREFIX_DIR} SOURCE_DIR ${GLOG_SOURCE_DIR} + UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${GLOG_CMAKE_CXX_FLAGS} diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 9963237ff188cf..0a3b64e5d56821 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -79,22 +79,10 @@ ExternalProject_Add( -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG} -DDNNL_BUILD_TESTS=OFF -DDNNL_BUILD_EXAMPLES=OFF CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR} - BUILD_BYPRODUCTS ${MKLDNN_LIB} ) -ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) -ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") add_definitions(-DPADDLE_WITH_MKLDNN) - -# generate a static dummy target to track mkldnn dependencies -# for cc_library(xxx SRCS xxx.c DEPS mkldnn) -generate_dummy_static_lib(LIB_NAME "mkldnn" GENERATOR "mkldnn.cmake") - -TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB}) -ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT}) - # copy the real so.0 lib to install dir # it can be directly contained in wheel or capi if(WIN32) @@ -102,26 +90,33 @@ if(WIN32) file(TO_NATIVE_PATH ${MKLDNN_INSTALL_DIR} NATIVE_MKLDNN_INSTALL_DIR) file(TO_NATIVE_PATH ${MKLDNN_SHARED_LIB} NATIVE_MKLDNN_SHARED_LIB) - ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll ${NATIVE_MKLDNN_SHARED_LIB} /Y)) - add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > ${MKLDNN_INSTALL_DIR}/bin/exports.txt) - add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) - add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) - add_custom_command(TARGET ${MKLDNN_PROJECT} POST_BUILD VERBATIM - COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on) - add_custom_command(TARGET ${MKLDNN_PROJECT} 
POST_BUILD VERBATIM - COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_INSTALL_DIR}/bin/mkldnn.lib /machine:x64) + + ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_LIB} + COMMAND (copy ${NATIVE_MKLDNN_INSTALL_DIR}\\bin\\dnnl.dll ${NATIVE_MKLDNN_SHARED_LIB} /Y) + COMMAND dumpbin /exports ${MKLDNN_INSTALL_DIR}/bin/mkldnn.dll > ${MKLDNN_INSTALL_DIR}/bin/exports.txt + COMMAND echo LIBRARY mkldnn > ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def + COMMAND echo EXPORTS >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def + COMMAND echo off && (for /f "skip=19 tokens=4" %A in (${MKLDNN_INSTALL_DIR}/bin/exports.txt) do echo %A >> ${MKLDNN_INSTALL_DIR}/bin/mkldnn.def) && echo on + COMMAND lib /def:${MKLDNN_INSTALL_DIR}/bin/mkldnn.def /out:${MKLDNN_LIB} /machine:x64 + COMMENT "Generate mkldnn.lib manually--->" + DEPENDS ${MKLDNN_PROJECT} + VERBATIM) + ADD_CUSTOM_TARGET(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB}) else(WIN32) SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0) SET(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1) SET(MKLDNN_SHARED_LIB_2 ${MKLDNN_INSTALL_DIR}/libdnnl.so.2) - ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}) - ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1}) - ADD_CUSTOM_COMMAND(TARGET ${MKLDNN_PROJECT} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2}) + ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB_2} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1} + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2} + DEPENDS ${MKLDNN_PROJECT}) + ADD_CUSTOM_TARGET(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB_2}) endif(WIN32) + +# generate a static dummy target to track mkldnn dependencies +# for cc_library(xxx SRCS xxx.c DEPS mkldnn) +generate_dummy_static_lib(LIB_NAME "mkldnn" GENERATOR "mkldnn.cmake") + +TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_IOMP_LIB}) +ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT} mkldnn_cmd) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index a2b6ddadb625f6..8a9bc6e42c1464 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,16 +198,16 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() -if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) - SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) - SET(PROTOBUF_TAG v3.8.0) -elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) - SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) - SET(PROTOBUF_TAG v3.8.0) -else() - SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) - SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) -endif() + if(WITH_ASCEND AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) + elseif(WITH_ASCEND_CL AND NOT WITH_ASCEND_CXX11) + SET(PROTOBUF_REPOSITORY https://gitee.com/tianjianhe/protobuf.git) + SET(PROTOBUF_TAG v3.8.0) + else() + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) + SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) + endif() cache_third_party(${TARGET_NAME} REPOSITORY ${PROTOBUF_REPOSITORY} diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 69bd68c2778497..353cb5c72fdfb9 100644 --- 
a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -39,6 +39,7 @@ ExternalProject_Add( # to be modified without triggering incremental compilation, and the # third-party library version changes cannot be incorporated. # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html + UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 640e2e37ad434d..aa41173c81a22a 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -35,7 +35,7 @@ ELSE () ENDIF() SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") -SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210729") +SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210804") SET(XPU_XRE_URL "${XPU_BASE_URL}/${XPU_XRE_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XDNN_URL "${XPU_BASE_URL}/${XPU_XDNN_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) SET(XPU_XCCL_URL "${XPU_BASE_URL_WITHOUT_DATE}/20210623/${XPU_XCCL_DIR_NAME}.tar.gz" CACHE STRING "" FORCE) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 3dcf0b74f7940f..669875d81dfecc 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -205,6 +205,9 @@ copy(inference_lib_dist copy(inference_lib_dist SRCS ${PADDLE_SOURCE_DIR}/paddle/fluid/platform/float16.h DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) +copy(inference_lib_dist + SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h + DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/) # CAPI inference library for only inference set(PADDLE_INFERENCE_C_INSTALL_DIR "${CMAKE_BINARY_DIR}/paddle_inference_c_install_dir" CACHE STRING diff --git a/paddle/fluid/distributed/common/sparse_sharding_merge.h b/paddle/fluid/distributed/common/sparse_sharding_merge.h index 3f84b5c4b212e2..3bd36e65ba8521 100644 --- a/paddle/fluid/distributed/common/sparse_sharding_merge.h +++ b/paddle/fluid/distributed/common/sparse_sharding_merge.h @@ -21,7 +21,6 @@ #include #include -#include "boost/lexical_cast.hpp" #include "glog/logging.h" #include "paddle/fluid/distributed/common/utils.h" #include "paddle/fluid/framework/blocking_queue.h" @@ -36,8 +35,6 @@ constexpr int Q_SIZE = 10000; constexpr int BUCKET = 10; constexpr char XEOF[] = "EOF"; -using boost::lexical_cast; - inline double GetCurrentUS() { struct timeval time; gettimeofday(&time, NULL); @@ -208,8 +205,10 @@ class ShardingMerge { for (int x = 0; x < embedding_dim; ++x) { float v = 0.0; try { - v = lexical_cast(values_str[x]); - } catch (boost::bad_lexical_cast &e) { + v = std::stof(values_str[x]); + } catch (std::invalid_argument &e) { + VLOG(0) << " get unexpected line: " << line; + } catch (std::out_of_range &e) { VLOG(0) << " get unexpected line: " << line; } out->push_back(v); diff --git a/paddle/fluid/distributed/index_dataset/index_wrapper.cc b/paddle/fluid/distributed/index_dataset/index_wrapper.cc index 99fe4ca0c6d043..7a9691f3602e26 100644 --- a/paddle/fluid/distributed/index_dataset/index_wrapper.cc +++ b/paddle/fluid/distributed/index_dataset/index_wrapper.cc @@ -17,8 +17,6 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/io/fs.h" -#include -#include #include "paddle/fluid/distributed/index_dataset/index_wrapper.h" namespace paddle { @@ -65,7 +63,7 @@ int TreeIndex::Load(const std::string filename) { if (item.key() == ".tree_meta") { meta_.ParseFromString(item.value()); } else { - auto code = boost::lexical_cast(item.key()); + auto code = std::stoull(item.key()); IndexNode node; node.ParseFromString(item.value()); PADDLE_ENFORCE_NE(node.id(), 0, diff --git a/paddle/fluid/distributed/table/common_sparse_table.cc b/paddle/fluid/distributed/table/common_sparse_table.cc index e1223face0f54a..8b79b1c02fce5e 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.cc +++ b/paddle/fluid/distributed/table/common_sparse_table.cc @@ -15,7 +15,6 @@ #include "paddle/fluid/distributed/table/common_sparse_table.h" #include -#include "boost/lexical_cast.hpp" #include "glog/logging.h" #include "paddle/fluid/platform/enforce.h" @@ -50,8 +49,11 @@ void CommonSparseTable::ProcessALine(const std::vector& columns, float v = 0.0; try { - v = lexical_cast(va); - } catch (boost::bad_lexical_cast& e) { + v = std::stof(va); + } catch (std::invalid_argument& e) { + VLOG(0) << "id: " << id << " get unexpected value: " << va + << " and be reset to: 0.0"; + } catch (std::out_of_range& e) { VLOG(0) << "id: " << id << " get unexpected value: " << va << " and be reset to: 0.0"; } @@ -131,7 +133,7 @@ int64_t CommonSparseTable::LoadFromText( while (std::getline(file, line)) { auto values = paddle::string::split_string(line, "\t"); - auto id = lexical_cast(values[0]); + auto id = std::stoull(values[0]); if (id % pserver_num != pserver_id) { VLOG(3) << "will not load " << values[0] << " from " << valuepath @@ -150,10 +152,9 @@ int64_t CommonSparseTable::LoadFromText( VALUE* value_instant = block->GetValue(id); if (values.size() == 5) { - value_instant->count_ = lexical_cast(values[1]); - value_instant->unseen_days_ = lexical_cast(values[2]); - value_instant->is_entry_ = - static_cast(lexical_cast(values[3])); + value_instant->count_ = std::stoi(values[1]); + value_instant->unseen_days_ = std::stoi(values[2]); + value_instant->is_entry_ = static_cast(std::stoi(values[3])); } std::vector block_values = block->Get(id, meta.names, meta.dims); diff --git a/paddle/fluid/distributed/table/common_sparse_table.h b/paddle/fluid/distributed/table/common_sparse_table.h index ce3cc11686a480..a443710bf0fd82 100644 --- a/paddle/fluid/distributed/table/common_sparse_table.h +++ b/paddle/fluid/distributed/table/common_sparse_table.h @@ -33,7 +33,6 @@ #include "paddle/fluid/string/string_helper.h" #define PSERVER_SAVE_SUFFIX ".shard" -using boost::lexical_cast; namespace paddle { namespace distributed { diff --git a/paddle/fluid/distributed/table/ssd_sparse_table.cc b/paddle/fluid/distributed/table/ssd_sparse_table.cc index 5de6de3d2909d6..41eca72cf80717 100644 --- a/paddle/fluid/distributed/table/ssd_sparse_table.cc +++ b/paddle/fluid/distributed/table/ssd_sparse_table.cc @@ -310,7 +310,7 @@ int64_t SSDSparseTable::LoadFromText( while (std::getline(file, line)) { auto values = paddle::string::split_string(line, "\t"); - auto id = lexical_cast(values[0]); + auto id = std::stoull(values[0]); if (id % pserver_num != pserver_id) { VLOG(3) << "will not load " << values[0] << " from " << valuepath @@ -329,10 +329,9 @@ int64_t SSDSparseTable::LoadFromText( VALUE* value_instant = block->GetValue(id); if (values.size() == 5) { - value_instant->count_ = lexical_cast(values[1]); - value_instant->unseen_days_ = 
lexical_cast(values[2]); - value_instant->is_entry_ = - static_cast(lexical_cast(values[3])); + value_instant->count_ = std::stoi(values[1]); + value_instant->unseen_days_ = std::stoi(values[2]); + value_instant->is_entry_ = static_cast(std::stoi(values[3])); } std::vector block_values = block->Get(id, meta.names, meta.dims); diff --git a/paddle/fluid/extension/include/ext_op_meta_info.h b/paddle/fluid/extension/include/ext_op_meta_info.h index c400164c7543da..6f2528030e603d 100644 --- a/paddle/fluid/extension/include/ext_op_meta_info.h +++ b/paddle/fluid/extension/include/ext_op_meta_info.h @@ -19,8 +19,7 @@ limitations under the License. */ #include #include -#include - +#include "any.h" #include "ext_dll_decl.h" // NOLINT #include "ext_exception.h" // NOLINT #include "ext_tensor.h" // NOLINT @@ -83,7 +82,7 @@ inline std::string Vec(const std::string& t_name) { using KernelFunc = std::vector (*)(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs); + const std::vector& attrs); #define PD_SPECIALIZE_ComputeCallHelper(attr_type) \ template \ @@ -92,14 +91,14 @@ using KernelFunc = typename... PreviousArgs> \ static Return Compute(const std::vector& inputs, \ const std::vector>& vec_inputs, \ - const std::vector& attrs, \ + const std::vector& attrs, \ const PreviousArgs&... pargs) { \ try { \ - attr_type arg = boost::any_cast(attrs[attr_idx]); \ + attr_type arg = paddle::any_cast(attrs[attr_idx]); \ return ComputeCallHelper::template Compute< \ in_idx, vec_in_idx, attr_idx + 1>(inputs, vec_inputs, attrs, \ pargs..., arg); \ - } catch (boost::bad_any_cast&) { \ + } catch (paddle::bad_any_cast&) { \ PD_THROW( \ "Attribute cast error in custom operator. Expected " #attr_type \ " value."); \ @@ -117,7 +116,7 @@ template struct KernelFuncImpl { static Return Compute(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs) { + const std::vector& attrs) { return ComputeCallHelper>::template Compute<0, 0, 0>( inputs, vec_inputs, attrs); } @@ -132,7 +131,7 @@ struct KernelFuncImpl { typename... PreviousArgs> static Return Compute(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs, + const std::vector& attrs, const PreviousArgs&... pargs) { const Tensor& arg = inputs[in_idx]; return ComputeCallHelper::template Compute { typename... PreviousArgs> static Return Compute(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs, + const std::vector& attrs, const PreviousArgs&... pargs) { const std::vector& arg = vec_inputs[vec_in_idx]; return ComputeCallHelper::template Compute< @@ -189,7 +188,7 @@ struct KernelFuncImpl { template static Return Compute(const std::vector& inputs, const std::vector>& vec_inputs, - const std::vector& attrs, + const std::vector& attrs, const Args&... args) { return impl_fn(args...); } @@ -205,67 +204,67 @@ struct KernelFuncImpl { using InferShapeFunc = std::vector> (*)( const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, - const std::vector& attrs); - -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const std::vector& attrs, const PreviousArgs&... 
pargs) { \ - input_type arg = input_shapes[in_idx]; \ - return InferShapeCallHelper::template InferShape< \ - in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes, \ - attrs, pargs..., arg); \ - } \ + const std::vector& attrs); + +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPE(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = input_shapes[in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx + 1, vec_in_idx, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ } -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const std::vector& attrs, const PreviousArgs&... pargs) { \ - input_type arg = vec_input_shapes[vec_in_idx]; \ - return InferShapeCallHelper::template InferShape< \ - in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes, \ - attrs, pargs..., arg); \ - } \ +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_SHAPES(input_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + input_type arg = vec_input_shapes[vec_in_idx]; \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx + 1, attr_idx>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } \ } -#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type) \ - template \ - struct InferShapeCallHelper { \ - template \ - static Return InferShape( \ - const std::vector>& input_shapes, \ - const std::vector>>& \ - vec_input_shapes, \ - const std::vector& attrs, const PreviousArgs&... pargs) { \ - try { \ - attr_type arg = boost::any_cast(attrs[attr_idx]); \ - return InferShapeCallHelper::template InferShape< \ - in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes, \ - attrs, pargs..., arg); \ - } catch (boost::bad_any_cast&) { \ - PD_THROW( \ - "Attribute cast error in custom operator InferShapeFn. " \ - "Expected " #attr_type \ - " value. InferShapeFn's attribute list must be exactly same as " \ - "Forward " \ - "KernelFn's attribute list except std::vector " \ - "attribute."); \ - } \ - } \ +#define PD_SPECIALIZE_InferShapeCallHelper_FOR_ATTR(attr_type) \ + template \ + struct InferShapeCallHelper { \ + template \ + static Return InferShape( \ + const std::vector>& input_shapes, \ + const std::vector>>& \ + vec_input_shapes, \ + const std::vector& attrs, const PreviousArgs&... pargs) { \ + try { \ + attr_type arg = paddle::any_cast(attrs[attr_idx]); \ + return InferShapeCallHelper::template InferShape< \ + in_idx, vec_in_idx, attr_idx + 1>(input_shapes, vec_input_shapes, \ + attrs, pargs..., arg); \ + } catch (paddle::bad_any_cast&) { \ + PD_THROW( \ + "Attribute cast error in custom operator InferShapeFn. " \ + "Expected " #attr_type \ + " value. 
InferShapeFn's attribute list must be exactly same as " \ + "Forward " \ + "KernelFn's attribute list except std::vector " \ + "attribute."); \ + } \ + } \ } template @@ -276,7 +275,7 @@ struct InferShapeFuncImpl { static Return InferShape( const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, - const std::vector& attrs) { + const std::vector& attrs) { return InferShapeCallHelper>::template InferShape< 0, 0, 0>(input_shapes, vec_input_shapes, attrs); } @@ -314,7 +313,7 @@ struct InferShapeFuncImpl { static Return InferShape( const std::vector>& input_shapes, const std::vector>>& vec_input_shapes, - const std::vector& attrs, const Args&... args) { + const std::vector& attrs, const Args&... args) { return impl_fn(args...); } }; diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 08e912f52ccb57..419db670467a01 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -411,6 +411,7 @@ configure_file(commit.h.in commit.h) # to avoid exposing the path of the underlying file include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/extension/include) +include_directories(${PADDLE_SOURCE_DIR}/paddle/utils) if(WITH_ROCM) hip_library(custom_tensor SRCS ../extension/src/ext_tensor.cc DEPS lod_tensor memory enforce) @@ -427,6 +428,9 @@ else() cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor glog) endif() +#cc_binary(test_executor SRCS test_executor.cc DEPS executor op_registry ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ) +#cc_binary(new_executor SRCS new_exec_test.cc DEPS operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler) + set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader custom_operator) cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES}) diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index b1c5ff86d19790..7fef165f373969 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -34,6 +34,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/dynload/dynamic_loader.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/utils/any.h" namespace paddle { namespace framework { @@ -149,7 +150,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx, } } - std::vector custom_attrs; + std::vector custom_attrs; for (auto& attr_str : attrs) { auto attr_name_and_type = detail::ParseAttrStr(attr_str); auto attr_name = attr_name_and_type[0]; @@ -605,7 +606,7 @@ void RegisterOperatorWithMetaInfo( } } - std::vector custom_attrs; + std::vector custom_attrs; for (auto& attr_str : op_attrs) { auto attr_name_and_type = detail::ParseAttrStr(attr_str); auto attr_name = attr_name_and_type[0]; diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 1546027b794bb5..bbb781c8664baf 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -141,7 +141,7 @@ if(NOT APPLE AND NOT WIN32 AND (WITH_GPU OR WITH_ROCM)) endif() cc_library(build_strategy SRCS build_strategy.cc DEPS pass_builder ${IR_PASS_DEPS}) cc_test(build_strategy_test SRCS build_strategy_test.cc - DEPS build_strategy op_registry op_proto_maker graph) + DEPS build_strategy op_registry op_proto_maker graph string_helper) if (WITH_MKLDNN) target_link_libraries(build_strategy mkldnn_placement_pass) diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index b28c884429c179..1de6d26d05b9e4 100644 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -183,7 +183,7 @@ message DistributedStrategy { optional bool use_hierarchical_allreduce = 15 [ default = false ]; optional int32 hierarchical_allreduce_inter_nranks = 16 [ default = 1 ]; optional bool sync_batch_norm = 17 [ default = false ]; - optional bool fuse_all_reduce_ops = 18 [ default = false ]; + optional bool fuse_all_reduce_ops = 18 [ default = true ]; optional int32 fuse_grad_size_in_MB = 19 [ default = 32 ]; optional float fuse_grad_size_in_TFLOPS = 20 [ default = 50 ]; optional bool cudnn_exhaustive_search = 21 [ default = false ]; diff --git a/paddle/fluid/framework/fleet/fleet_wrapper.cc b/paddle/fluid/framework/fleet/fleet_wrapper.cc index 54a647a73cfebb..bb318e59e46e41 100644 --- a/paddle/fluid/framework/fleet/fleet_wrapper.cc +++ b/paddle/fluid/framework/fleet/fleet_wrapper.cc @@ -262,7 +262,7 @@ void FleetWrapper::HeterPushSparseVars( int64_t* ids = tensor->data(); int slot = 0; if (dump_slot) { - slot = boost::lexical_cast(sparse_key_names[i]); + slot = std::stoi(sparse_key_names[i]); } Variable* g_var = scope.FindVar(sparse_grad_names[i]); if (g_var == nullptr) { @@ -915,12 +915,17 @@ void FleetWrapper::PushSparseVarsWithLabelAsync( int slot = 0; if (dump_slot) { try { - slot = boost::lexical_cast(sparse_key_names[i]); - } catch (boost::bad_lexical_cast const& e) { + slot = std::stoi(sparse_key_names[i]); + } catch (std::invalid_argument const& e) { PADDLE_THROW(platform::errors::PreconditionNotMet( "sparse var's name: %s, doesn't support non-integer type name when " "dump_slot=True", sparse_key_names[i])); + } catch (std::out_of_range const& e) { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "sparse var's name: %s, integer type name out of range when " + "dump_slot=True", + sparse_key_names[i])); } } Variable* g_var = scope.FindVar(sparse_grad_names[i]); @@ -1121,7 +1126,7 @@ void 
FleetWrapper::PushSparseFromTensorWithLabelAsync( data[click_index] = static_cast(fea_labels.at(input_idx)); } if (dump_slot) { - int slot = boost::lexical_cast(input_names[index]); + int slot = std::stoi(input_names[index]); data[0] = static_cast(slot); } ++input_idx; diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 0107f5976499ce..384f80395c7784 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -59,7 +59,7 @@ cc_library(coalesce_grad_tensor_pass SRCS coalesce_grad_tensor_pass.cc DEPS grap pass_library(graph_to_program_pass base) pass_library(graph_viz_pass base) -pass_library(lock_free_optimize_pass base) +pass_library(lock_free_optimize_pass base DEPS string_helper) pass_library(fc_fuse_pass inference) pass_library(map_matmul_to_mul_pass inference) pass_library(attention_lstm_fuse_pass inference) diff --git a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc index 0e2bb3eaad536f..c280b7c32ed21d 100644 --- a/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc +++ b/paddle/fluid/framework/ir/adaptive_pool2d_convert_global_pass.cc @@ -60,6 +60,7 @@ AdaptivePool2dConvertGlobalPass::AdaptivePool2dConvertGlobalPass() { .IsStringIn({"NHWC", "NCHW"}) .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End(); } diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc index e4ac89f04ff679..3875d856d20bd6 100644 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc @@ -120,6 +120,7 @@ ConvAffineChannelFusePass::ConvAffineChannelFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") @@ -129,7 +130,7 @@ ConvAffineChannelFusePass::ConvAffineChannelFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "NHWC"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("affine_channel")) @@ -267,6 +268,7 @@ ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") @@ -276,7 +278,7 @@ ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "NHWC"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("affine_channel")) .AddInput("X") diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index c362eec34b0683..3a012b908482ac 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -620,6 +620,7 @@ ConvTransposeBNFusePass::ConvTransposeBNFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") @@ -663,6 +664,7 @@ ConvTransposeEltwiseAddBNFusePass::ConvTransposeEltwiseAddBNFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc 
b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc index 573436d393b855..3d1c1eb55aa079 100644 --- a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -68,6 +68,7 @@ ConvElementwiseAdd2ActFusePass::ConvElementwiseAdd2ActFusePass() { .AddAttr("paddings") .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 50c5671cb91a49..21e743e3587d80 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/variant.h" +#include "paddle/utils/any.h" DECLARE_bool(convert_all_blocks); @@ -147,8 +148,8 @@ class Graph { platform::errors::PreconditionNotMet( "%s attribute not registered for current graph.", attr_name)); try { - return *boost::any_cast(attrs_.at(attr_name)); - } catch (boost::bad_any_cast &) { + return *paddle::any_cast(attrs_.at(attr_name)); + } catch (paddle::bad_any_cast &) { PADDLE_THROW(platform::errors::InvalidArgument( "Invalid attribute type of %s, expected: %s, received: %s.", attr_name, platform::demangle(typeid(AttrType *).name()), // NOLINT @@ -426,7 +427,7 @@ class Graph { const Graph *main_graph_; // not owned. std::vector> sub_graphs_; - std::map attrs_; + std::map attrs_; std::map> attr_dels_; std::map> nodes_; std::unordered_set node_set_; diff --git a/paddle/fluid/framework/ir/lock_free_optimize_pass.h b/paddle/fluid/framework/ir/lock_free_optimize_pass.h index 26ec61fd36eb3c..93b6396bf7f310 100644 --- a/paddle/fluid/framework/ir/lock_free_optimize_pass.h +++ b/paddle/fluid/framework/ir/lock_free_optimize_pass.h @@ -17,10 +17,9 @@ #include #include -#include - #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" +#include "paddle/fluid/string/string_helper.h" namespace paddle { namespace framework { @@ -109,7 +108,7 @@ class LockFreeOptimizePass : public Pass { "Input argument node cannot be nullptr.")); return node->NodeType() == Node::Type::kVariable && - boost::algorithm::ends_with(node->Name(), name); + paddle::string::ends_with(node->Name(), name); } inline bool IsVarNameContains(ir::Node* node, const std::string& name) const { diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index a7514038d400b6..41539a05b37177 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -47,6 +47,7 @@ ConvBiasFusePass::ConvBiasFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") @@ -56,7 +57,7 @@ ConvBiasFusePass::ConvBiasFusePass() { .IsType>() .End() .AddAttr("data_format") - .IsStringIn({"NCHW", "NHWC"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); AddOpCompat(OpCompat("elementwise_add")) @@ -110,6 +111,7 @@ Conv2DTransposeBiasFusePass::Conv2DTransposeBiasFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") @@ -135,6 +137,7 @@ Conv3DBiasFusePass::Conv3DBiasFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + 
.IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("groups") diff --git a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc index bd65ad8e643785..b07cc58959faa0 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.cc @@ -158,11 +158,6 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( Node* elementwise_add_op; Node* elementwise_add_identity; Node* elementwise_add_out; - if (!pass_->IsCompat(subgraph, graph)) { - LOG(WARNING) - << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; - return; - } std::tie(conv_op, conv_input, conv_filter, conv_output) = get_node_from_conv_op(subgraph); @@ -175,6 +170,12 @@ void ResidualConnectionMKLDNNFusePass::IdentityFuseHandle::operator()( if (HasFusedActivation(conv_op)) return; + if (!pass_->IsCompat(subgraph, graph)) { + LOG(WARNING) + << "conv_elementwise_add_mkldnn_fuse_pass in op compat failed."; + return; + } + conv_op->Op()->SetInput("ResidualData", {elementwise_add_identity->Name()}); conv_op->Op()->SetOutput("Output", {elementwise_add_out->Name()}); conv_op->Op()->SetAttr("fuse_residual_connection", true); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 2483a506a8f934..2b9419a5502f1c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -77,7 +77,7 @@ CPUQuantizeSquashPass::CPUQuantizeSquashPass() { .End() .AddAttr("data_format") .IsOptional() - .IsStringIn({"NCHW", "NHWC"}) + .IsStringIn({"NCHW", "NHWC", "AnyLayout"}) .End(); } diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt index f945ddbd5d6a31..6764799d828661 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/CMakeLists.txt @@ -18,4 +18,4 @@ cc_library(fuse_all_reduce_op_pass SRCS fuse_all_reduce_op_pass.cc DEPS graph gr cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS all_reduce_op_handle graph graph_helper pass) cc_library(backward_optimizer_op_deps_pass SRCS backward_optimizer_op_deps_pass.cc DEPS graph graph_helper pass) cc_library(add_reader_dependency_pass SRCS add_reader_dependency_pass.cc DEPS graph graph_helper pass) -cc_library(fix_op_run_order_pass SRCS fix_op_run_order_pass DEPS graph graph_helper multi_devices_helper pass op_handle_base eager_deletion_op_handle) +cc_library(fix_op_run_order_pass SRCS fix_op_run_order_pass.cc DEPS graph graph_helper multi_devices_helper pass op_handle_base eager_deletion_op_handle) diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d0568f39ef6a45..54bd4376c6e5cb 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" - +#include "paddle/utils/any.h" namespace paddle { namespace framework { class OpDesc; @@ -104,8 +104,8 @@ class Node { template T& Wrapper() { try { - return *boost::any_cast(wrapper_); - } catch (boost::bad_any_cast&) { + return *paddle::any_cast(wrapper_); + } catch (paddle::bad_any_cast&) { PADDLE_THROW(platform::errors::InvalidArgument( "Invalid wrapper type error, expected %s, actual %s.", typeid(T).name(), wrapper_type_.name())); @@ -277,7 +277,7 @@ class Node { Node() = delete; - boost::any wrapper_; + paddle::any wrapper_; std::function wrapper_deleter_; std::type_index wrapper_type_ = std::type_index(typeid(void)); diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 8fb96bec9cbd56..fecdfc404e6dca 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/platform/variant.h" +#include "paddle/utils/any.h" namespace paddle { namespace framework { @@ -73,8 +74,8 @@ class Pass { platform::errors::InvalidArgument( "Attribute %s not registered for pass.", attr_name)); try { - return *boost::any_cast(attrs_.at(attr_name)); - } catch (boost::bad_any_cast &) { + return *paddle::any_cast(attrs_.at(attr_name)); + } catch (paddle::bad_any_cast &) { auto TypeToString = [](const std::type_info &info) -> std::string { if (std::type_index(info) == std::type_index(typeid(bool *))) { return "bool"; @@ -166,7 +167,7 @@ class Pass { // Pass doesn't take ownership. PassRegistrar should delete default_attrs void RegisterDefaultPassAttrs( - std::map default_attr_values) { + std::map default_attr_values) { for (auto const &attr_name : default_attr_values) { default_pass_attrs_.insert(attr_name.first); } @@ -180,7 +181,7 @@ class Pass { std::unordered_set required_pass_attrs_; std::unordered_set default_pass_attrs_; std::unordered_set required_graph_attrs_; - std::map attrs_; + std::map attrs_; std::map> attr_dels_; }; @@ -290,7 +291,7 @@ struct PassRegistrar : public Registrar { private: std::unordered_set required_pass_attrs_; std::unordered_set required_graph_attrs_; - std::map default_attr_values_; + std::map default_attr_values_; std::map> default_attr_dels_; }; diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 068a50a1dc0e9a..b48c8c6e70a939 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -243,6 +243,7 @@ QuantDequantFusePass::QuantDequantFusePass() { .IsType>() .End() .AddAttr("padding_algorithm") + .IsOptional() .IsStringIn({"EXPLICIT", "SAME", "VALID"}) .End() .AddAttr("data_format") diff --git a/paddle/fluid/framework/new_exec.h b/paddle/fluid/framework/new_exec.h new file mode 100644 index 00000000000000..defa7a967336b5 --- /dev/null +++ b/paddle/fluid/framework/new_exec.h @@ -0,0 +1,629 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include +#include + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/new_exec_util.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" + +// USE_OP(fill_constant); +// USE_OP(elementwise_add); + +// using namespace std; + +namespace paddle { +namespace framework { + +using std::cerr; +using std::endl; + +using OpKernelComputeFunc = std::function; +using OpKernelMap = + std::unordered_map; + +framework::ProgramDesc load_from_file(const std::string& file_name) { + std::ifstream fin(file_name, std::ios::in | std::ios::binary); + fin.seekg(0, std::ios::end); + std::string buffer(fin.tellg(), ' '); + fin.seekg(0, std::ios::beg); + fin.read(&buffer[0], buffer.size()); + fin.close(); + + ProgramDesc program_desc(buffer); + return program_desc; +} + +struct OpKernelFunc { + OpKernelComputeFunc compute_func_; + OperatorBase* operator_base_; +}; + +struct VariableMetaInfo { + int var_ref_count_; +}; + +struct VariableScope { + std::vector var_list; + std::map name2id; + std::vector vec_meta_info_; +}; + +struct NextInstruction { + std::vector direct_run_; +}; + +struct EventInter {}; + +struct InstructionInfo { + std::vector dependecy_count_; +}; + +struct EventRun { + EventInter event_inter; + std::vector same_device_run_; + std::vector synchronized_run; +}; + +struct Instruction { + OpKernelFunc kernel_func_; + std::map> input_index_; + std::map> output_index_; + + std::vector gc_check_var_list; + NextInstruction next_instruction_; + std::vector vec_event_list_; +}; + +struct OpFuncNode { + // int unsed; + std::map> input_index; + std::map> output_index; + + OpKernelComputeFunc kernel_func_; +}; + +int convert(const platform::Place& place) { + if (is_cpu_place(place)) { + return 0; + } + if (is_gpu_place(place)) { + return 1; + } + + return -1; +} + +std::vector merge_vec(const std::vector& first, + const std::vector& second) { + std::vector out(first.size() + second.size()); + std::merge(first.begin(), first.end(), second.begin(), second.end(), + out.begin()); + + std::vector::iterator it; + it = std::unique(out.begin(), out.end()); + + out.resize(std::distance(out.begin(), it)); + + return out; +} + +void build_variable_outer_scope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope, Scope* outer_scope) { + auto& global_block = pdesc.Block(0); + + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + auto v = outer_scope->Var(var->Name()); + + if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) { + 
var_scope->name2id[var->Name()] = var_scope->var_list.size(); + } + + InitializeVariable(v, var->GetType()); + var_scope->var_list.push_back(v); + } +} + +void build_variable_scope(const framework::ProgramDesc& pdesc, + VariableScope* var_scope) { + auto& global_block = pdesc.Block(0); + + for (auto& var : global_block.AllVars()) { + if (var->Name() == framework::kEmptyVarName) { + continue; + } + + if (var_scope->name2id.find(var->Name()) == var_scope->name2id.end()) { + var_scope->name2id[var->Name()] = var_scope->var_list.size(); + } + + auto v = new Variable(); + InitializeVariable(v, var->GetType()); + var_scope->var_list.push_back(v); + } +} + +void build_op_func_list(const framework::ProgramDesc& pdesc, + std::vector* op_list, + std::vector* vec_func_list, + VariableScope* var_scope, + const platform::Place& place) { + auto& global_block = pdesc.Block(0); + + for (auto& op : global_block.AllOps()) { + VLOG(3) << op->Type(); + // << op->Type() << endl; + + auto& info = OpInfoMap::Instance().Get(op->Type()); + + const VariableNameMap& inputs_names = op->Inputs(); + const VariableNameMap& outputs_names = op->Outputs(); + AttributeMap op_attr_map = op->GetAttrMap(); + + if (info.Checker() != nullptr) { + info.Checker()->Check(&op_attr_map); + } + auto op_base = + info.Creator()(op->Type(), inputs_names, outputs_names, op_attr_map); + + OpFuncNode op_func_node; + + VariableValueMap ins_map; + std::map> ins_name2id; + for (auto& var_name_item : inputs_names) { + std::vector input_vars; + std::vector vec_ids; + input_vars.reserve(var_name_item.second.size()); + for (auto& var_name : var_name_item.second) { + auto it = var_scope->name2id.find(var_name); + assert(it != var_scope->name2id.end()); + input_vars.push_back(var_scope->var_list[it->second]); + vec_ids.push_back(it->second); + } + ins_map[var_name_item.first] = input_vars; + ins_name2id[var_name_item.first] = vec_ids; + } + + VariableValueMap outs_map; + std::map> outs_name2id; + for (auto& var_name_item : outputs_names) { + std::vector output_vars; + std::vector vec_ids; + output_vars.reserve(var_name_item.second.size()); + for (auto& var_name : var_name_item.second) { + auto it = var_scope->name2id.find(var_name); + assert(it != var_scope->name2id.end()); + output_vars.push_back(var_scope->var_list[it->second]); + vec_ids.push_back(it->second); + } + outs_map[var_name_item.first] = output_vars; + outs_name2id[var_name_item.first] = vec_ids; + } + + op_func_node.input_index = ins_name2id; + op_func_node.output_index = outs_name2id; + RuntimeContext runtime_context({}, {}); + runtime_context.inputs.swap(ins_map); + runtime_context.outputs.swap(outs_map); + RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context); + static_cast(op_base)->InferShape( + &infer_shape_ctx); + auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); + auto kernels_iter = all_op_kernels.find(op->Type()); + PADDLE_ENFORCE_NE( + kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels which are registered in the %s operator.", + op->Type())); + + OpKernelMap& kernels = kernels_iter->second; + // auto place = platform::CPUPlace(); + // auto place = platform::CUDAPlace(0); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + Scope scope; + auto exec_ctx = + ExecutionContext(*op_base, scope, *dev_ctx, runtime_context); + auto expected_kernel_key = + dynamic_cast(op_base) + ->GetExpectedKernelType(exec_ctx); + + VariableValueMap& ins_map_temp = 
runtime_context.inputs; + + for (auto& var_name_item : ins_map_temp) { + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto var = var_name_item.second[i]; + auto tensor_in = static_cast(&(var->Get())); + if (!tensor_in->IsInitialized()) { + continue; + } + auto kernel_type_for_var = + static_cast(op_base) + ->GetKernelTypeForVar(var_name_item.first, *tensor_in, + expected_kernel_key); + if (!platform::is_same_place(kernel_type_for_var.place_, + expected_kernel_key.place_)) { + // need trans place + // 1. add var in scope + // 2. add copy op + std::string new_var_name = + "temp_1" + std::to_string(var_scope->var_list.size() + 1); + auto v = new Variable(); + v->GetMutable(); + var_scope->name2id[new_var_name] = var_scope->var_list.size(); + var_scope->var_list.push_back(v); + + VariableNameMap copy_in_map; + auto x_iter = inputs_names.find(var_name_item.first); + copy_in_map["X"] = {x_iter->second[i]}; + VariableNameMap copy_out_map; + copy_out_map["Out"] = {new_var_name}; + AttributeMap attr_map; + attr_map["dst_place_type"] = convert(place); + + std::map> copy_ins_name2id; + copy_ins_name2id["X"] = ins_name2id[var_name_item.first]; + std::map> copy_out_name2id; + copy_out_name2id["Out"] = {var_scope->name2id[new_var_name]}; + + op_func_node.input_index[var_name_item.first][i] = + var_scope->name2id[new_var_name]; + + VariableValueMap copy_ins_value_map; + copy_ins_value_map["X"] = {var}; + VariableValueMap copy_outs_value_map; + copy_outs_value_map["Out"] = {v}; + + auto& copy_info = OpInfoMap::Instance().Get("memcpy"); + auto copy_op = copy_info.Creator()("memcpy", copy_in_map, + copy_out_map, attr_map); + OpFuncNode copy_op_func_node; + copy_op_func_node.input_index = copy_ins_name2id; + copy_op_func_node.output_index = copy_out_name2id; + + RuntimeContext copy_runtime_context({}, {}); + copy_runtime_context.inputs.swap(copy_ins_value_map); + copy_runtime_context.outputs.swap(copy_outs_value_map); + RuntimeInferShapeContext copy_infer_shape_ctx(*copy_op, + copy_runtime_context); + static_cast(copy_op) + ->InferShape(©_infer_shape_ctx); + auto& all_op_kernels = OperatorWithKernel::AllOpKernels(); + auto kernels_iter = all_op_kernels.find("memcpy"); + PADDLE_ENFORCE_NE(kernels_iter, all_op_kernels.end(), + platform::errors::Unavailable( + "There are no kernels which are registered in " + "the memcpy operator.")); + + OpKernelMap& kernels = kernels_iter->second; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place); + Scope scope; + auto copy_exec_ctx = + ExecutionContext(*copy_op, scope, *dev_ctx, copy_runtime_context); + auto expected_kernel_key = + dynamic_cast(copy_op) + ->GetExpectedKernelType(copy_exec_ctx); + auto kernel_iter = kernels.find(expected_kernel_key); + copy_op_func_node.kernel_func_ = + OpKernelComputeFunc(kernel_iter->second); + copy_op_func_node.kernel_func_(copy_exec_ctx); + op_list->push_back(copy_op); + vec_func_list->push_back(copy_op_func_node); + + var_name_item.second[i] = v; + } + } + } + + op_list->push_back(op_base); + + auto kernel_iter = kernels.find(expected_kernel_key); + PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), + platform::errors::NotFound( + "Operator (%s) does not have kernel for %s.", + op->Type(), KernelTypeToString(expected_kernel_key))); + + op_func_node.kernel_func_ = OpKernelComputeFunc(kernel_iter->second); + op_func_node.kernel_func_(exec_ctx); + vec_func_list->push_back(op_func_node); + } +} + +class InterpreterCore { + public: + InterpreterCore(const 
platform::Place& place, const ProgramDesc& prog, + const ProgramDesc& startup_prog, Scope* scope) + : place_(place), prog_(prog), outer_scope_(scope) { + paddle::framework::InitDevices(); + + is_build_ = false; + + if (outer_scope_ != nullptr) { + auto name_list = outer_scope_->LocalVarNames(); + for (auto name : name_list) { + auto v = outer_scope_->Var(name); + if (global_scope.name2id.find(name) == global_scope.name2id.end()) { + global_scope.name2id[name] = global_scope.var_list.size(); + } + + global_scope.var_list.push_back(v); + } + } + + paddle::framework::build_variable_outer_scope(startup_prog, &global_scope, + outer_scope_); + + std::vector vec_func_list; + std::vector op_list; + paddle::framework::build_op_func_list( + startup_prog, &op_list, &vec_func_list, &global_scope, place_); + // add variable to outer_scope + } + void run(const std::vector& vec_name, + const std::vector& vec_tensor, + const std::vector& vec_fetch_name, + std::vector* vec_out) { + if (is_build_ == false) { + paddle::framework::build_variable_scope(prog_, &global_scope); + } + for (size_t i = 0; i < vec_name.size(); ++i) { + auto it = global_scope.name2id.find(vec_name[i]); + assert(it != global_scope.name2id.end()); + + auto feed_tensor = + global_scope.var_list[it->second]->GetMutable(); + feed_tensor->ShareDataWith(vec_tensor[i]); + } + + if (is_build_ == false) { + paddle::framework::build_op_func_list(prog_, &op_list, &vec_func_list, + &global_scope, place_); + is_build_ = true; + // convert vec func_list to graph + convert(); + } else { + exec_instruction_list(vec_instruction_, global_scope, place_); + } + + for (size_t i = 0; i < vec_fetch_name.size(); ++i) { + auto it = global_scope.name2id.find(vec_fetch_name[i]); + assert(it != global_scope.name2id.end()); + PADDLE_ENFORCE_NE(it, global_scope.name2id.end(), + platform::errors::NotFound( + "Can't find (%d) the fetch var (%s) in scope", i, + vec_fetch_name[i])); + + auto fetch_tensor = + global_scope.var_list[it->second]->GetMutable(); + + if (platform::is_gpu_place(fetch_tensor->place())) { + Tensor out; + platform::DeviceContextPool& pool = + platform::DeviceContextPool::Instance(); + auto* dev_ctx = pool.Get(place_); + dev_ctx->Wait(); + TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out); + dev_ctx->Wait(); + vec_out->push_back(out); + } else { + Tensor out; + TensorCopySync(*fetch_tensor, platform::CPUPlace(), &out); + vec_out->push_back(out); + } + } + } + + private: + void convert() { + input_var2op_info_.resize(global_scope.var_list.size()); + + vec_instruction_.reserve(vec_func_list.size()); + dependecy_count_.resize(vec_func_list.size()); + global_scope.vec_meta_info_.resize(global_scope.var_list.size()); + for (size_t i = 0; i < vec_func_list.size(); ++i) { + Instruction temp_inst; + temp_inst.kernel_func_.compute_func_ = vec_func_list[i].kernel_func_; + temp_inst.kernel_func_.operator_base_ = op_list[i]; + temp_inst.input_index_ = vec_func_list[i].input_index; + temp_inst.output_index_ = vec_func_list[i].output_index; + + std::vector gc_check_input_list; + for (auto& item : vec_func_list[i].input_index) { + for (auto id : item.second) { + input_var2op_info_[id].push_back(i); + gc_check_input_list.push_back(id); + } + } + std::sort(gc_check_input_list.begin(), gc_check_input_list.end()); + auto last = + std::unique(gc_check_input_list.begin(), gc_check_input_list.end()); + gc_check_input_list.erase(last, gc_check_input_list.end()); + for (auto var_id : gc_check_input_list) { + global_scope.vec_meta_info_[var_id].var_ref_count_++; + 
      }

+      temp_inst.gc_check_var_list.swap(gc_check_input_list);
+
+      vec_instruction_.push_back(temp_inst);
+    }
+
+    for (size_t i = 0; i < vec_instruction_.size(); ++i) {
+      std::vector<size_t> vec_temp;
+      for (auto& item : vec_instruction_[i].output_index_) {
+        for (auto id : item.second) {
+          vec_temp = merge_vec(vec_temp, input_var2op_info_[id]);
+        }
+      }
+
+      // In a Program, op order is very important information.
+      // An op can only add ops after itself as its next ops.
+      std::vector<size_t> filter_next;
+      filter_next.reserve(vec_temp.size());
+      for (auto item : vec_temp) {
+        if (item > i) {
+          filter_next.push_back(item);
+        }
+      }
+      vec_instruction_[i].next_instruction_.direct_run_ = filter_next;
+
+      // check output
+      for (auto& item : vec_instruction_[i].output_index_) {
+        for (auto id : item.second) {
+          if (input_var2op_info_[id].size() == 0) {
+            // output var is not used by any kernel
+            vec_instruction_[i].gc_check_var_list.push_back(id);
+            global_scope.vec_meta_info_[id].var_ref_count_++;
+          }
+        }
+      }
+
+      for (auto inst_id : filter_next) {
+        dependecy_count_[inst_id]++;
+      }
+    }
+  }
+
+  void run_instr(const Instruction& instr_node, const VariableScope& var_scope,
+                 const platform::Place& place) {
+    auto op_base = instr_node.kernel_func_.operator_base_;
+    // build runtime context
+    VariableValueMap ins_map;
+    for (auto& var_name_item : instr_node.input_index_) {
+      std::vector<Variable*> input_vars;
+
+      input_vars.reserve(var_name_item.second.size());
+      for (auto& id : var_name_item.second) {
+        input_vars.emplace_back(var_scope.var_list[id]);
+      }
+      ins_map.emplace(var_name_item.first, std::move(input_vars));
+    }
+
+    VariableValueMap outs_map;
+    for (auto& var_name_item : instr_node.output_index_) {
+      std::vector<Variable*> out_vars;
+
+      out_vars.reserve(var_name_item.second.size());
+      for (auto& id : var_name_item.second) {
+        out_vars.emplace_back(var_scope.var_list[id]);
+      }
+      outs_map.emplace(var_name_item.first, std::move(out_vars));
+    }
+
+    RuntimeContext runtime_context({}, {});
+    runtime_context.inputs.swap(ins_map);
+    runtime_context.outputs.swap(outs_map);
+
+    RuntimeInferShapeContext infer_shape_ctx(*op_base, runtime_context);
+
+    static_cast<const framework::OperatorWithKernel*>(op_base)->InferShape(
+        &infer_shape_ctx);
+
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.Get(place);
+    Scope scope;
+
+    auto exec_context =
+        ExecutionContext(*op_base, scope, *dev_ctx, runtime_context);
+
+    instr_node.kernel_func_.compute_func_(exec_context);
+  }
+
+  void exec_instruction_list(const std::vector<Instruction>& vec_instr,
+                             const VariableScope& var_scope,
+                             const platform::Place& place) {
+    std::queue<size_t> working_queue;
+    auto working_dependecy_count = dependecy_count_;
+    for (size_t i = 0; i < dependecy_count_.size(); ++i) {
+      if (dependecy_count_[i] == 0) {
+        working_queue.push(i);
+      }
+    }
+
+    auto working_var_ref = global_scope.vec_meta_info_;
+
+    size_t run_op_number = 0;
+    while (!working_queue.empty()) {
+      auto instr_id = working_queue.front();
+      working_queue.pop();
+      auto& instr_node = vec_instr[instr_id];
+      run_instr(instr_node, var_scope, place);
+
+      auto& next_instr = instr_node.next_instruction_.direct_run_;
+      ++run_op_number;
+
+      for (auto next_i : next_instr) {
+        --working_dependecy_count[next_i];
+        if (working_dependecy_count[next_i] == 0) {
+          working_queue.push(next_i);
+        }
+      }
+
+      // GC information
+
+      auto& gc_check_list = instr_node.gc_check_var_list;
+      for (auto var_id : gc_check_list) {
+        --working_var_ref[var_id].var_ref_count_;
+      }
+    }
+
+    for (size_t i = 0; i < working_var_ref.size(); ++i) {
+      if
(working_var_ref[i].var_ref_count_ != 0) { + cerr << " var ref is not zero " << i << endl; + } + } + } + + const platform::Place& place_; + const ProgramDesc& prog_; + paddle::framework::VariableScope global_scope; + std::vector vec_func_list; + std::vector op_list; + + bool is_build_; + + std::vector vec_instruction_; + + InstructionInfo instruction_info_; + + std::vector dependecy_count_; + std::vector ref_coun_info; + std::vector> input_var2op_info_; + + Scope* outer_scope_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_exec_test.cc b/paddle/fluid/framework/new_exec_test.cc new file mode 100644 index 00000000000000..7bfb6b6540cff8 --- /dev/null +++ b/paddle/fluid/framework/new_exec_test.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/device_context.h" + +#include "paddle/fluid/pybind/pybind.h" + +#include "gperftools/profiler.h" +#include "paddle/fluid/framework/new_exec.h" +#include "paddle/fluid/platform/init.h" + +int main() { + paddle::framework::InitDevices(); + paddle::framework::VariableScope global_scope; + auto place = paddle::platform::CUDAPlace(0); + auto test_prog = paddle::framework::load_from_file("lm_startup_program"); + { + paddle::framework::build_variable_scope(test_prog, &global_scope); + + std::vector vec_func_list; + std::vector op_list; + paddle::framework::build_op_func_list(test_prog, op_list, vec_func_list, + &global_scope, place); + + // paddle::framework::exec_op_func_list( vec_func_list, op_list, + // global_scope, place ); + } + + cerr << "run main" << endl; + auto main_prog = paddle::framework::load_from_file("lm_main_program"); + + paddle::framework::build_variable_scope(main_prog, &global_scope); + + std::vector vec_main_func_list; + std::vector op_main_list; + paddle::framework::build_op_func_list( + main_prog, op_main_list, vec_main_func_list, &global_scope, place); + paddle::framework::Scope scope; + paddle::framework::InterpreterCore interp_core(place, main_prog, test_prog, + &scope); + auto start = std::chrono::steady_clock::now(); + ProfilerStart("new_executor.prof"); + for (size_t i = 0; i < 2320; ++i) { + if (i % 200 == 0) { + cerr << i << endl; + } + // paddle::framework::exec_op_func_list( vec_main_func_list, op_main_list, + // global_scope, place ); + std::vector vec_out; + interp_core.run({}, {}, {}, vec_out); + } + ProfilerStop(); + auto end = 
std::chrono::steady_clock::now(); + std::chrono::duration diff = end - start; + + cerr << "time cost " << diff.count() << endl; + + return 1; +} diff --git a/paddle/fluid/framework/new_exec_util.h b/paddle/fluid/framework/new_exec_util.h new file mode 100644 index 00000000000000..1783b9be74becf --- /dev/null +++ b/paddle/fluid/framework/new_exec_util.h @@ -0,0 +1,472 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/************************************************************************* + > File Name: new_exec_util.h + > Author: guanshanshan@baidu.com + > Created Time: Fri 23 Jul 2021 06:19:19 AM UTC + ************************************************************************/ + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/op_info.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/init.h" + +namespace paddle { +namespace framework { + +class RuntimeInferShapeContext : public InferShapeContext { + public: + RuntimeInferShapeContext(const OperatorBase& op, const RuntimeContext& ctx) + : op_(op), ctx_(ctx) {} + + bool HasInput(const std::string& name) const override { + // has only one input + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end()) { + return false; + } + const auto& in = it->second; + if (in.size() == 0) return false; + PADDLE_ENFORCE_EQ( + in.size(), 1UL, + platform::errors::InvalidArgument( + "Input %s should not contain more than one inputs.", name)); + return in[0] != nullptr; + } + + bool HasOutput(const std::string& name) const override { + // has only one output + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end()) { + return false; + } + const auto& out = it->second; + if (out.size() == 0) { + return false; + } + PADDLE_ENFORCE_EQ( + out.size(), 1UL, + platform::errors::InvalidArgument( + "Output %s should not contain more than one outputs.", name)); + return out[0] != nullptr; + } + + bool HasInputs(const std::string& name) const override { + const auto& ins = ctx_.inputs; + auto it = ins.find(name); + if (it == ins.end() || it->second.empty()) { + return false; + } + for (auto& input : it->second) { + if (input == nullptr) { + return false; + } + } + return true; + } + + bool HasOutputs(const std::string& name) const override { + const auto& outs = ctx_.outputs; + auto it = outs.find(name); + if (it == outs.end() || it->second.empty()) { + return false; + } + for (auto& output : 
it->second) { + if (output == nullptr) { + return false; + } + } + return true; + } + + AttrReader Attrs() const override { return AttrReader(op_.Attrs()); } + + std::vector Inputs(const std::string& name) const override { + return op_.Inputs(name); + } + + std::vector Outputs(const std::string& name) const override { + return op_.Outputs(name); + } + + std::string GetInputNameByIdx(size_t idx) const override { + auto& op_proto = + paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; + PADDLE_ENFORCE_LT(idx, op_proto->inputs().size(), + platform::errors::OutOfRange( + "The index should be less than the size of inputs of " + "operator %s, but got index is %d and size is %d", + op_.Type(), idx, op_proto->inputs().size())); + return op_proto->inputs()[idx].name(); + } + + std::string GetOutputNameByIdx(size_t idx) const override { + auto& op_proto = + paddle::framework::OpInfoMap::Instance().Get(op_.Type()).proto_; + PADDLE_ENFORCE_LT( + idx, op_proto->outputs().size(), + platform::errors::OutOfRange( + "The index should be less than the size of outputs of " + "operator %s, but got index is %d and size is %d", + op_.Type(), idx, op_proto->outputs().size())); + return op_proto->outputs()[idx].name(); + } + + void ShareDim(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) override { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE_NE( + in_it, ctx_.inputs.end(), + platform::errors::NotFound("Input %s does not exist.", in)); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output %s does not exist.", out)); + PADDLE_ENFORCE_LT(i, in_it->second.size(), + platform::errors::InvalidArgument( + "The index of input dimension is out of range, " + "excepted index less than %zu, but received %zu.", + in_it->second.size(), i)); + PADDLE_ENFORCE_LT(j, out_it->second.size(), + platform::errors::InvalidArgument( + "The index of output dimension is out of range, " + "excepted index less than %zu, but received %zu.", + out_it->second.size(), j)); + + Variable* in_var = in_it->second[i]; + Variable* out_var = out_it->second[j]; + + PADDLE_ENFORCE_EQ( + in_var->Type(), out_var->Type(), + platform::errors::InvalidArgument( + "The type of input (%s) and output (%s) are inconsistent.", in, + out)); + + if (in_var->IsType()) { + auto& in_sele_rows = in_var->Get(); + auto out_sele_rows = out_var->GetMutable(); + out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims()); + out_sele_rows->set_rows(in_sele_rows.rows()); + out_sele_rows->set_height(in_sele_rows.height()); + } else if (in_var->IsType()) { + auto& in_lod_tensor = in_var->Get(); + auto* out_lod_tensor = out_var->GetMutable(); + out_lod_tensor->Resize(in_lod_tensor.dims()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Currently, the input type of ShareDim only can be LoDTensor " + "or SelectedRows.")); + } + } + + void ShareAllLoD(const std::string& in, + const std::string& out) const override { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE_NE(in_it, ctx_.inputs.end(), + platform::errors::NotFound( + "Input [%s] found error in Op [%s]", in, op_.Type())); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output [%s] found error in Op [%s]", out, + op_.Type())); + + auto& in_var_list = in_it->second; + auto& out_var_list = out_it->second; + + PADDLE_ENFORCE_EQ( + in_var_list.size(), out_var_list.size(), + platform::errors::PreconditionNotMet( 
+ "Op [%s]: Input var size should be equal with output var size", + op_.Type())); + + auto& out_var_names = op_.Outputs(out); + + for (size_t i = 0; i < in_var_list.size(); ++i) { + if (out_var_names[i] == framework::kEmptyVarName) { + continue; + } + + Variable* in_var = in_var_list[i]; + if (!in_var->IsType()) return; + Variable* out_var = out_var_list[i]; + PADDLE_ENFORCE_EQ(out_var->IsType(), true, + platform::errors::PreconditionNotMet( + "The %d-th output of Output(%s) must be LoDTensor.", + i, out_var_names[i])); + auto& in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); +#ifdef PADDLE_WITH_MKLDNN + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); + } + } + + void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, + size_t j = 0) const override { + auto in_it = ctx_.inputs.find(in); + auto out_it = ctx_.outputs.find(out); + PADDLE_ENFORCE_NE( + in_it, ctx_.inputs.end(), + platform::errors::NotFound("Input %s does not exist.", in)); + PADDLE_ENFORCE_NE( + out_it, ctx_.outputs.end(), + platform::errors::NotFound("Output %s does not exist.", out)); + PADDLE_ENFORCE_LT(i, in_it->second.size(), + platform::errors::InvalidArgument( + "The index of input dimension is out of range, " + "excepted index less than %zu, but received %zu.", + in_it->second.size(), i)); + PADDLE_ENFORCE_LT(j, out_it->second.size(), + platform::errors::InvalidArgument( + "The index of output dimension is out of range, " + "excepted index less than %zu, but received %zu.", + out_it->second.size(), j)); + + Variable* in_var = in_it->second.at(i); + if (!in_var->IsType()) return; + Variable* out_var = out_it->second.at(j); + PADDLE_ENFORCE_EQ( + out_var->IsType(), true, + platform::errors::InvalidArgument( + "The %zu-th output of Output(%s) must be LoDTensor.", j, out)); + auto& in_tensor = in_var->Get(); + auto* out_tensor = out_var->GetMutable(); + out_tensor->set_lod(in_tensor.lod()); + +// TODO(dzhwinter) : reuse ShareLoD in most operators. +// Need to call ShareLayout explicitly in sequence related ops. +// Shall we have a better method to shared info between in/out Tensor? +#ifdef PADDLE_WITH_MKLDNN + // Fix me: ugly workaround below + // Correct solution: + // set_layout() should NOT be called here (i.e. ShareLoD). Instead, + // layout of output tensor should be set "manually" in Compute() + // of each OPKernel. The reason layout should NOT be shared between + // input and output "automatically" (now by InferShape()->ShareLoD()) + // is that layout transform may occur after InferShape(). + // Workaround: + // Skip set_layout() when input layout is kMKLDNN + // This is to avoid kMKLDNN is populated wrongly into a non-MKLDNN + // OPKernel. In all MKLDNN OPkernel, set_layout(kMKLDNN) should be called + // in Compute() + if (in_tensor.layout() != DataLayout::kMKLDNN) +#endif + out_tensor->set_layout(in_tensor.layout()); + } + + int32_t GetLoDLevel(const std::string& in, size_t i = 0) const override { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GetLoDLevel is only used in compile time. The calculation of " + "output's actual lod is different among operators so that should be " + "set in the runtime kernel.")); + } + + void SetLoDLevel(const std::string& out, int32_t lod_level, + size_t j = 0) const override { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "SetLoDLevel is only used in compile time. 
The calculation of " + "output's actual lod is different among operators so that should be " + "set in the runtime kernel.")); + } + + bool IsRuntime() const override { return true; } + + // TODO(paddle-dev): Can this be template? + std::vector GetInputVarPtrs( + const std::string& name) override { + const std::vector& vars = InputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + std::vector GetOutputVarPtrs( + const std::string& name) override { + const std::vector& vars = OutputVars(name); + std::vector res; + res.reserve(vars.size()); + res.insert(res.begin(), vars.begin(), vars.end()); + return res; + } + + DDim GetInputDim(const std::string& name) const override { + const std::vector& vars = InputVars(name); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument( + "Input(%s) should hold one element, but now it holds %zu elements.", + name, vars.size())); + return this->GetDim(vars[0]); + } + + std::vector GetInputsDim(const std::string& name) const override { + const std::vector& vars = InputVars(name); + return GetDims(vars); + } + + std::vector GetInputsVarType( + const std::string& name) const override { + return GetVarTypes(InputVars(name)); + } + + std::vector GetOutputsVarType( + const std::string& name) const override { + return GetVarTypes(OutputVars(name)); + } + + void SetOutputDim(const std::string& name, const DDim& dim) override { + auto& vars = OutputVars(name); + PADDLE_ENFORCE_EQ( + vars.size(), 1UL, + platform::errors::InvalidArgument("Output(%s) should hold one element, " + "but now it holds %zu elements.", + name, vars.size())); + SetDim(vars[0], dim); + } + + void SetOutputsDim(const std::string& name, + const std::vector& dims) override { + auto& vars = OutputVars(name); + SetDims(vars, dims); + } + + protected: + DDim GetDim(Variable* var) const { + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::InvalidArgument("Input variable is nullptr.")); + if (var->IsType()) { + return var->Get().dims(); + } else if (var->IsType()) { + return var->Get().GetCompleteDims(); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Only LoDTensor or SelectedRows support 'GetDim', but input " + "Variable's type is %s.", + ToTypeName(var->Type()))); + } + } + + std::vector GetDims(const std::vector& vars) const { + std::vector ret; + ret.reserve(vars.size()); + std::transform(vars.begin(), vars.end(), std::back_inserter(ret), + [this](Variable* var) { return this->GetDim(var); }); + return ret; + } + + std::vector GetRepeatedDims(const std::string& name) const override { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "GetRepeatedDims method only ban be used in compile time.")); + } + + void SetDim(Variable* var, const DDim& dim) { + if (var->IsType()) { + var->GetMutable()->Resize(dim); + } else if (var->IsType()) { + var->GetMutable()->set_height(dim[0]); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Variable type error, expect LoDTensor or SelectedRows, but received " + "(%s).", + ToTypeName(var->Type()))); + } + } + + void SetDims(const std::vector& vars, + const std::vector& dims) { + size_t length = vars.size(); + PADDLE_ENFORCE_EQ(length, dims.size(), + platform::errors::InvalidArgument( + "The number of input variables do not match the " + "number of input dimensions, the number of variables " + "is %zu, the number of dimensions is %zu.", + length, dims.size())); + for (size_t i = 0; i < length; ++i) { + if (vars[i] == nullptr) { + 
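+ // an unbound (nullptr) output slot has no dims to set; skip it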
continue; + } + SetDim(vars[i], dims[i]); + } + } + + void SetRepeatedDims(const std::string& name, + const std::vector& dims) override { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "SetRepeatedDims method only can be used in compile time.")); + } + + std::vector GetVarTypes( + const std::vector& vars) const { + std::vector retv; + retv.resize(vars.size()); + std::transform(vars.begin(), vars.end(), retv.begin(), + std::bind(std::mem_fn(&RuntimeInferShapeContext::GetVarType), + this, std::placeholders::_1)); + return retv; + } + + proto::VarType::Type GetVarType(Variable* var) const { + return ToVarType(var->Type()); + } + + private: + const std::vector& InputVars(const std::string& name) const { + auto it = ctx_.inputs.find(name); + PADDLE_ENFORCE_NE( + it, ctx_.inputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the input (%s).", op_.Type(), name)); + return it->second; + } + + const std::vector& OutputVars(const std::string& name) const { + auto it = ctx_.outputs.find(name); + PADDLE_ENFORCE_NE( + it, ctx_.outputs.end(), + platform::errors::NotFound( + "Operator (%s) does not have the outputs (%s).", op_.Type(), name)); + return it->second; + } + + const OperatorBase& op_; + const RuntimeContext& ctx_; +}; +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 0f7012940d76b0..6a9f5577705335 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1254,9 +1254,10 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, } #endif #ifdef PADDLE_WITH_XPU - if (kernel_iter == kernels.end() && - is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) { + if ((kernel_iter == kernels.end() && + is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(type_, expected_kernel_key)) || + paddle::platform::is_in_xpu_black_list(type_)) { VLOG(3) << "missing XPU kernel: " << type_ << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 619d31c4f5b257..93f2fd38a73064 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -131,9 +131,10 @@ PreparedOp PrepareImpl(const NameVarMap& ins, auto& kernels = kernels_iter->second; auto kernel_iter = kernels.find(expected_kernel_key); #ifdef PADDLE_WITH_XPU - if (kernel_iter == kernels.end() && - is_xpu_place(expected_kernel_key.place_) && - !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) { + if ((kernel_iter == kernels.end() && + is_xpu_place(expected_kernel_key.place_) && + !paddle::platform::is_xpu_support_op(op.Type(), expected_kernel_key)) || + paddle::platform::is_in_xpu_black_list(op.Type())) { VLOG(3) << "missing XPU kernel: " << op.Type() << ", expected_kernel_key:" << expected_kernel_key << ", fallbacking to CPU one!"; diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 3d97d68b5c7dfd..9dc9c4d90acaba 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -30,6 +30,8 @@ DECLARE_string(tracer_mkldnn_ops_off); namespace paddle { namespace imperative { +thread_local bool Tracer::has_grad_ = true; + static std::shared_ptr g_current_tracer(nullptr); const std::shared_ptr& GetCurrentTracer() { return g_current_tracer; } diff --git 
a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 8f50550878262f..b734ae5c499369 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -118,9 +118,9 @@ class Tracer { bool enable_program_desc_tracing_{false}; std::unique_ptr generator_; platform::Place expected_place_; - bool has_grad_{true}; bool enable_autocast_{false}; GarbageCollectorMap gcs_; + static thread_local bool has_grad_; }; // To access static variable current_tracer diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index c002c7a10cb7b3..6567c41ee1fedc 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -39,6 +39,7 @@ get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) # Adapt to custom op mechanism: Include the header files related to the data type # to avoid exposing the path of the underlying file include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) +include_directories(${PADDLE_SOURCE_DIR}/paddle/utils) add_subdirectory(api) diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor.cc b/paddle/fluid/inference/api/details/zero_copy_tensor.cc index 5ed6691ebb8673..b117a21dea3e65 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor.cc @@ -65,10 +65,13 @@ T *Tensor::mutable_data(PlaceType place) { case static_cast(PlaceType::kXPU): { return tensor->mutable_data(paddle::platform::XPUPlace(device_)); } + case static_cast(PlaceType::kNPU): { + return tensor->mutable_data(paddle::platform::NPUPlace(device_)); + } default: PADDLE_THROW(paddle::platform::errors::Unavailable( - "Only CPU / CUDA / XPU places is supported. The place `%d` is not " - "supported.", + "Only CPU / CUDA / XPU / NPU places is supported. 
The place `%d` is " + "not supported.", static_cast(place))); break; } @@ -86,6 +89,8 @@ T *Tensor::data(PlaceType *place, int *size) const { *place = PlaceType::kGPU; } else if (paddle::platform::is_xpu_place(tensor->place())) { *place = PlaceType::kXPU; + } else if (paddle::platform::is_npu_place(tensor->place())) { + *place = PlaceType::kNPU; } else { *place = PlaceType::kUNK; } diff --git a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc index 7e709924e91f93..0c092a8684d1ad 100644 --- a/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc +++ b/paddle/fluid/inference/api/details/zero_copy_tensor_test.cc @@ -133,6 +133,14 @@ TEST(Tensor, FillRandomDataAndCheck) { ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kGPU)); ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kGPU)); #endif +#ifdef PADDLE_WITH_ASCEND_CL + ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kNPU)); + ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kNPU)); +#endif +#ifdef PADDLE_WITH_XPU + ASSERT_TRUE(FillRandomDataAndCheck(PlaceType::kXPU)); + ASSERT_TRUE(SetPlaceAndCheck(PlaceType::kXPU)); +#endif } } // namespace paddle_infer diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index d9e87122ac258c..dbaaf2bdc7c098 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -89,7 +89,6 @@ void TensorRTEngine::FreezeNetwork() { if (enable_int8) { infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kFP16); infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kINT8); - infer_builder_config_->setFlag(nvinfer1::BuilderFlag::kSTRICT_TYPES); if (calibrator_) { infer_builder_config_->setInt8Calibrator(calibrator_); diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 38c453bde6d2db..3604a47a7eb90b 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -32,6 +32,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/tensorrt/trt_int8_calibrator.h" #include "paddle/fluid/inference/utils/singleton.h" +#include "paddle/utils/any.h" namespace paddle { namespace framework { @@ -425,8 +426,8 @@ class TensorRTEngine { platform::errors::InvalidArgument( "Attribute %s not found in trt engine.", attr_name)); try { - return *boost::any_cast(attrs_.at(attr_name)); - } catch (boost::bad_any_cast&) { + return *paddle::any_cast(attrs_.at(attr_name)); + } catch (paddle::bad_any_cast&) { auto TypeToString = [](const std::type_info& info) -> std::string { if (std::type_index(info) == std::type_index(typeid(bool*))) { return "bool"; @@ -504,7 +505,7 @@ class TensorRTEngine { infer_ptr ihost_memory_; std::unordered_map quant_dynamic_range_; - std::unordered_map attrs_; + std::unordered_map attrs_; std::unordered_map> attr_dels_; // For dynamic shape diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 2829a740236d27..bfe3dfc85eecdd 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -703,8 +703,9 @@ bool OpTeller::Tell(const framework::ir::Node* node, bool use_no_calib_int8, return false; } // Paddle-TRT does not support the input tensors: Shape and ShapeTensor - if (desc.Input("Shape").size() >= 1 || - desc.Input("ShapeTensor").size() >= 1) { + auto reshape_inputs = desc.Inputs(); + if (reshape_inputs.find("Shape") != reshape_inputs.end() || + reshape_inputs.find("ShapeTensor") != reshape_inputs.end()) { return false; } std::vector shape = diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc index 507a8589d94ddd..9178825efa9e1d 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.cc +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.cc @@ -39,6 +39,7 @@ void NPUPinnedAllocator::ProcessEventsAndFree() { } Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { + std::lock_guard lock(mtx_); ProcessEventsAndFree(); void *ptr; int error = posix_memalign(&ptr, kAlignment, size); @@ -50,6 +51,7 @@ Allocation *NPUPinnedAllocator::AllocateImpl(size_t size) { } void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { + std::lock_guard lock(mtx_); void *ptr = allocation->ptr(); auto iter = npu_events_.find(allocation); aclrtEvent event = iter->second; @@ -65,11 +67,14 @@ void NPUPinnedAllocator::FreeImpl(Allocation *allocation) { } uint64_t NPUPinnedAllocator::ReleaseImpl(const platform::Place &place) { + std::lock_guard lock(mtx_); + // Empty implementation return static_cast(0); } void NPUPinnedAllocator::RecordEvent(Allocation *allocation, aclrtStream stream) { + std::lock_guard lock(mtx_); aclrtEvent event = nullptr; PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateEvent(&event)); PADDLE_ENFORCE_NPU_SUCCESS(aclrtRecordEvent(event, stream)); diff --git a/paddle/fluid/memory/allocation/npu_pinned_allocator.h b/paddle/fluid/memory/allocation/npu_pinned_allocator.h index 4c856b931ee2cf..b330b6e352ce42 100644 --- a/paddle/fluid/memory/allocation/npu_pinned_allocator.h +++ b/paddle/fluid/memory/allocation/npu_pinned_allocator.h @@ -42,6 +42,7 @@ class NPUPinnedAllocator : public Allocator { private: std::unordered_map npu_events_; + mutable std::mutex mtx_; }; } // namespace allocation diff --git a/paddle/fluid/operators/activation_op_npu.cc b/paddle/fluid/operators/activation_op_npu.cc index 1ccd99c71f339a..02ce817bcc8b2b 100644 --- 
a/paddle/fluid/operators/activation_op_npu.cc +++ b/paddle/fluid/operators/activation_op_npu.cc @@ -144,6 +144,47 @@ class ReluGradNPUKernel : public framework::OpKernel { } }; +template +class Relu6NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Relu6", + { + *x, + }, + {*out}, {}); + + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class Relu6GradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* out = ctx.Input("Out"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto stream = + ctx.template device_context() + .stream(); + + dx->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("Relu6Grad", {*dout, *out}, {*dx}, {}); + + runner.Run(stream); + } +}; + template class SqrtNPUKernel : public framework::OpKernel { public: @@ -431,6 +472,94 @@ class ReciprocalGradNPUKernel : public framework::OpKernel { } }; +template +class CosNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + out->mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + + const auto& runner = NpuOpRunner("Cos", {*x}, {*out}, {}); + runner.Run(stream); + } +}; + +template +class CosGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); + + auto place = ctx.GetPlace(); + dx->mutable_data(place); + + Tensor sin_out(x->type()); // Temporary Tensor + sin_out.Resize(x->dims()); + sin_out.mutable_data(place); + + auto stream = + ctx.template device_context() + .stream(); + const auto& runner = NpuOpRunner("Sin", {*x}, {sin_out}, {}); + runner.Run(stream); + + const auto& runner_dx = NpuOpRunner("Mul", {*dout, sin_out}, {*dx}, {}); + runner_dx.Run(stream); + + Tensor tmp(x->type()); // Temporary Tensor + tmp.Resize(framework::make_ddim({1, 1})); + tmp.mutable_data(place); + float factor = -1.; + FillNpuTensorWithConstant(&tmp, static_cast(factor)); + + const auto& runner_dx_ = NpuOpRunner("Xdivy", {*dx, tmp}, {*dx}, {}); + runner_dx_.Run(stream); + // dx = -dout * Sine(x); + } +}; + +template +class AtanNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto place = ctx.GetPlace(); + out->mutable_data(place); + const auto& runner = NpuOpRunner("Atan", {*x}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +template +class AtanGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* x = ctx.Input("X"); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto place = ctx.GetPlace(); + dx->mutable_data(place); + auto stream = + ctx.template device_context() + .stream(); + const auto& 
runner_dx = NpuOpRunner("AtanGrad", {*x, *dout}, {*dx}, {}); + runner_dx.Run(stream); + } +}; + } // namespace operators } // namespace paddle @@ -457,6 +586,17 @@ REGISTER_OP_NPU_KERNEL( ops::ReluGradNPUKernel); +REGISTER_OP_NPU_KERNEL( + relu6, ops::Relu6NPUKernel, + ops::Relu6NPUKernel); + +REGISTER_OP_NPU_KERNEL( + relu6_grad, + ops::Relu6GradNPUKernel, + ops::Relu6GradNPUKernel); + REGISTER_OP_NPU_KERNEL( sqrt, ops::SqrtNPUKernel, ops::SqrtNPUKernel, ops::ReciprocalGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + cos, ops::CosNPUKernel, + ops::CosNPUKernel); + +REGISTER_OP_NPU_KERNEL( + cos_grad, ops::CosGradNPUKernel, + ops::CosGradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + atan, ops::AtanNPUKernel, + ops::AtanNPUKernel); + +REGISTER_OP_NPU_KERNEL( + atan_grad, + ops::AtanGradNPUKernel, + ops::AtanGradNPUKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index b2cffc3f9063c1..be17bf9a03fc19 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -295,8 +295,7 @@ class BatchNormKernel bool global_stats = test_mode || use_global_stats; const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + DataLayout data_layout = framework::StringToDataLayout(data_layout_str); const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); @@ -332,6 +331,12 @@ class BatchNormKernel saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); + // input dimension is 2 and the format is NCHW. The input can be regarded + // as NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + if (!global_stats) { // saved_xx is use just in this batch of data EigenVectorArrayMap saved_mean_e( @@ -578,8 +583,7 @@ class BatchNormGradKernel bool use_global_stats = ctx.Attr("use_global_stats"); const bool is_test = ctx.Attr("is_test"); const float epsilon = ctx.Attr("epsilon"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); + DataLayout data_layout = framework::StringToDataLayout(data_layout_str); auto *d_x = ctx.Output(framework::GradVarName("X")); auto *d_scale = ctx.Output(framework::GradVarName("Scale")); @@ -633,6 +637,12 @@ class BatchNormGradKernel : x_dims[x_dims.size() - 1]); const int sample_size = x->numel() / N / C; + // input dimension is 2 and the format is NCHW. 
The input can be regarded as + // NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + // init output if (d_x) { d_x->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index 3f210219608fb7..bd88c8f9cd2b40 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -59,6 +59,8 @@ if(WITH_ASCEND_CL) DEPS send_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(recv_v2_op_npu_test SRCS recv_v2_op_npu_test.cc DEPS recv_v2_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) + cc_test(checknumeric SRCS checknumeric_npu_test.cc + DEPS c_allreduce_sum_op ${COLLECTIVE_DEPS} ${COMMON_TEST_DEPS_FOR_HCOM}) cc_test(c_sync_comm_stream_op_npu_test SRCS c_sync_comm_stream_op_npu_test.cc DEPS op_registry c_broadcast_op c_comm_init_hccl_op c_sync_comm_stream_op c_gen_hccl_id_op gen_hccl_id_op_helper ${COLLECTIVE_DEPS} ascend_hccl dynamic_loader dynload_warpctc scope device_context enforce executor) cc_test(c_sync_calc_stream_op_npu_test SRCS c_sync_calc_stream_op_npu_test.cc diff --git a/paddle/fluid/operators/collective/c_allreduce_op.h b/paddle/fluid/operators/collective/c_allreduce_op.h index 3c51c65b073904..1076e84e613f4a 100644 --- a/paddle/fluid/operators/collective/c_allreduce_op.h +++ b/paddle/fluid/operators/collective/c_allreduce_op.h @@ -121,35 +121,44 @@ class CAllReduceOpCPUKernel : public framework::OpKernel { }; #if defined(PADDLE_WITH_ASCEND_CL) -// return true if found_inf_or_nan or return false; -template -bool CheckNumerics(const framework::ExecutionContext& exe_ctx, - aclrtStream stream, const paddle::framework::Tensor* in) { - auto& dev_ctx = - exe_ctx.template device_context(); +// return true if found_nan or return false; +inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx, + aclrtStream stream, + const paddle::framework::Tensor* in) { using Tensor = paddle::framework::Tensor; Tensor out(in->type()); - out.Resize(in->dims()); - out.mutable_data(dev_ctx.GetPlace()); - bool found_inf_data = false; + Tensor mean(in->type()); + mean.Resize({1}); + mean.mutable_data(dev_ctx.GetPlace()); + std::vector axes; + for (int i = 0; i < in->dims().size(); ++i) { + axes.push_back(i); + } + std::vector vec; try { - const auto& runner = - NpuOpRunner("CheckNumerics", {*in}, {out}, - {{"message", std::string("check_numberics")}}); - runner.Run(stream); - dev_ctx.Wait(); - } catch (platform::EnforceNotMet& exception) { - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; + const auto& runner_mean = paddle::operators::NpuOpRunner( + "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}}); + TensorToVector(mean, dev_ctx, &vec); } catch (...) 
{ - LOG(WARNING) << "[check_nan_and_inf] detected contains NaN or INF!!!"; - found_inf_data = true; + LOG(WARNING) << "ContainsNan catch exception"; + return true; + } + + VLOG(4) << "reducemeand result:" << vec[0]; + if (std::isnan(static_cast(vec[0]))) { + LOG(WARNING) << "ContainsNan detects nan"; + return true; + } + + if (std::isinf(static_cast(vec[0]))) { + LOG(WARNING) << "ContainsNan detects inf"; } - return found_inf_data; + return false; } + #endif template @@ -216,22 +225,24 @@ class CAllReduceOpASCENDKernel : public framework::OpKernel { framework::Tensor tmp; tmp.mutable_data({8}, ctx.GetPlace()); - bool check_numerics = false; + bool found_nan = false; auto d_type = in->type(); switch (d_type) { - case framework::proto::VarType::FP16: + case framework::proto::VarType::FP16: { + break; + } case framework::proto::VarType::FP32: { VLOG(4) << "prepare to FoundNanInf"; - check_numerics = CheckNumerics(ctx, dev_ctx->stream(), in); - VLOG(4) << "check_numerics:" << check_numerics; + found_nan = ContainsNan(*dev_ctx, dev_ctx->stream(), in); + VLOG(4) << "check_numerics:" << found_nan; break; } default: break; } - if (check_numerics) { + if (found_nan) { T inf = static_cast(std::numeric_limits::infinity()); VLOG(4) << "fill input data constant inf"; auto dims = in->dims(); diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc index f1bf9683e35593..ecf9f18d40f86d 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc @@ -38,6 +38,11 @@ limitations under the License. */ #include "paddle/fluid/platform/hccl_helper.h" #endif +// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1 +// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test +// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0 +// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test + namespace f = paddle::framework; namespace p = paddle::platform; namespace m = paddle::operators::math; @@ -52,10 +57,11 @@ DECLARE_string(selected_npus); template void PrintDebugInfo(const std::string preStr, const std::vector& data) { std::string debugstring = ""; + std::cout << preStr << ":" << std::endl << debugstring; for (auto ele : data) { - debugstring += std::to_string(ele) + std::string(","); + std::cout << ele << " "; } - VLOG(3) << preStr << ":" << std::endl << debugstring; + std::cout << std::endl; } void PrepareUniqueId(f::Scope* scope, const p::DeviceContext& ctx, @@ -120,6 +126,7 @@ void Prepare(f::Scope* scope, const p::DeviceContext& ctx, ctx.Wait(); } +template void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) { // init @@ -130,10 +137,11 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int num1 = 3; int num2 = 128; - std::vector init; + std::vector init; for (int64_t i = 0; i < num1 * num2; ++i) { - init.push_back(1.0 + rank_id); + init.push_back(static_cast(1.0 + rank_id)); } + init[0] = static_cast(std::numeric_limits::quiet_NaN()); PrintDebugInfo("input data", init); auto place = ctx.GetPlace(); @@ -145,31 +153,33 @@ void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx, auto out = scope->Var("OutData"); auto tensor_out = out->GetMutable(); tensor_out->Resize({num1, num2}); - tensor_out->mutable_data(place); // allocate + tensor_out->mutable_data(place); // allocate ctx.Wait(); // 
run f::AttributeMap attrs; attrs["tag"] = std::string("tagx_" + std::to_string(iter)); attrs["ring_id"] = 0; + attrs["use_calc_stream"] = 1; auto op = f::OpRegistry::CreateOp("c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs); - - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 1; i++) { op->Run(*scope, place); } ctx.Wait(); - std::vector out_vec; + std::vector out_vec; TensorToVector(*tensor_out, ctx, &out_vec); ctx.Wait(); PrintDebugInfo("output data", out_vec); + float diff = static_cast(out_vec[0]) - 65504; + EXPECT_TRUE(diff < 0.1 && diff > -0.1); EXPECT_EQ(out_vec.size(), init.size()); - for (uint32_t i = 0; i < out_vec.size(); i++) { - EXPECT_EQ(out_vec[i], 3.0); + for (uint32_t i = 1; i < 10; i++) { + EXPECT_EQ(out_vec[i], static_cast(3.0)); } } @@ -182,8 +192,7 @@ TEST(c_allreduce_sum, NPU) { // only support one device, if more than one device, use first default PrepareUniqueId(&scope, ctx, &hccl_id); Prepare(&scope, ctx, &hccl_id); - for (int i = 0; i < 1; i++) { - VLOG(2) << "iter num: " << i; - TestHCCLAllReduceOp(&scope, ctx, i); - } + + TestHCCLAllReduceOp(&scope, ctx, 1); + // TestHCCLAllReduceOp(&scope, ctx, 0); } diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc new file mode 100644 index 00000000000000..804e8c2a2cbe0c --- /dev/null +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/operators/dropout_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/string/printf.h" + +#include "paddle/fluid/operators/collective/c_allreduce_op.h" +#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" + +#if defined(PADDLE_WITH_ASCEND_CL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/hccl_helper.h" +#endif + +namespace f = paddle::framework; +namespace p = paddle::platform; +namespace m = paddle::operators::math; + +USE_OP(c_allreduce_sum); +USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU); +DECLARE_string(selected_npus); + +template +bool Check(T value, int size = 2 * 512 * 8192) { + f::Scope scope; + auto x = scope.Var("in"); + auto& ctx = *dynamic_cast( + p::DeviceContextPool::Instance().Get(p::NPUPlace(0))); + auto place = ctx.GetPlace(); + + auto tensor_x = x->GetMutable(); + tensor_x->Resize({size}); + tensor_x->mutable_data(place); // allocate + + std::vector init; + for (int64_t i = 0; i < size; ++i) { + init.push_back(static_cast(value)); + } + + TensorFromVector(init, ctx, tensor_x); + bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x); + return result; +} + +TEST(check_numeric, NPU) { + auto inf = std::numeric_limits::infinity(); + auto fp16_inf = static_cast(inf); + auto nan = NAN; + auto fp16_nan = static_cast(nan); + + bool result = false; + // Normal + VLOG(0) << "start normal"; + result = Check(static_cast(65546)); + ASSERT_FALSE(result); + Check(static_cast(1.0)); + ASSERT_FALSE(result); + + // Inf + VLOG(0) << "start inf"; + result = Check(fp16_inf); + ASSERT_FALSE(result); + result = Check(inf); + ASSERT_FALSE(result); + + // Nan + VLOG(0) << "start nan"; + result = Check(fp16_nan); + ASSERT_TRUE(result); + result = Check(nan); + ASSERT_TRUE(result); +} diff --git a/paddle/fluid/operators/controlflow/compare_op_npu.cc b/paddle/fluid/operators/controlflow/compare_op_npu.cc index b1d4d1e7022a32..235d44b92f9195 100644 --- a/paddle/fluid/operators/controlflow/compare_op_npu.cc +++ b/paddle/fluid/operators/controlflow/compare_op_npu.cc @@ -11,21 +11,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/controlflow/compare_op.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/npu_op_runner.h" -#ifdef PADDLE_WITH_ASCEND_CL namespace paddle { namespace operators { -template +template class EqualNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -42,16 +36,33 @@ class EqualNPUKernel : public framework::OpKernel { } }; +template +class NotEqualNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("NotEqual", {*x, *y}, {*out}, {}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + template class LessThanNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - // int axis = context.Attr("axis"); - z->mutable_data(ctx.GetPlace()); // allocate - const auto& runner = NpuOpRunner("Less", {*x, *y}, {*z}); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Less", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -65,9 +76,10 @@ class LessEqualNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("LessEqual", {*x, *y}, {*z}); + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("LessEqual", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -81,10 +93,10 @@ class GreaterThanNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + auto* out = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Greater", {*x, *y}, {*z}); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("Greater", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -98,10 +110,10 @@ class GreaterEqualNPUKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + auto* out = ctx.Output("Out"); - z->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("GreaterEqual", {*x, *y}, {*z}); + out->mutable_data(ctx.GetPlace()); + const auto& runner = NpuOpRunner("GreaterEqual", {*x, *y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -115,32 +127,64 @@ class GreaterEqualNPUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_NPU_KERNEL(equal, ops::EqualNPUKernel, - ops::EqualNPUKernel, - ops::EqualNPUKernel); +REGISTER_OP_NPU_KERNEL( + equal, 
ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel, + ops::EqualNPUKernel); + +REGISTER_OP_NPU_KERNEL( + not_equal, ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel, + ops::NotEqualNPUKernel); REGISTER_OP_NPU_KERNEL( - less_than, - ops::LessThanNPUKernel, - ops::LessThanNPUKernel); + less_than, ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel, + ops::LessThanNPUKernel); REGISTER_OP_NPU_KERNEL( - less_equal, - ops::LessEqualNPUKernel, - ops::LessEqualNPUKernel); + less_equal, ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel, + ops::LessEqualNPUKernel); REGISTER_OP_NPU_KERNEL( greater_than, - ops::GreaterThanNPUKernel, - ops::GreaterThanNPUKernel); + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel, + ops::GreaterThanNPUKernel); REGISTER_OP_NPU_KERNEL( greater_equal, - ops::GreaterEqualNPUKernel, - ops::GreaterEqualNPUKernel); - -#endif + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel, + ops::GreaterEqualNPUKernel); diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.h b/paddle/fluid/operators/elementwise/elementwise_add_op.h index a469ebbaec2edc..ad9066540c23bf 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_add_op.h @@ -17,7 +17,6 @@ limitations under the License. */ #include #include "paddle/fluid/operators/elementwise/elementwise_op.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.cu.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc index 72d7e318d7b052..1ba6c4cb1932b1 100644 --- a/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc +++ b/paddle/fluid/operators/elementwise/elementwise_add_op_npu.cc @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_npu.h" #include "paddle/fluid/operators/npu_op_runner.h" namespace paddle { @@ -27,12 +28,37 @@ template class ElementwiseAddNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); auto* x = ctx.Input("X"); auto* y = ctx.Input("Y"); auto* out = ctx.Output("Out"); out->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("Add", {*x, *y}, {*out}, {}); + int axis = ctx.Attr("axis"); + + bool direct_compute = false; + auto x_dims = x->dims(); + auto y_dims = y->dims(); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + if (x_dims.size() >= y_dims.size()) { + direct_compute = + y_dims == framework::slice_ddim(x_dims, axis, x_dims.size()); + } else { + direct_compute = + x_dims == framework::slice_ddim(y_dims, axis, y_dims.size()); + } + + Tensor transformed_x, transformed_y; + if (direct_compute) { + transformed_x.ShareDataWith(*x); + transformed_y.ShareDataWith(*y); + } else { + NpuElementWiseOpBroadcast(dev_ctx, x, y, axis, &transformed_x, + &transformed_y); + } + const auto& runner = + NpuOpRunner("Add", {transformed_x, transformed_y}, {*out}, {}); auto stream = ctx.template device_context() .stream(); @@ -44,109 +70,75 @@ template class ElementwiseAddGradNPUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("X")); - auto* dy = ctx.Output(framework::GradVarName("Y")); - - auto stream = - ctx.template device_context() - .stream(); - - // NOTE(zhiqiu): It seems Ascend Sub follow the broadcast sematics with - // default axis=-1? - // So, the sub_grad should do reduce if needed. - // For example, the shape of each variable in elementwise_sub: - // x, dx: [2, 3, 5] - // y, dy: [1, 5] - // out, dout: [2, 3, 5] - // Then, out = x - y => dx = dout, dy = -dout - // And, the shape of dy can be computed by two stages reduce, - // 1. [2, 3, 5] => [3, 5], ReduceSumD on axis = 0, keep_dims = false. - // 2. [3, 5] => [1, 5], ReduceSumD on axis = 0, keep_dims = true. - + auto& dev_ctx = + ctx.template device_context(); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int axis = ctx.Attr("axis"); + + axis = (axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis); + auto stream = dev_ctx.stream(); if (dx) { dx->mutable_data(ctx.GetPlace()); - // For dx - // stage 1 - auto reduce_ndim = dout->dims().size() - dx->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - Tensor* tmp_dout = const_cast(dout); - Tensor reduced_dout(dx->type()); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); + if (dx->dims() != dout->dims()) { + std::vector dst_dims_vec; + std::vector reduce_axes; + auto src_dims = dx->dims(); + auto dout_dims = dout->dims(); + + int src_axis = (src_dims.size() < dout_dims.size() ? 
axis : 0); + for (int ax = 0; ax < dout_dims.size(); ++ax) { + if ((ax < src_axis || ax >= src_axis + src_dims.size()) || + (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { + reduce_axes.push_back(ax); + } else { + dst_dims_vec.push_back(dout_dims[ax]); + } } - reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - for (auto i = 0; i < dx->dims().size(); ++i) { - if (dx->dims()[i] == 1) { - axes.push_back(i); + if (!reduce_axes.empty()) { + Tensor tmp; + tmp.ShareDataWith(*dx); + tmp.Resize(framework::make_ddim(dst_dims_vec)); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {tmp}, + {{"axes", reduce_axes}, {"keep_dims", false}}); + runner.Run(stream); } - } - if (axes.size() != 0) { - const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dx}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); } else { - framework::TensorCopy( - *tmp_dout, ctx.GetPlace(), - ctx.template device_context(), dx); + framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dx); } } - if (dy) { - // For dy - // stage 1 - auto reduce_ndim = dout->dims().size() - dy->dims().size(); - std::vector axes; - for (auto i = 0; i < reduce_ndim; ++i) { - axes.push_back(i); - } - Tensor* tmp_dout = const_cast(dout); - Tensor reduced_dout(dout->type()); - if (axes.size() != 0) { - std::vector reduced_dout_dims; - for (auto i = reduce_ndim; i < dout->dims().size(); ++i) { - reduced_dout_dims.push_back(dout->dims()[i]); + dy->mutable_data(ctx.GetPlace()); + if (dy->dims() != dout->dims()) { + std::vector dst_dims_vec; + std::vector reduce_axes; + auto src_dims = dy->dims(); + auto dout_dims = dout->dims(); + + int src_axis = (src_dims.size() < dout_dims.size() ? 
axis : 0); + for (int ax = 0; ax < dout_dims.size(); ++ax) { + if ((ax < src_axis || ax >= src_axis + src_dims.size()) || + (dout_dims[ax] > 1 && src_dims[ax - src_axis] == 1)) { + reduce_axes.push_back(ax); + } else { + dst_dims_vec.push_back(dout_dims[ax]); + } } - reduced_dout.Resize(framework::make_ddim(reduced_dout_dims)); - reduced_dout.mutable_data(ctx.GetPlace()); - const auto& runner = - NpuOpRunner("ReduceSumD", {*dout}, {reduced_dout}, - {{"axes", axes}, {"keep_dims", false}}); - runner.Run(stream); - tmp_dout = &reduced_dout; - } - - // stage 2 - axes.clear(); - for (auto i = 0; i < dy->dims().size(); ++i) { - if (dy->dims()[i] == 1) { - axes.push_back(i); + if (!reduce_axes.empty()) { + Tensor tmp; + tmp.ShareDataWith(*dy); + tmp.Resize(framework::make_ddim(dst_dims_vec)); + const auto& runner = + NpuOpRunner("ReduceSumD", {*dout}, {tmp}, + {{"axes", reduce_axes}, {"keep_dims", false}}); + runner.Run(stream); } - } - if (axes.size() != 0) { - dy->mutable_data(ctx.GetPlace()); - const auto& runner = NpuOpRunner("ReduceSumD", {*tmp_dout}, {*dy}, - {{"axes", axes}, {"keep_dims", true}}); - runner.Run(stream); } else { - framework::TensorCopy( - *tmp_dout, ctx.GetPlace(), - ctx.template device_context(), dy); + framework::TensorCopy(*dout, ctx.GetPlace(), dev_ctx, dy); } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_npu.h b/paddle/fluid/operators/elementwise/elementwise_npu.h new file mode 100644 index 00000000000000..5ee1ebda90f44c --- /dev/null +++ b/paddle/fluid/operators/elementwise/elementwise_npu.h @@ -0,0 +1,135 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/elementwise/elementwise_op.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +void NpuBroadcast(const platform::NPUDeviceContext& dev_ctx, const Tensor* src, + int axis, const framework::DDim& dst_dims, + Tensor* transformed_src) { + auto stream = dev_ctx.stream(); + + // 1. 
expand the axis with dim 1 + auto src_dims = src->dims(); + Tensor tmp_src; + tmp_src.ShareDataWith(*src); + tmp_src.Resize(src_dims); + for (int i = 0; i < src_dims.size(); ++i) { + if (src_dims[i] == 1 && dst_dims[i + axis] > 1) { + Tensor tmp_tensor; + auto tmp_tensor_dims = tmp_src.dims(); + tmp_tensor_dims[i] = dst_dims[i + axis]; + tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); + const auto& runner = + NpuOpRunner("TileWithAxis", {tmp_src}, {tmp_tensor}, + {{"axis", static_cast(i)}, + {"tiles", static_cast(dst_dims[i + axis])}}); + runner.Run(stream); + tmp_src.ShareDataWith(tmp_tensor); + tmp_src.Resize(tmp_tensor_dims); + } + } + + // 2.expand the ahead axis + auto prev = framework::product(framework::slice_ddim(dst_dims, 0, axis)); + if (prev > 1) { + Tensor tmp_tensor; + auto tmp_tensor_dims = + framework::slice_ddim(dst_dims, 0, axis + src_dims.size()); + tmp_tensor.mutable_data(tmp_tensor_dims, dev_ctx.GetPlace()); + const auto& runner = NpuOpRunner( + "ExpandD", {tmp_src}, {tmp_tensor}, + {{"shape", framework::vectorize(tmp_tensor_dims)}}); + runner.Run(stream); + tmp_src.ShareDataWith(tmp_tensor); + tmp_src.Resize(tmp_tensor_dims); + } else { + tmp_src.Resize(framework::slice_ddim(dst_dims, 0, axis + src_dims.size())); + } + + // 3.expand the tail axis + auto post = framework::product( + framework::slice_ddim(dst_dims, axis + src_dims.size(), dst_dims.size())); + if (post > 1) { + auto src_dims_vec = framework::vectorize(tmp_src.dims()); + src_dims_vec.push_back(1); + tmp_src.Resize(framework::make_ddim(src_dims_vec)); + + Tensor tmp_tensor; + tmp_tensor.mutable_data(dst_dims, dev_ctx.GetPlace()); + const auto& runner = + NpuOpRunner("TileWithAxis", {tmp_src}, {tmp_tensor}, + {{"axis", static_cast(axis + src_dims.size())}, + {"tiles", static_cast(post)}}); + runner.Run(stream); + tmp_src.ShareDataWith(tmp_tensor); + } + tmp_src.Resize(dst_dims); + framework::TensorCopy(tmp_src, dev_ctx.GetPlace(), transformed_src); +} + +template +void NpuElementWiseOpBroadcast(const platform::NPUDeviceContext& dev_ctx, + const Tensor* x, const Tensor* y, int axis, + Tensor* transformed_x, Tensor* transformed_y) { + auto x_dims = x->dims(); + auto y_dims = y->dims(); + bool is_xsize_larger = true; + int max_dim = x_dims.size(); + std::vector dst_dims_vec = framework::vectorize(x_dims); + + if (x_dims.size() < y_dims.size()) { + is_xsize_larger = false; + max_dim = y_dims.size(); + dst_dims_vec = framework::vectorize(y_dims); + } + + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + int x_axis = is_xsize_larger ? 0 : axis; + int y_axis = is_xsize_larger ? 
axis : 0; + + PADDLE_ENFORCE_GE( + axis, 0, + platform::errors::InvalidArgument( + "Axis should be great than or equal to 0, but received axis is %d.", + axis)); + PADDLE_ENFORCE_LT(axis, max_dim, + platform::errors::InvalidArgument( + "Axis should be less than %d, but received axis is %d.", + max_dim, axis)); + + for (int i = 0; i < x_dims.size(); ++i) { + dst_dims_vec[i + x_axis] = + std::max(dst_dims_vec[i + x_axis], static_cast(x_dims[i])); + } + for (int i = 0; i < y_dims.size(); ++i) { + dst_dims_vec[i + y_axis] = + std::max(dst_dims_vec[i + y_axis], static_cast(y_dims[i])); + } + + auto dst_dims = framework::make_ddim(dst_dims_vec); + NpuBroadcast(dev_ctx, x, x_axis, dst_dims, transformed_x); + NpuBroadcast(dev_ctx, y, y_axis, dst_dims, transformed_y); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h index 541ff9aacfc462..95dc6ed342ffc3 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h @@ -163,7 +163,7 @@ struct DimensionsTransform { struct StridesCalculation { std::vector> strides; - std::vector divmoders; + std::vector divmoders; private: // To calculate the strides of each input_tensor. @@ -190,7 +190,7 @@ struct StridesCalculation { strides.resize(N, std::vector(dim_size, 1)); for (int i = 0; i < dim_size; ++i) { - divmoders[i] = FastDivMod(out_dims[i]); + divmoders[i] = platform::FastDivMod(out_dims[i]); } CalculateStrides(N, dim_size, in_dims); } @@ -198,21 +198,21 @@ struct StridesCalculation { template -struct BroadcastArgsWarpper { - using InVecType = CudaAlignedVector; - using OutVecType = CudaAlignedVector; +struct BroadcastArgsWrapper { + using InVecType = platform::CudaAlignedVector; + using OutVecType = platform::CudaAlignedVector; OutT *out_data; OutVecType *vec_out_data; const InT *__restrict__ in_data[ET]; const InVecType *__restrict__ vec_in_data[ET]; bool no_broadcast[ET]; - FastDivMod divmoders[kDims]; + platform::FastDivMod divmoders[kDims]; uint32_t strides[ET][framework::DDim::kMaxRank]; uint32_t scalar_cal_offset; Functor func; - HOSTDEVICE BroadcastArgsWarpper( + HOSTDEVICE BroadcastArgsWrapper( const std::vector &ins, framework::Tensor *out, int scalar_cal_offset, Functor func, const StridesCalculation &offset_calculator) @@ -227,7 +227,7 @@ struct BroadcastArgsWarpper { out_data = out->data(); vec_out_data = reinterpret_cast(out_data); memcpy(divmoders, offset_calculator.divmoders.data(), - kDims * sizeof(FastDivMod)); + kDims * sizeof(platform::FastDivMod)); } __device__ __forceinline__ uint32_t GetOffsetByDivmod(int idx, int in_idx) { @@ -302,30 +302,29 @@ struct BroadcastArgsWarpper { } }; -template __device__ inline void ScalarizedBroadcastKernelImpl( - BroadcastArgsWarpper broadcast_warpper, int tid) { + BroadcastArgsWrapper broadcast_wrapper, int tid) { InT args[ET]; OutT args_out; - broadcast_warpper.LoadScalarizedData(args, tid); + broadcast_wrapper.LoadScalarizedData(args, tid); -#pragma unroll(ET) - for (int j = 1; j < ET; ++j) { - args_out = broadcast_warpper.func(args); - } - broadcast_warpper.StoreScalarizedData(args_out, tid); + // Calcualtion of the in_tensor data. 
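The broadcast path above first aligns the two input shapes to a common output shape, placing the lower-rank tensor at `axis` inside the higher-rank one before TileWithAxis/ExpandD run. A minimal host-side sketch of that shape-alignment rule (standalone C++, hypothetical AlignShapes helper, no Paddle types) follows:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <vector>

// Align two shapes for elementwise broadcasting, mirroring the intent of
// NpuElementWiseOpBroadcast: the lower-rank shape starts at `axis` inside the
// higher-rank one, and every output dimension is the max of the two inputs.
std::vector<int64_t> AlignShapes(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims, int axis,
                                 int* x_axis, int* y_axis) {
  const bool x_larger = x_dims.size() >= y_dims.size();
  std::vector<int64_t> dst = x_larger ? x_dims : y_dims;
  if (axis == -1) {
    axis = std::abs(static_cast<int>(x_dims.size()) -
                    static_cast<int>(y_dims.size()));
  }
  *x_axis = x_larger ? 0 : axis;
  *y_axis = x_larger ? axis : 0;
  for (size_t i = 0; i < x_dims.size(); ++i) {
    dst[i + *x_axis] = std::max(dst[i + *x_axis], x_dims[i]);
  }
  for (size_t i = 0; i < y_dims.size(); ++i) {
    dst[i + *y_axis] = std::max(dst[i + *y_axis], y_dims[i]);
  }
  return dst;
}

int main() {
  int x_axis = 0, y_axis = 0;
  // x: [2, 3, 5], y: [3, 5], axis = -1  ->  output [2, 3, 5], y starts at 1.
  auto dst = AlignShapes({2, 3, 5}, {3, 5}, -1, &x_axis, &y_axis);
  assert((dst == std::vector<int64_t>{2, 3, 5}) && x_axis == 0 && y_axis == 1);
  return 0;
}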
+ args_out = broadcast_wrapper.func(args); + + broadcast_wrapper.StoreScalarizedData(args_out, tid); } -template __device__ inline void VectorizedBroadcastKernelImpl( - BroadcastArgsWarpper broadcast_warpper, int tid) { - using OutVecType = CudaAlignedVector; + BroadcastArgsWrapper broadcast_wrapper, int tid) { + using OutVecType = platform::CudaAlignedVector; OutVecType args_out; InT ins[ET]; InT args[ET][VecSize]; - broadcast_warpper.LoadVectorizedData(args, tid); + broadcast_wrapper.LoadVectorizedData(args, tid); #pragma unroll(VecSize) for (int i = 0; i < VecSize; ++i) { @@ -333,30 +332,30 @@ __device__ inline void VectorizedBroadcastKernelImpl( for (int j = 0; j < ET; ++j) { ins[j] = args[j][i]; } - args_out.val[i] = broadcast_warpper.func(ins); + args_out.val[i] = broadcast_wrapper.func(ins); } - broadcast_warpper.StoreVectorizedData(args_out, tid); + broadcast_wrapper.StoreVectorizedData(args_out, tid); } -template __global__ void ElementwiseBroadcastKernel( - BroadcastArgsWarpper broadcast_warpper, int main_tid, int tail_tid) { + BroadcastArgsWrapper broadcast_wrapper, int main_tid, int tail_tid) { int tid = threadIdx.x + blockIdx.x * blockDim.x; // Vectorized calculation of major data whose length is the max multipler of // VecSize, // eg: Calcualting the front 1024-length data in total 1027 data once VecSize // is 4. if (tid < main_tid) { - VectorizedBroadcastKernelImpl( - broadcast_warpper, tid); + VectorizedBroadcastKernelImpl( + broadcast_wrapper, tid); } // Scalarzed calculation of rest data whose lenght cannot fulfill VecSize. // eg: Calcualting the rest 3-length data in total 1027 data once VecSize is // 4. if (tid < tail_tid) { - ScalarizedBroadcastKernelImpl( - broadcast_warpper, tid); + ScalarizedBroadcastKernelImpl( + broadcast_wrapper, tid); } } @@ -367,7 +366,7 @@ void LaunchBroadcastKernelForDifferentDimSize( const std::vector &ins, framework::Tensor *out, int axis, Functor func) { int numel = out->numel(); - const int threads = 256; + int threads = GetThreadsConfig(ctx, numel, VecSize); int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads; int main_tid = numel / VecSize; int tail_tid = numel % VecSize; @@ -380,75 +379,75 @@ void LaunchBroadcastKernelForDifferentDimSize( switch (merge_dims.dim_size) { case 1: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 2: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 3: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 4: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 5: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, 
offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 6: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 7: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } case 8: { - auto broadcast_warpper = - BroadcastArgsWarpper( + auto broadcast_wrapper = + BroadcastArgsWrapper( ins, out, vec_len, func, offset_calculator); - ElementwiseBroadcastKernel<<>>( - broadcast_warpper, main_tid, tail_tid); + broadcast_wrapper, main_tid, tail_tid); break; } default: { @@ -473,11 +472,11 @@ void LaunchBroadcastElementwiseCudaKernel( int in_vec_size = 4; framework::Tensor *out = (*outs)[0]; for (auto *in : ins) { - auto temp_size = GetVectorizedSizeImpl(in->data()); + auto temp_size = platform::GetVectorizedSize(in->data()); in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size) : in_vec_size; } - int out_vec_size = GetVectorizedSizeImpl(out->data()); + int out_vec_size = platform::GetVectorizedSize(out->data()); int vec_size = std::min(out_vec_size, in_vec_size); switch (vec_size) { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h index 101512e35fdcb7..3bd746ace06103 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h @@ -26,7 +26,7 @@ limitations under the License. */ namespace paddle { namespace operators { -enum ElementwiseType { kUnary = 1, kBinary = 2 }; +enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3 }; /* * According to NVIDIA, if number of threads per block is 64/128/256/512, @@ -52,98 +52,73 @@ inline int GetThreadsConfig(const platform::CUDADeviceContext &ctx, return std::max(64, threads); } -/* -* Only the address of input data is the multiplier of 1,2,4, vectorized load -* with corresponding multiplier-value is possible. Moreover, the maximum length -* of vectorized load is 128 bits once. Hence, valid length of vectorized load -* shall be determined under both former constraints. -*/ -template -int GetVectorizedSizeImpl(const T *pointer) { - constexpr int max_load_bits = 128; - int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); - uint64_t address = reinterpret_cast(pointer); - constexpr int vec8 = - std::alignment_of>::value; // NOLINT - constexpr int vec4 = - std::alignment_of>::value; // NOLINT - constexpr int vec2 = - std::alignment_of>::value; // NOLINT - if (address % vec8 == 0) { - /* - * Currently, decide to deal with no more than 4 data once while adopting - * vectorization load/store, if performance test shows that dealing with - * 8 data once in vectorization load/store does get optimized, return code - * below can be changed into " return std::min(8, valid_vec_size); " . 
- */ - return std::min(4, valid_vec_size); - } else if (address % vec4 == 0) { - return std::min(4, valid_vec_size); - } else if (address % vec2 == 0) { - return std::min(2, valid_vec_size); - } else { - return 1; - } -} - template -int GetVectorizedSize(const std::vector &ins, - const std::vector &outs) { +int GetVectorizedSizeForIO(const std::vector &ins, + const std::vector &outs) { int vec_size = 4; for (auto iter = ins.begin(); iter != ins.end(); ++iter) { - vec_size = - std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); + vec_size = std::min(vec_size, + platform::GetVectorizedSize((*iter)->data())); } for (auto iter = outs.begin(); iter != outs.end(); ++iter) { - vec_size = - std::min(vec_size, GetVectorizedSizeImpl((*iter)->data())); + vec_size = std::min( + vec_size, platform::GetVectorizedSize((*iter)->data())); } return vec_size; } template struct ElementwiseDataWrapper { - OutT *out; - const InT *in0; - const InT *in1; - __device__ ElementwiseDataWrapper(OutT *out, const InT *in0, - const InT *in1 = nullptr) - : out(out), in0(in0), in1(in1) {} - - using InVecType = CudaAlignedVector; - using OutVecType = CudaAlignedVector; - - inline __device__ void load_vector(InVecType args[], int idx) { - const InVecType *x_vec = reinterpret_cast(in0); - args[0] = x_vec[idx]; - if (ET == ElementwiseType::kBinary) { - const InVecType *y_vec = reinterpret_cast(in1); - args[1] = y_vec[idx]; + using InVecType = platform::CudaAlignedVector; + using OutVecType = platform::CudaAlignedVector; + + const InT *__restrict__ in_data[ET]; + OutT *out_data; + uint32_t scalar_cal_offset; + + HOSTDEVICE ElementwiseDataWrapper( + const std::vector &ins, + std::vector *outs, uint32_t scalar_cal_offset) + : scalar_cal_offset(scalar_cal_offset) { +#pragma unroll + for (int i = 0; i < ET; ++i) { + in_data[i] = ins[i]->data(); + } + out_data = (*outs)[0]->data(); + } + + inline __device__ void LoadVectorizedData(InVecType vec_args[], int tid) { +#pragma unroll + for (int i = 0; i < ET; ++i) { + const InVecType *in_vec_data = + reinterpret_cast(in_data[i]); + vec_args[i] = in_vec_data[tid]; } } - inline __device__ void load_scalar(InT args[], int idx) { - args[0] = in0[idx]; - if (ET == ElementwiseType::kBinary) { - args[1] = in1[idx]; + inline __device__ void LoadScalarizedData(InT args[], int tid) { +#pragma unroll + for (int i = 0; i < ET; ++i) { + args[i] = in_data[i][tid + scalar_cal_offset]; } } - inline __device__ void store_vector(OutVecType res, int idx) { - OutVecType *out_vec = reinterpret_cast(out); - out_vec[idx] = res; + inline __device__ void StoreVectorizedData(OutVecType res, int tid) { + OutVecType *out_vec = reinterpret_cast(out_data); + out_vec[tid] = res; } - inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; } + inline __device__ void StoreScalarizedData(OutT res, int tid) { + out_data[tid + scalar_cal_offset] = res; + } }; -template -__device__ inline void VectorizedKernelImpl( - ElementwiseDataWrapper data, Functor func, - int tid) { - using InVecType = CudaAlignedVector; - using OutVecType = CudaAlignedVector; +template +__device__ inline void VectorizedKernelImpl(ElementwiseWrapper data, + Functor func, int tid) { + using InVecType = platform::CudaAlignedVector; + using OutVecType = platform::CudaAlignedVector; InVecType ins_vec[ET]; OutVecType out_vec; InT *ins_ptr[ET]; @@ -153,7 +128,7 @@ __device__ inline void VectorizedKernelImpl( ins_ptr[i] = reinterpret_cast(&(ins_vec[i])); } // load - data.load_vector(ins_vec, tid); + data.LoadVectorizedData(ins_vec, 
tid); // compute #pragma unroll @@ -165,52 +140,48 @@ __device__ inline void VectorizedKernelImpl( out_vec.val[i] = func(ins); } // store - data.store_vector(out_vec, tid); + data.StoreVectorizedData(out_vec, tid); } -template -__device__ inline void ScalarKernelImpl( - ElementwiseDataWrapper data, Functor func, - int start, int remain) { +template +__device__ inline void ScalarKernelImpl(ElementwiseWrapper data, Functor func, + int tid) { InT ins[ET]; OutT out; - for (int i = 0; i < remain; ++i) { - int idx = start + i; - // load - data.load_scalar(ins, idx); - // compute - out = func(ins); - // store - data.store_scalar(out, idx); - } + // load + data.LoadScalarizedData(ins, tid); + // compute + out = func(ins); + // store + data.StoreScalarizedData(out, tid); } -template -__global__ void VectorizedKernel(const InT *__restrict__ in0, - const InT *__restrict__ in1, OutT *out, - int size, Functor func) { +template +__global__ void VectorizedKernel(ElementwiseWrapper data, int main_tid, + int tail_tid, Functor func) { int tid = blockIdx.x * blockDim.x + threadIdx.x; - int remain = size - VecSize * tid; - remain = remain > 0 ? remain : 0; - auto data = ElementwiseDataWrapper(out, in0, in1); - if (remain >= VecSize) { - VectorizedKernelImpl(data, func, tid); - } else { - ScalarKernelImpl(data, func, tid * VecSize, remain); + + if (tid < main_tid) { + VectorizedKernelImpl( + data, func, tid); + } + if (tid < tail_tid) { + ScalarKernelImpl(data, func, + tid); } } -template -__global__ void ScalarKernel(const InT *__restrict__ in0, - const InT *__restrict__ in1, OutT *out, int size, - Functor func) { - auto data = ElementwiseDataWrapper(out, in0, in1); +template +__global__ void ScalarKernel(ElementwiseWrapper data, int numel, Functor func) { int tid = blockIdx.x * blockDim.x + threadIdx.x; - int remain = tid < size ? 1 : 0; - ScalarKernelImpl(data, func, tid, remain); + if (tid < numel) { + ScalarKernelImpl(data, func, + tid); + } } template @@ -219,35 +190,48 @@ void LaunchSameDimsElementwiseCudaKernel( const std::vector &ins, std::vector *outs, Functor func) { // calculate the max vec_size for all ins and outs - auto size = ins[0]->numel(); - int vec_size = GetVectorizedSize(ins, *outs); - int block_size = GetThreadsConfig(ctx, size, vec_size); + auto numel = ins[0]->numel(); + int vec_size = GetVectorizedSizeForIO(ins, *outs); + int block_size = GetThreadsConfig(ctx, numel, vec_size); int grid_size = - ((size + vec_size - 1) / vec_size + block_size - 1) / block_size; - const InT *in0 = ins[0]->data(); - const InT *in1 = - (ET == ElementwiseType::kBinary) ? 
ins[1]->data() : nullptr; - OutT *out = (*outs)[0]->data(); + ((numel + vec_size - 1) / vec_size + block_size - 1) / block_size; + int main_tid = numel / vec_size; + int tail_tid = numel % vec_size; + uint32_t vec_len = main_tid * vec_size; + // cuda kernel auto stream = ctx.stream(); switch (vec_size) { - case 4: - VectorizedKernel<<>>( - in0, in1, out, size, func); + case 4: { + auto data_wrapper = + ElementwiseDataWrapper(ins, outs, vec_len); + VectorizedKernel<<>>( + data_wrapper, main_tid, tail_tid, func); break; - case 2: - VectorizedKernel<<>>( - in0, in1, out, size, func); + } + case 2: { + auto data_wrapper = + ElementwiseDataWrapper(ins, outs, vec_len); + VectorizedKernel<<>>( + data_wrapper, main_tid, tail_tid, func); break; - case 1: - ScalarKernel<<>>(in0, in1, out, - size, func); + } + case 1: { + auto data_wrapper = + ElementwiseDataWrapper(ins, outs, 0); + ScalarKernel<<>>(data_wrapper, + numel, func); break; - default: + } + default: { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported vectorized size: %d !", vec_size)); break; + } } } diff --git a/paddle/fluid/operators/expand_as_op.h b/paddle/fluid/operators/expand_as_op.h old mode 100755 new mode 100644 index 406455af741715..07ba0e5ad87133 --- a/paddle/fluid/operators/expand_as_op.h +++ b/paddle/fluid/operators/expand_as_op.h @@ -13,42 +13,12 @@ limitations under the License. */ #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. -#define EXPAND_AS_TEMPLATE(z, n, data) \ - case n + 1: { \ - ExpandAs(context); \ - break; \ - } -#define REP_EXPAND_AS_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_AS_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_AS_GRAD_CASE(n) \ - case n + 1: { \ - ExpandAsBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define EXPAND_AS_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), EXPAND_AS_GRAD_CASE(n), ) -#define REP_EXPAND_AS_GRAD_TEMPLATE(n) \ - BOOST_PP_REPEAT(n, EXPAND_AS_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -67,7 +37,24 @@ class ExpandAsKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& context) const override { auto rank = context.Input("X")->dims().size(); switch (rank) { - REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandAs<1>(context); + break; + case 2: + ExpandAs<2>(context); + break; + case 3: + ExpandAs<3>(context); + break; + case 4: + ExpandAs<4>(context); + break; + case 5: + ExpandAs<5>(context); + break; + case 6: + ExpandAs<6>(context); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. 
But received " @@ -165,7 +152,24 @@ class ExpandAsGradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/expand_as_v2_op.h b/paddle/fluid/operators/expand_as_v2_op.h old mode 100755 new mode 100644 index 6df4c592378cb2..3e8f7d15880bcd --- a/paddle/fluid/operators/expand_as_v2_op.h +++ b/paddle/fluid/operators/expand_as_v2_op.h @@ -14,42 +14,12 @@ limitations under the License. */ #include #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. 
-#define EXPAND_AS_TEMPLATE(z, n, data) \ - case n + 1: { \ - ExpandAs(context); \ - break; \ - } -#define REP_EXPAND_AS_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_AS_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_AS_GRAD_CASE(n) \ - case n + 1: { \ - ExpandAsBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define EXPAND_AS_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), EXPAND_AS_GRAD_CASE(n), ) -#define REP_EXPAND_AS_GRAD_TEMPLATE(n) \ - BOOST_PP_REPEAT(n, EXPAND_AS_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -85,7 +55,26 @@ class ExpandAsV2Kernel : public framework::OpKernel { "expand_as_v2 op must be less than or equal to %d.", target_rank, MAX_RANK_SUPPORTED)); - switch (target_rank) { REP_EXPAND_AS_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (target_rank) { + case 1: + ExpandAs<1>(context); + break; + case 2: + ExpandAs<2>(context); + break; + case 3: + ExpandAs<3>(context); + break; + case 4: + ExpandAs<4>(context); + break; + case 5: + ExpandAs<5>(context); + break; + case 6: + ExpandAs<6>(context); + break; + } } protected: @@ -186,7 +175,24 @@ class ExpandAsV2GradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_EXPAND_AS_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandAsBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + ExpandAsBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + ExpandAsBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + ExpandAsBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + ExpandAsBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + ExpandAsBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/expand_as_v2_op_npu.cc b/paddle/fluid/operators/expand_as_v2_op_npu.cc new file mode 100644 index 00000000000000..76cb12330b5cd3 --- /dev/null +++ b/paddle/fluid/operators/expand_as_v2_op_npu.cc @@ -0,0 +1,96 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/expand_as_v2_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class ExpandAsV2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + auto target_shape = context.Attr>("target_shape"); + auto target_rank = target_shape.size(); + PADDLE_ENFORCE_GE(target_rank, rank, + platform::errors::InvalidArgument( + "The rank (%d) of the input 'target_tensor' for " + "expand_as_v2 op must be greater than or equal to " + "the rank (%d) of the input 'x'.", + target_rank, rank)); + PADDLE_ENFORCE_GE(rank, 1, platform::errors::InvalidArgument( + "The rank (%d) of the input 'x' for " + "expand_as_v2 op must be positive.", + rank)); + PADDLE_ENFORCE_LE(target_rank, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The rank (%d) of the input 'target_tensor' for " + "expand_as_v2 op must be less than or equal to %d.", + target_rank, MAX_RANK_SUPPORTED)); + ExpandAs(context); + } + + protected: + void ExpandAs(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + auto in_dims = in0->dims(); + auto target_shape = context.Attr>("target_shape"); + auto vec_in_dims = framework::vectorize(in_dims); + auto diff = target_shape.size() - vec_in_dims.size(); + vec_in_dims.insert(vec_in_dims.begin(), diff, 1); + + for (size_t i = 0; i < vec_in_dims.size(); ++i) { + PADDLE_ENFORCE_NE(target_shape[i], 0, + platform::errors::InvalidArgument( + "The value of target shape cannot be zero.")); + if (vec_in_dims[i] != 1) { + PADDLE_ENFORCE_EQ( + vec_in_dims[i], target_shape[i], + platform::errors::InvalidArgument( + "The value (%d) of the non-singleton dimension does not match" + " the corresponding value (%d) in " + "target tensor for expand_as_v2 op.", + vec_in_dims[i], target_shape[i])); + } + } + auto* out0 = context.Output("Out"); + + framework::DDim out_dims = framework::make_ddim(target_shape); + + out0->Resize(out_dims); + out0->mutable_data(context.GetPlace()); + + const auto& runner = + NpuOpRunner("ExpandD", {*in0}, {*out0}, {{"shape", target_shape}}); + + auto stream = + context.template device_context() + .stream(); + + runner.Run(stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + expand_as_v2, + ops::ExpandAsV2NPUKernel, + ops::ExpandAsV2NPUKernel, + ops::ExpandAsV2NPUKernel, + ops::ExpandAsV2NPUKernel, + ops::ExpandAsV2NPUKernel); diff --git a/paddle/fluid/operators/expand_op.h b/paddle/fluid/operators/expand_op.h old mode 100755 new mode 100644 index e566d69096595c..809bad1d6c1eec --- a/paddle/fluid/operators/expand_op.h +++ b/paddle/fluid/operators/expand_op.h @@ -16,41 +16,12 @@ limitations under the License. */ #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). 
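ExpandAsV2NPUKernel above left-pads the input shape with 1s to the target rank and requires every non-singleton dimension to equal the target value. A host-only sketch of that compatibility check (hypothetical CanExpandAs helper, not part of the patch) is:

#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Returns true when `in_dims`, left-padded with 1s to the target rank, can be
// expanded to `target`: every padded dimension is 1 or equals the target.
bool CanExpandAs(std::vector<int64_t> in_dims,
                 const std::vector<int64_t>& target) {
  if (in_dims.size() > target.size()) return false;
  in_dims.insert(in_dims.begin(), target.size() - in_dims.size(), 1);
  for (size_t i = 0; i < target.size(); ++i) {
    if (target[i] == 0) {
      throw std::invalid_argument("target shape must not contain zeros");
    }
    if (in_dims[i] != 1 && in_dims[i] != target[i]) return false;
  }
  return true;
}

int main() {
  assert(CanExpandAs({3, 1}, {2, 3, 4}));   // [3,1] -> [1,3,1] -> [2,3,4]
  assert(!CanExpandAs({3, 2}, {2, 3, 4}));  // last dim 2 is neither 1 nor 4
  return 0;
}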
-// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. -#define EXPAND_TEMPLATE(z, n, data) \ - case n + 1: { \ - Expand(context); \ - break; \ - } -#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_GRAD_CASE(n) \ - case n + 1: { \ - ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define EXPAND_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) -#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -137,7 +108,26 @@ class ExpandKernel : public framework::OpKernel { "The number of dimensions of the input 'x' for Op(expand) " "must be less than or equal to %d, but the value received is %d.", MAX_RANK_SUPPORTED, rank)); - switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (rank) { + case 1: + Expand<1>(context); + break; + case 2: + Expand<2>(context); + break; + case 3: + Expand<3>(context); + break; + case 4: + Expand<4>(context); + break; + case 5: + Expand<5>(context); + break; + case 6: + Expand<6>(context); + break; + } } protected: @@ -233,7 +223,24 @@ class ExpandGradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + ExpandBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + ExpandBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + ExpandBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + ExpandBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + ExpandBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/expand_op_npu.cc b/paddle/fluid/operators/expand_op_npu.cc index 76d5a203f306b9..2f66316c483a9c 100644 --- a/paddle/fluid/operators/expand_op_npu.cc +++ b/paddle/fluid/operators/expand_op_npu.cc @@ -39,7 +39,26 @@ class ExpandNPUKernel : public framework::OpKernel { "The number of dimensions of the input 'x' for Op(expand) " "must be less than or equal to %d, but the value received is %d.", MAX_RANK_SUPPORTED, rank)); - switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (rank) { + case 1: + Expand<1>(context); + break; + case 2: + Expand<2>(context); + break; + case 3: + Expand<3>(context); + break; + case 4: + Expand<4>(context); + break; + case 5: + Expand<5>(context); + break; + case 6: + Expand<6>(context); + break; + } } protected: diff --git a/paddle/fluid/operators/expand_v2_op.h b/paddle/fluid/operators/expand_v2_op.h old mode 100755 new mode 100644 index 8a87a067c51f11..a720bd7b551823 --- a/paddle/fluid/operators/expand_v2_op.h +++ b/paddle/fluid/operators/expand_v2_op.h @@ -17,41 +17,12 @@ limitations under the License. */ #include #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. 
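ExpandAsBackward/ExpandBackward take a reshape_dims_vec / reduce_dims_vec pair because the gradient of an expand is the sum of dOut over the replicated copies. A tiny illustration of that reduction for the 1-D case, assuming the copies are laid out as [times, n] in row-major order (which is how the reshape/reduce pair treats them), is:

#include <cassert>
#include <vector>

// Gradient of a 1-D expand: dout has length times * n; view it as [times, n]
// and sum over the first axis to recover dx of length n.
std::vector<float> ExpandGrad1D(const std::vector<float>& dout, int n,
                                int times) {
  std::vector<float> dx(n, 0.f);
  for (int t = 0; t < times; ++t) {
    for (int i = 0; i < n; ++i) dx[i] += dout[t * n + i];
  }
  return dx;
}

int main() {
  // x has shape [2], expand_times = [3], so dout has shape [6].
  std::vector<float> dout = {1, 2, 3, 4, 5, 6};
  auto dx = ExpandGrad1D(dout, /*n=*/2, /*times=*/3);
  assert(dx[0] == 1 + 3 + 5 && dx[1] == 2 + 4 + 6);  // dx = [9, 12]
  return 0;
}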
-// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. -#define EXPAND_TEMPLATE(z, n, data) \ - case n + 1: { \ - Expand(context); \ - break; \ - } -#define REP_EXPAND_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define EXPAND_GRAD_CASE(n) \ - case n + 1: { \ - ExpandBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define EXPAND_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), EXPAND_GRAD_CASE(n), ) -#define REP_EXPAND_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, EXPAND_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -132,7 +103,26 @@ class ExpandV2Kernel : public framework::OpKernel { "less than or equal to %d.", shape_size, MAX_RANK_SUPPORTED)); rank = std::max(rank, static_cast(shape_size)); - switch (rank) { REP_EXPAND_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (rank) { + case 1: + Expand<1>(context); + break; + case 2: + Expand<2>(context); + break; + case 3: + Expand<3>(context); + break; + case 4: + Expand<4>(context); + break; + case 5: + Expand<5>(context); + break; + case 6: + Expand<6>(context); + break; + } } protected: @@ -271,7 +261,24 @@ class ExpandV2GradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_EXPAND_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + ExpandBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + ExpandBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + ExpandBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + ExpandBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + ExpandBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + ExpandBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/eye_op_npu.cc b/paddle/fluid/operators/eye_op_npu.cc new file mode 100644 index 00000000000000..c23f24b78441f1 --- /dev/null +++ b/paddle/fluid/operators/eye_op_npu.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/eye_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class EyeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto num_rows = ctx.Attr("num_rows"); + + auto d_nums = ctx.Attr("dtype"); + auto dtype = + ConvertToNpuDtype(static_cast(d_nums)); + + auto num_columns = ctx.Attr("num_columns"); + if (num_columns == -1) num_columns = num_rows; + + framework::NPUAttributeMap attr_input = { + {"num_rows", num_rows}, {"num_columns", num_columns}, {"dtype", dtype}}; + + auto* out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + const auto& runner = NpuOpRunner("Eye", {}, {*out}, attr_input); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + eye, ops::EyeNPUKernel, + ops::EyeNPUKernel, + ops::EyeNPUKernel); diff --git a/paddle/fluid/operators/fill_any_like_op.h b/paddle/fluid/operators/fill_any_like_op.h index 9c514ed3aaa38f..2fb7bf985f222a 100644 --- a/paddle/fluid/operators/fill_any_like_op.h +++ b/paddle/fluid/operators/fill_any_like_op.h @@ -45,15 +45,18 @@ class FillAnyLikeKernel : public framework::OpKernel { static_cast(std::numeric_limits::lowest())) && (common_type_value <= static_cast(std::numeric_limits::max())), - true, platform::errors::InvalidArgument( - "filled value is out of range for" - " targeted type in fill_any_like, your kernel type is %s" - ", please check value you set.", - typeid(T).name())); + true, + platform::errors::InvalidArgument( + "The filled value is out of range for target type, " + "current kernel type is %s, the range should between %f " + "and %f, but now value is %f.", + typeid(T).name(), + static_cast(std::numeric_limits::lowest()), + static_cast(std::numeric_limits::max()), value)); + PADDLE_ENFORCE_EQ( std::isnan(value), false, - platform::errors::InvalidArgument("filled value should not be NaN," - " but received NaN")); + platform::errors::InvalidArgument("The filled value is NaN.")); math::SetConstant setter; setter(context.template device_context(), out, diff --git a/paddle/fluid/operators/fill_any_like_op_npu.cc b/paddle/fluid/operators/fill_any_like_op_npu.cc new file mode 100644 index 00000000000000..d5204f5cacfc68 --- /dev/null +++ b/paddle/fluid/operators/fill_any_like_op_npu.cc @@ -0,0 +1,79 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fill_any_like_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class FillAnyLikeNPUKernel : public framework::OpKernel { + public: + using CommonType = typename std::common_type< + float, + typename std::conditional::value, + float, T>::type>::type; + + void Compute(const framework::ExecutionContext& context) const override { + auto data_type = static_cast( + context.Attr("dtype")); + auto* out = context.Output("Out"); + out->mutable_data(context.GetPlace()); + + float value = context.Attr("value"); + + auto common_type_value = static_cast(value); + + PADDLE_ENFORCE_EQ( + (common_type_value >= + static_cast(std::numeric_limits::lowest())) && + (common_type_value <= + static_cast(std::numeric_limits::max())), + true, + platform::errors::InvalidArgument( + "The filled value is out of range for target type, " + "current kernel type is %s, the range should between %f " + "and %f, but now value is %f.", + typeid(T).name(), + static_cast(std::numeric_limits::lowest()), + static_cast(std::numeric_limits::max()), value)); + + PADDLE_ENFORCE_EQ( + std::isnan(value), false, + platform::errors::InvalidArgument("The filled value is NaN.")); + + Tensor tensor_tmp(data_type); + tensor_tmp.mutable_data({1}, context.GetPlace()); + FillNpuTensorWithConstant(&tensor_tmp, static_cast(value)); + + auto stream = + context.template device_context() + .stream(); + + auto shape = out->dims(); + const auto& runner = NpuOpRunner("FillD", {tensor_tmp}, {*out}, + {{"dims", framework::vectorize(shape)}}); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(fill_any_like, ops::FillAnyLikeNPUKernel, + ops::FillAnyLikeNPUKernel, + ops::FillAnyLikeNPUKernel); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc new file mode 100644 index 00000000000000..7edddce65cc6f5 --- /dev/null +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -0,0 +1,97 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fill_constant_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" +#include "paddle/fluid/operators/utils.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto data_type = + static_cast(ctx.Attr("dtype")); + auto float_value = ctx.Attr("value"); + auto str_value = ctx.Attr("str_value"); + auto force_cpu = ctx.Attr("force_cpu"); + + auto *out = ctx.Output("Out"); + auto *input = ctx.Input("Input"); + if (&ctx.Attr("input_dim_idx") == 0) { + // set the correct batch size. 
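Both fill_any_like kernels above guard the float attribute with the same range check before filling: the value is cast to a common type and compared against the numeric limits of the kernel's element type. A standalone sketch of that check (simplified to plain std::common_type<float, T>, without the float16 special case) is:

#include <cmath>
#include <cstdint>
#include <iostream>
#include <limits>
#include <stdexcept>
#include <type_traits>

// Reject fill values the target element type T cannot represent, mirroring
// the PADDLE_ENFORCE_EQ checks above (minus the float16 handling).
template <typename T>
void CheckFillValue(float value) {
  using CommonType = typename std::common_type<float, T>::type;
  if (std::isnan(value)) throw std::invalid_argument("fill value is NaN");
  const auto v = static_cast<CommonType>(value);
  if (v < static_cast<CommonType>(std::numeric_limits<T>::lowest()) ||
      v > static_cast<CommonType>(std::numeric_limits<T>::max())) {
    throw std::out_of_range("fill value out of range for the target type");
  }
}

int main() {
  CheckFillValue<int32_t>(7.0f);      // representable, passes
  try {
    CheckFillValue<int8_t>(1000.0f);  // 1000 does not fit into int8_t
  } catch (const std::out_of_range&) {
    std::cout << "rejected out-of-range fill value\n";
  }
  return 0;
}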
+ auto odims = out->dims(); + int input_dim_idx = ctx.Attr("input_dim_idx"); + int output_dim_idx = ctx.Attr("output_dim_idx"); + odims[output_dim_idx] = input->dims()[input_dim_idx]; + out->mutable_data(odims, ctx.GetPlace()); + } + + T value; + if (str_value.empty()) { + value = static_cast(float_value); + } else { + std::stringstream convert_stream(str_value); + if (std::is_same::value) { + int64_t tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } else { + double tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } + } + + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(ctx.GetPlace()); + bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); + if (cpu_place) { + math::SetConstant functor; + out->mutable_data(platform::CPUPlace(), data_type); + functor(reinterpret_cast(dev_ctx), + out, static_cast(value)); + } else { + out->mutable_data(ctx.GetPlace(), data_type); + Tensor tensor_tmp(data_type); + tensor_tmp.mutable_data({1}, ctx.GetPlace()); + FillNpuTensorWithConstant(&tensor_tmp, value); + + auto stream = + ctx.template device_context() + .stream(); + const auto &runner = + NpuOpRunner("FillD", {tensor_tmp}, {*out}, + {{"dims", framework::vectorize(out->dims())}}); + runner.Run(stream); + } + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + fill_constant_batch_size_like, + ops::FillConstantBatchSizeLikeOpNPUKernel< + paddle::platform::NPUDeviceContext, float>, + ops::FillConstantBatchSizeLikeOpNPUKernel< + paddle::platform::NPUDeviceContext, int>, + ops::FillConstantBatchSizeLikeOpNPUKernel< + paddle::platform::NPUDeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index d465e77ea1886f..0dcbb6e727de78 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -36,7 +36,6 @@ class FillConstantOp : public framework::OperatorWithKernel { i, shape[i], framework::make_ddim(shape))); } } - if (shape.empty() && ctx->HasInput("ShapeTensor")) { auto shape_dims = ctx->GetInputDim("ShapeTensor"); int num_ele = 1; diff --git a/paddle/fluid/operators/flatten_op_npu.cc b/paddle/fluid/operators/flatten_op_npu.cc new file mode 100644 index 00000000000000..1569760fe3b96f --- /dev/null +++ b/paddle/fluid/operators/flatten_op_npu.cc @@ -0,0 +1,112 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
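The fill_constant_batch_size_like NPU kernel above takes the constant either from the float attribute or from str_value, parsing the string through a stringstream as int64_t or double before the final cast to T. A minimal sketch of that parsing rule (hypothetical GetFillValue helper) is:

#include <cassert>
#include <cstdint>
#include <sstream>
#include <string>
#include <type_traits>

// Resolve the constant from (float_value, str_value): a non-empty str_value
// wins and is parsed as int64_t or double before the cast to T, so large
// integer constants are not squeezed through a float first.
template <typename T>
T GetFillValue(float float_value, const std::string& str_value) {
  if (str_value.empty()) return static_cast<T>(float_value);
  std::stringstream convert_stream(str_value);
  if (std::is_same<T, int64_t>::value) {
    int64_t tmp = 0;
    convert_stream >> tmp;
    return static_cast<T>(tmp);
  }
  double tmp = 0.0;
  convert_stream >> tmp;
  return static_cast<T>(tmp);
}

int main() {
  assert(GetFillValue<int64_t>(0.f, "123456789012345") == 123456789012345LL);
  assert(GetFillValue<float>(2.5f, "") == 2.5f);
  return 0;
}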
+#include "paddle/fluid/operators/flatten_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class Flatten2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *in = context.Input("X"); + auto *out = context.Output("Out"); + auto &axis = context.Attr("axis"); + out->mutable_data(context.GetPlace(), in->type()); + framework::NPUAttributeMap attr_input = {{"axis", axis}}; + + auto stream = + context.template device_context() + .stream(); + const auto &runner = NpuOpRunner("FlattenV2", {*in}, {*out}, attr_input); + runner.Run(stream); + } +}; + +template +class Flatten2GradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *d_x = ctx.Output(framework::GradVarName("X")); + auto *d_out = + ctx.Input(framework::GradVarName("Out")); + + auto xshape_dims = ctx.Input("XShape")->dims(); + auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size()); + + d_x->mutable_data(ctx.GetPlace(), d_out->type()); + framework::TensorCopy( + *d_out, ctx.GetPlace(), + ctx.template device_context(), d_x); + d_x->Resize(x_dims); + } +}; + +using Tensor = framework::Tensor; + +template +class FlattenContiguousRangeNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *X = ctx.Input("X"); + auto *Out = ctx.Output("Out"); + int start_axis = ctx.Attr("start_axis"); + int stop_axis = ctx.Attr("stop_axis"); + + Out->mutable_data(ctx.GetPlace()); + + const auto &runner = + NpuOpRunner("FlattenV2", {*X}, {*Out}, + {{"axis", static_cast(start_axis)}, + {"end_axis", static_cast(stop_axis)}}); + auto stream = + ctx.template device_context() + .stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL(flatten2, ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel, + ops::Flatten2NPUKernel); +REGISTER_OP_NPU_KERNEL(flatten2_grad, ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel, + ops::Flatten2GradNPUKernel); + +REGISTER_OP_NPU_KERNEL( + flatten_contiguous_range, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel, + ops::FlattenContiguousRangeNPUKernel); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 05d521be5a1064..b9fbd18cf146c8 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -71,8 +71,17 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { if (with_prefetch) { OP_INOUT_CHECK(ctx->HasOutput("W_Out"), "Output", "W_Out", "hsigmoid"); } - const int64_t batch_size = ctx->GetInputDim("X")[0]; - std::vector output_shape({batch_size, 1}); + const int64_t input_dims = ctx->GetInputDim("X")[0]; + const int64_t label_dims = ctx->GetInputDim("Label")[0]; + PADDLE_ENFORCE_EQ(input_dims, label_dims, + platform::errors::InvalidArgument( + "The first dimension of " + "input and label is expected to be the same. 
" + "But received input's first dimension is %d; " + "label's first dimension is %d.", + input_dims, label_dims)); + + std::vector output_shape({input_dims, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->ShareLoD("X", /*->*/ "Out"); } diff --git a/paddle/fluid/operators/index_select_op_npu.cc b/paddle/fluid/operators/index_select_op_npu.cc new file mode 100644 index 00000000000000..8df6c4e5d9ea72 --- /dev/null +++ b/paddle/fluid/operators/index_select_op_npu.cc @@ -0,0 +1,57 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/index_select_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class IndexSelectNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *x = ctx.Input("X"); + auto *index = ctx.Input("Index"); + auto dim = ctx.Attr("dim"); + + auto *out = ctx.Output("Out"); + out->mutable_data(ctx.GetPlace()); + + auto stream = + ctx.template device_context() + .stream(); + + NpuOpRunner runner; + runner.SetType("GatherV2") + .AddInput(*x) + .AddInput(*index) + .AddInput(std::vector{dim}) + .AddOutput(*out); + runner.Run(stream); + } +}; + +// todo: add class 'IndexSelectGradNPUKernel' here. + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + index_select, + ops::IndexSelectNPUKernel, + ops::IndexSelectNPUKernel, + ops::IndexSelectNPUKernel); +// todo: register npu index_select_grad kernel here. diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/fluid/operators/kernel_primitives/compute_primitives.h index 1d23cfe007558f..ccd301aa8ca3d4 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/compute_primitives.h @@ -14,8 +14,140 @@ #pragma once +#ifdef PADDLE_WITH_CUDA +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#endif + +#include +#include "paddle/fluid/platform/float16.h" + namespace paddle { namespace operators { -namespace kernel_primitives {} +namespace kernel_primitives { +namespace details { + +template +class MPTypeTrait { + public: + using Type = T; +}; + +template <> +class MPTypeTrait { + public: + using Type = float; +}; + +} // namespace details + +/*************************** Compute Functor****************************/ +template +struct DivFunctor { + inline HOSTDEVICE T operator()(const T* args) const { + return args[0] / args[1]; + } +}; + +template +struct DivFunctor::value>> { + inline HOSTDEVICE T operator()(const T* args) const { + PADDLE_ENFORCE(args[1] != 0, + platform::errors::InvalidArgument( + "Invalid Argument Error: Integer division by zero " + "encountered in divide. 
Please check the input value.")); + return args[0] / args[1]; + } +}; + +/*************************** Compute Function****************************/ + +/** + * @brief compute functor for elementwise_two, in1 and in2 has the same shape + * @param: + * T : the type of in1 and in2 + * NX: the row of in1 and in2 + * NY: the col of in1 and in2 + * BlockSize: the strid of col + * OpFunc: compute functor eg: ADD, SUB, XOR, OR, MUL + */ +template +__device__ __forceinline__ void ElementwiseBinary(OutT* out, const T* in1, + const T* in2, + OpFunc compute) { + T args[2]; +#pragma unroll + for (int idx = 0; idx < NX * NY; ++idx) { + args[0] = in1[idx]; + args[1] = in2[idx]; + out[idx] = static_cast(compute(args)); + } +} + +/** + * @brief fma eg: a * b + c, in1 in2, in3 and out has the same shape + * @param: + * T : the type of in1 and in2, in3 + * NX: the row of in1, in2 and in3 + * NY: the col of in1, in2 and in3 + * BlockSize: the strid of col + */ +template +__device__ __forceinline__ void ElementwiseFma(OutT* out, const T* in1, + const T* in2, const T* in3, + OpFunc compute) { +#pragma unroll + for (int idx = 0; idx < NX * NY; ++idx) { + out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); + } } + +/** + * @brief compute functor for elementwise_two, in1 is [1, NY], in2 is [NX, NY] + * @param: + * T : the type of in1 and in2 + * NX: the row of in1 and in2 + * NY: the col of in2 + * BlockSize: the strid of col + * OpFunc: compute functor eg: ADD, SUB, XOR, OR, MUL + */ +template +__device__ __forceinline__ void CycleBinary(OutT* out, const T* in1, + const T* in2, OpFunc compute) { +#pragma unroll + for (int idx = 0; idx < NX; idx++) { +#pragma unroll + for (int idy = 0; idy < NY; idy++) { + out[idx + idy * NX] = + static_cast(compute(in1[idx], in2[idx + idy * NX])); + } + } } + +/** + * @brief compute functor for unary, in1 is [NX, NY] + * @param: + * T : the type of in + * NX: the row of in + * NY: the col of in + * BlockSize: the strid of col + * OpFunc: compute functor eg: relu, sigmoid, exp + */ +template +__device__ __forceinline__ void ElementwiseUnary(OutT* out, const T* in, + OpFunc compute) { +#pragma unroll + for (int idx = 0; idx < NX * NY; idx++) { + out[idx] = static_cast(compute(in + idx)); + } +} + +} // namespace kernel_primitives +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h index 1d23cfe007558f..d520c33ca9bccf 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/fluid/operators/kernel_primitives/datamover_primitives.h @@ -13,9 +13,205 @@ // limitations under the License. 
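kernel_primitives::ElementwiseBinary above walks an NX*NY register tile and hands both operands to the functor through a two-element args array, the same calling convention DivFunctor uses. A CPU-side analogue that can be compiled and checked without CUDA (hypothetical names, not the device code) is:

#include <cassert>

// Host-side analogue of kernel_primitives::ElementwiseBinary: walk an NX * NY
// tile and pass both operands to the functor through a two-element args array.
template <typename T, typename OutT, int NX, int NY, typename OpFunc>
void ElementwiseBinaryHost(OutT* out, const T* in1, const T* in2,
                           OpFunc compute) {
  T args[2];
  for (int idx = 0; idx < NX * NY; ++idx) {
    args[0] = in1[idx];
    args[1] = in2[idx];
    out[idx] = static_cast<OutT>(compute(args));
  }
}

struct AddFunctor {
  float operator()(const float* args) const { return args[0] + args[1]; }
};

int main() {
  float a[4] = {1, 2, 3, 4};
  float b[4] = {10, 20, 30, 40};
  float c[4] = {0, 0, 0, 0};
  ElementwiseBinaryHost<float, float, 2, 2>(c, a, b, AddFunctor());
  assert(c[0] == 11 && c[3] == 44);
  return 0;
}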
#pragma once +#include +#include +#include +#include +#include namespace paddle { namespace operators { -namespace kernel_primitives {} +namespace kernel_primitives { +namespace details { + +#define INT_BITS 32 + +template +struct alignas(sizeof(T) * VecSize) VectorType { + T val[VecSize]; +}; + +struct FastDivMod { + // 1st value represents the result of input number divides by recorded divisor + // 2nd value represents the result of input number modulo by recorded divisor + using DivModT = VectorType; + + FastDivMod() {} + HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) { + static_assert(sizeof(unsigned int) == 4, + "Only Support 32-bit unsigned int."); + + for (shift_val = 0; shift_val < INT_BITS; ++shift_val) { + auto shift_limit = 1 << shift_val; + if (shift_limit >= divisor) break; + } + uint64_t long_one = 1; + uint64_t temp_div = + ((long_one << INT_BITS) * ((long_one << shift_val) - divisor)) / + divisor + + 1; + multiplier = temp_div; + } + + __device__ __forceinline__ uint32_t Div(uint32_t n) const { + uint32_t t = __umulhi(n, multiplier); + return (t + n) >> shift_val; + } + + __device__ __forceinline__ DivModT Divmod(uint32_t n) const { + uint32_t q = Div(n); + DivModT result = {q, n - q * divisor}; + return result; + } + + int32_t divisor; + int32_t shift_val; + uint32_t multiplier; +}; + +template +struct BroadcastConfig { + FastDivMod divmoders[kDims]; + uint32_t strides[framework::DDim::kMaxRank]; + HOSTDEVICE BroadcastConfig() {} + + HOSTDEVICE BroadcastConfig(const std::vector& out_dims, + const std::vector& in_dims, + int dim_size) { + std::vector strides_in; + std::vector divmoders_in; + // for divmoders + divmoders_in.resize(dim_size); + for (int i = 0; i < dim_size; ++i) { + divmoders_in[i] = FastDivMod(out_dims[i]); + } + // for strides + strides_in.resize(dim_size, 1); + for (int i = 0; i < dim_size; ++i) { + strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i]; + strides_in[i] = + (i != 0 && strides_in[i] != 0) + ? std::accumulate(in_dims.begin(), in_dims.begin() + i, 1, + std::multiplies()) + : strides_in[i]; + } + + memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); + memcpy(divmoders, divmoders_in.data(), kDims * sizeof(FastDivMod)); + } +}; + +#undef INT_BITS +} // namespace details + +template +__device__ __forceinline__ void ReadDataBase(T* dst, const T* __restrict__ src, + int size) { + int dx = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if ((idx + dx) >= size) { + break; + } + dst[idx] = src[idx + dx]; + } +} + +template +__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, + int size) { + const int VECTOR_SIZE = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 
2 : 1; + const int VECTORS_PER_THREAD = NX / VECTOR_SIZE; + + // Vector per thread + if (blockDim.x * NX > size) { + ReadDataBase(dst, src, size); + } else { + // Vector type + using VecType = details::VectorType; + VecType vec_temp[VECTORS_PER_THREAD]; + const VecType* vec_input = reinterpret_cast(src); + ReadDataBase( + vec_temp, vec_input, VECTORS_PER_THREAD * blockDim.x); +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + dst[idx] = *(reinterpret_cast(vec_temp) + idx); + } + } +} + +/** @brief: ReadDataBc + * read data from src ptr when the shape of src and dst are different + * @param: + * src: the source pointer + * dst: the dst pointer + * stride_nx: the stride of src + * stride_ny: the stride of src + * the shape of dst is [NY, NX] + */ +template +__device__ __forceinline__ void ReadDataBc( + T* dst, const T* __restrict__ src, uint32_t fix, + details::BroadcastConfig config, int num, int stride_nx, + int stride_ny) { + uint32_t base_offset = fix + threadIdx.x * NX; + uint32_t offset = 0; + +#pragma unroll + for (int ny = 0; ny < NY; ++ny) { +#pragma unroll + for (uint32_t nx = 0; nx < NX; ++nx) { + uint32_t idx = base_offset + ny * stride_ny + nx * stride_nx; + if (idx < num) { + offset = 0; +#pragma unroll + for (int i = 0; i < ShapeSize; ++i) { + auto fast_divmoder = config.divmoders[i].Divmod(idx); + idx = fast_divmoder.val[0]; + offset += fast_divmoder.val[1] * config.strides[i]; + } + dst[nx + ny * NX] = src[offset]; + } + } + } +} + +template +__device__ __forceinline__ void WriteDataBase(T* dst, const T* __restrict__ src, + int size) { + int dx = threadIdx.x * NX; +#pragma unroll + for (int idx = 0; idx < NX; ++idx) { + if ((idx + dx) >= size) { + break; + } + dst[idx + dx] = src[idx]; + } } + +template +__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, + int size) { + const int VECTOR_SIZE = (NX % 4 == 0) ? 4 : (NX % 2 == 0) ? 2 : 1; + const int VECTORS_PER_THREAD = NX / VECTOR_SIZE; + + // Vector per thread + if (blockDim.x * NX > size) { + WriteDataBase(dst, src, size); + } else { + // Vector type + using VecType = details::VectorType; + VecType vec_temp[VECTORS_PER_THREAD]; +#pragma unroll + for (int idx = 0; idx < VECTORS_PER_THREAD; ++idx) { + vec_temp[idx] = *(reinterpret_cast(src) + idx); + } + VecType* vec_dst = reinterpret_cast(dst); + WriteDataBase( + vec_dst, vec_temp, VECTORS_PER_THREAD * blockDim.x); + } } + +} // namespace kernel_primitives +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/log_softmax_op.h b/paddle/fluid/operators/log_softmax_op.h index c732ec5a2da0ab..162087a75662d7 100644 --- a/paddle/fluid/operators/log_softmax_op.h +++ b/paddle/fluid/operators/log_softmax_op.h @@ -131,8 +131,10 @@ class LogSoftmaxKernel : public framework::OpKernel { // allocate memory on device. Out->mutable_data(context.GetPlace()); - LogSoftmaxFunctor()( - context.template device_context(), X, Out, axis); + if (X->numel() != 0) { + LogSoftmaxFunctor()( + context.template device_context(), X, Out, axis); + } } }; @@ -183,8 +185,11 @@ class LogSoftmaxGradKernel : public framework::OpKernel { // allocate memory on device. 
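To make the arithmetic behind the FastDivMod helper added above concrete, here is a small host-only sanity check that mirrors the constructor's multiplier/shift recipe; on the device the high 32 bits of the product come from __umulhi, emulated below with a 64-bit multiply. This is an illustration only, not part of the patch.

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t d = 24;              // divisor recorded at construction
      uint32_t shift = 0;
      while ((1u << shift) < d) ++shift;  // smallest shift with 2^shift >= d
      const uint64_t one = 1;
      const uint32_t multiplier = static_cast<uint32_t>(
          ((one << 32) * ((one << shift) - d)) / d + 1);
      for (uint32_t n = 0; n < 1000000; ++n) {
        uint32_t t = static_cast<uint32_t>((uint64_t{n} * multiplier) >> 32);
        uint32_t q = (t + n) >> shift;    // FastDivMod::Div
        assert(q == n / d);               // Divmod's remainder is then n - q * d
      }
      return 0;
    }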
dX->mutable_data(context.GetPlace()); - LogSoftmaxGradFunctor()( - context.template device_context(), Out, dOut, dX, axis); + if (Out->numel() != 0) { + LogSoftmaxGradFunctor()( + context.template device_context(), Out, dOut, dX, + axis); + } } }; diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 6c1ee863737011..83b4e89fe046f4 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -83,6 +83,12 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, const int axis, std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + // TODO(zcd): Add input data validity checking size_t num = outputs->size(); diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu index f9cce061383939..b9481f1c8e40e2 100644 --- a/paddle/fluid/operators/math/concat_and_split.cu +++ b/paddle/fluid/operators/math/concat_and_split.cu @@ -352,6 +352,12 @@ class SplitFunctor { const framework::Tensor& input, const std::vector& ref_inputs, int axis, std::vector* outputs) { + // NOTE(zhiqiu): split a tensor of shape [0,3,4] at axis=1, result in 3 + // tensors of shape [0,1,4] + if (input.numel() == 0) { + return; + } + // TODO(zcd): Add input data validity checking int o_num = outputs->size(); int64_t out_row = 1; diff --git a/paddle/fluid/operators/meshgrid_op.h b/paddle/fluid/operators/meshgrid_op.h old mode 100755 new mode 100644 index 2aad894e11d4b4..e01469f26d74fa --- a/paddle/fluid/operators/meshgrid_op.h +++ b/paddle/fluid/operators/meshgrid_op.h @@ -16,12 +16,6 @@ #include -#include -#include -#include -#include -#include - #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" @@ -29,31 +23,6 @@ #include "paddle/fluid/platform/errors.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. 
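A note on the macro block removed just below: for MAX_RANK_SUPPORTED = 6, BOOST_PP_REPEAT instantiates n = 0..5, and COND(n) compares n with n % 6, which equals n in that range, so the condition is always true. The grad-side repetition therefore also degenerated to exactly six cases, and the plain switch statements that replace the macros are a faithful, dependency-free rewrite rather than a behavior change. Sketch of what the removed repetition expanded to:

    // BOOST_PP_REPEAT(6, MESHGRID_GRAD_TEMPLATE, ~), n = 0..5, case label n + 1:
    case 1: { MeshgridBackward<1>(context); break; }
    case 2: { MeshgridBackward<2>(context); break; }
    // ... up through ...
    case 6: { MeshgridBackward<6>(context); break; }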
-#define MESHGRID_TEMPLATE(z, n, data) \ - case n + 1: { \ - MeshgridForward(context); \ - break; \ - } -#define REP_MESHGRID_TEMPLATE(n) BOOST_PP_REPEAT(n, MESHGRID_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) - -#define MESHGRID_GRAD_CASE(n) \ - case n + 1: { \ - MeshgridBackward(context); \ - break; \ - } -#define MESHGRID_GRAD_TEMPLATE(z, n, data) \ - BOOST_PP_IF(COND(n), MESHGRID_GRAD_CASE(n), ) -#define REP_MESHGRID_GRAD_TEMPLATE(n) \ - BOOST_PP_REPEAT(n, MESHGRID_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -65,7 +34,24 @@ class MeshgridKernel : public framework::OpKernel { auto ins = context.MultiInput("X"); auto rank = ins.size(); switch (rank) { - REP_MESHGRID_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + MeshgridForward<1>(context); + break; + case 2: + MeshgridForward<2>(context); + break; + case 3: + MeshgridForward<3>(context); + break; + case 4: + MeshgridForward<4>(context); + break; + case 5: + MeshgridForward<5>(context); + break; + case 6: + MeshgridForward<6>(context); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Excepted Tensor numbers between 1 and 6, but only received d% .", @@ -141,7 +127,24 @@ class MeshgridGradKernel : public framework::OpKernel { context.MultiInput(framework::GradVarName("Out")); int n = out_grad.size(); switch (n) { - REP_MESHGRID_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + MeshgridBackward<1>(context); + break; + case 2: + MeshgridBackward<2>(context); + break; + case 3: + MeshgridBackward<3>(context); + break; + case 4: + MeshgridBackward<4>(context); + break; + case 5: + MeshgridBackward<5>(context); + break; + case 6: + MeshgridBackward<6>(context); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Excepted Tensor numbers between 1 and 6, but only received d% .", diff --git a/paddle/fluid/operators/npu_op_runner.cc b/paddle/fluid/operators/npu_op_runner.cc index 4461941e85c2a5..bb6549c111988e 100644 --- a/paddle/fluid/operators/npu_op_runner.cc +++ b/paddle/fluid/operators/npu_op_runner.cc @@ -33,6 +33,7 @@ static std::map DTYPE_2_ACL_DTYPE = { {framework::proto::VarType::BOOL, ACL_BOOL}, {framework::proto::VarType::UINT8, ACL_UINT8}, + {framework::proto::VarType::INT8, ACL_INT8}, {framework::proto::VarType::INT16, ACL_INT16}, {framework::proto::VarType::INT32, ACL_INT32}, {framework::proto::VarType::INT64, ACL_INT64}, @@ -240,6 +241,38 @@ NpuOpRunner &NpuOpRunner::AddInput(std::vector &&dims) { return *this; } +NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(platform::CPUPlace())); + Tensor host_tensor; + TensorFromVector(values, *dev_ctx, &host_tensor); + host_tensors_.emplace_back(host_tensor); + + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST)); + // create aclDataBuffer + input_buffers_.emplace_back(CreateDataBuffer(host_tensor)); + + return *this; +} + +NpuOpRunner &NpuOpRunner::AddInput(std::vector &&values) { + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = + static_cast(pool.Get(platform::CPUPlace())); + Tensor host_tensor; + TensorFromVector(values, *dev_ctx, &host_tensor); + host_tensors_.emplace_back(host_tensor); + + // create aclTensorDesc + input_descs_.emplace_back(CreateTensorDesc(host_tensor, ACL_MEMTYPE_HOST)); + // create aclDataBuffer + 
input_buffers_.emplace_back(CreateDataBuffer(host_tensor)); + + return *this; +} + NpuOpRunner &NpuOpRunner::AddOutput(const Tensor &tensor) { // create aclTensorDesc output_descs_.emplace_back(CreateTensorDesc(tensor)); diff --git a/paddle/fluid/operators/npu_op_runner.h b/paddle/fluid/operators/npu_op_runner.h index 2257c209550d60..45e973970a956d 100644 --- a/paddle/fluid/operators/npu_op_runner.h +++ b/paddle/fluid/operators/npu_op_runner.h @@ -71,6 +71,10 @@ class NpuOpRunner { NpuOpRunner &AddInput(std::vector &&dims); + NpuOpRunner &AddInput(std::vector &&values); + + NpuOpRunner &AddInput(std::vector &&values); + NpuOpRunner &AddOutput(const Tensor &tensor); NpuOpRunner &AddInputs(const std::vector &tensors); diff --git a/paddle/fluid/operators/one_hot_op_npu.cc b/paddle/fluid/operators/one_hot_op_npu.cc new file mode 100644 index 00000000000000..1cf99d844c8887 --- /dev/null +++ b/paddle/fluid/operators/one_hot_op_npu.cc @@ -0,0 +1,82 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/one_hot_op.h" + +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +using Tensor = framework::Tensor; + +template +class OneHotNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = + ctx.template device_context(); + auto* in = ctx.Input("X"); + auto* out = ctx.Output("Out"); + int depth = ctx.Attr("depth"); + + if (ctx.HasInput("depth_tensor")) { + auto* depth_tensor = ctx.Input("depth_tensor"); + std::vector depth_data; + framework::TensorToVector(*depth_tensor, dev_ctx, &depth_data); + depth = depth_data[0]; + auto in_dims = in->dims(); + framework::DDim out_dims(in_dims); + out_dims[out_dims.size() - 1] = depth; + out->Resize(out_dims); + } + out->mutable_data(ctx.GetPlace()); + + float on_value = 1.0f, off_value = 0.0f; + if (in->type() == framework::proto::VarType::INT32) { + NpuOpRunner runner; + runner.SetType("OneHot") + .AddInput(*in) + .AddInput(std::vector({static_cast(depth)})) + .AddInput(std::vector({on_value})) + .AddInput(std::vector({off_value})) + .AddAttr("axis", -1) + .AddOutput(*out); + runner.Run(dev_ctx.stream()); + } else { + Tensor transformed_in; + transformed_in.mutable_data(in->dims(), dev_ctx.GetPlace()); + const auto& cast_runner = NpuOpRunner("Cast", {*in}, {transformed_in}, + {{"dst_type", ACL_INT32}}); + cast_runner.Run(dev_ctx.stream()); + NpuOpRunner runner; + runner.SetType("OneHot") + .AddInput(transformed_in) + .AddInput(std::vector({static_cast(depth)})) + .AddInput(std::vector({on_value})) + .AddInput(std::vector({off_value})) + .AddAttr("axis", -1) + .AddOutput(*out); + runner.Run(dev_ctx.stream()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL(one_hot, ops::OneHotNPUKernel, + ops::OneHotNPUKernel); diff --git 
a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu index 99a5caaad6ab80..674326f90c504d 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -15,7 +15,6 @@ #include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" -// reduce_prod REGISTER_OP_CUDA_KERNEL( reduce_all, ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu index c7eafa2ac8760a..b7b0eb598249b1 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -16,7 +16,6 @@ #include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" -// reduce_prod REGISTER_OP_CUDA_KERNEL( reduce_any, ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu index 50d2fcdee23bd9..b5d5bb33d0a880 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_mean_op.cu @@ -13,58 +13,11 @@ // limitations under the License. #include -#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_mean_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" -namespace paddle { -namespace operators { - -template -struct DivideFunctor { - HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((T)(1.0 / n)) {} - - HOSTDEVICE inline T operator()(const T& x) const { return x * n_inv; } - - private: - T n_inv; -}; - -template -class ReduceMeanKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? 
e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - DivideFunctor(reduce_num), stream); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL(reduce_mean, ops::ReduceMeanKernel, - ops::ReduceMeanKernel, - ops::ReduceMeanKernel); +REGISTER_OP_CUDA_KERNEL( + reduce_mean, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h index fd329acaf5ff21..fe77d3158ed27c 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.cu.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.cu.h @@ -33,6 +33,7 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/fast_divmod.h" @@ -145,7 +146,6 @@ using Tensor = framework::Tensor; constexpr int kMaxRank = framework::DDim::kMaxRank; enum ReduceType { - kReduceAll = 0x00, // when reduce_rank == x_rank kReduceLastDim = 0x01, // when reduce_dim[0] == x_dim.size() - 1; kReduceHigherDim = 0x02, // ReduceFirstDim or reduceSecondDim kReduceAny = 0x03, // when reduce_dim.size() > 1 @@ -158,12 +158,13 @@ struct IndexCalculator { : dim(dim) { dims = detail::VectorToArray(cal_dims); strides = detail::VectorToArray(full_strides); - std::vector cal_divmoders; + std::vector cal_divmoders; // fast divmod for (auto i : cal_strides) { - cal_divmoders.push_back(FastDivMod(i)); + cal_divmoders.push_back(platform::FastDivMod(i)); } - divmoders = detail::VectorToArray(cal_divmoders); + divmoders = + detail::VectorToArray(cal_divmoders); } __device__ inline int Get(int offset) const { @@ -183,7 +184,7 @@ struct IndexCalculator { int dim; framework::Array dims; framework::Array strides; - framework::Array divmoders; + framework::Array divmoders; }; // reduce config @@ -338,15 +339,11 @@ struct ReduceConfig { void SetReduceType() { int rank = x_dim.size(); int reduce_rank = reduce_dim.size(); - bool is_large_enough = (reduce_num > REDUCE_SPLIT_BOUNDARY / 2) || - (left_num > REDUCE_SPLIT_BOUNDARY); - - if (rank == reduce_rank) { - reduce_type = static_cast(ReduceType::kReduceAll); - } else if (rank == 2 && reduce_rank == 1 && reduce_dim[0] == 1) { + bool is_last_dim = + (rank == 2) && (reduce_rank == 1) && (reduce_dim[0] == 1); + if (rank == reduce_rank || is_last_dim) { reduce_type = static_cast(ReduceType::kReduceLastDim); - } else if (reduce_rank == 1 && - ((rank == 2 && is_large_enough) || rank != 2)) { + } else if (reduce_rank == 1) { // ReduceFirstDim and reduceSecondDim reduce_type = static_cast(ReduceType::kReduceHigherDim); } else { @@ -576,14 +573,15 @@ static __device__ T BlockYReduce(T val, ReduceOp reducer) { // eg: x_dim = {nz, ny, nx}, nx != 1, axis can be 0 or 1 // if axis = 1 then grid.z = nz, grid.y = ny / block_size, grid.x = nx / 32 // else grid.z = 1, grid.y = ny / block_size, grid.x = nx /32 -template +template __device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, MPType init, int reduce_num, int left_num, int block_size) { int idx 
= blockIdx.x * blockDim.x + threadIdx.x; int idy = blockIdx.y * block_size; - Ty reduce_var = init; + MPType reduce_var = init; if (idx < left_num) { int loop = reduce_num - idy; @@ -591,24 +589,24 @@ __device__ void ReduceHigherDim(const Tx* x, Ty* y, ReduceOp reducer, for (int iy = 0; iy < loop; iy++) { int id = (idy + iy) * left_num + idx + blockIdx.z * reduce_num * left_num; - reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); + reduce_var = reducer(reduce_var, static_cast(transformer(x[id]))); } y[idx + blockIdx.y * left_num + blockIdx.z * gridDim.y * left_num] = - reduce_var; + static_cast(reduce_var); } } // when reduce_dim.size() == 1 and reduce_dim[0] == x_dim.size() - 1, or // when reduce_dim.size() != 1 and reduce_dim.size() != x_dim.size(), this // function will be used -template +template __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, + TransformOp transformer, MPType init, int reduce_num, int left_num, bool reduce_lastdim, - ReduceIndexCal reduce_index_calculator, - LeftIndexCal left_index_calculator) { + const IndexCalculator& reduce_index_calculator, + const IndexCalculator& left_index_calculator) { int input_idx, left_idx, stride; // the last dim gets involved in reduction if (reduce_lastdim) { @@ -621,9 +619,9 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, stride = gridDim.y * blockDim.y; } // calculate the offset, means the addr where each thread really start. - int input_offset = left_index_calculator(left_idx); + int input_offset = left_index_calculator.Get(left_idx); const Tx* input = x + input_offset; - Ty reduce_var = init; + MPType reduce_var = init; // 1. reduce for each thread if (left_idx < left_num) { @@ -634,12 +632,13 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, #pragma unroll for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { int reduce_idx = input_idx + i * stride; - int idx_x = reduce_index_calculator(reduce_idx); + int idx_x = reduce_index_calculator.Get(reduce_idx); input_reg[i] = input[idx_x]; } #pragma unroll for (int i = 0; i < REDUCE_VEC_SIZE; ++i) { - reduce_var = reducer(reduce_var, transformer(input_reg[i])); + reduce_var = + reducer(reduce_var, static_cast(transformer(input_reg[i]))); } input_idx += REDUCE_VEC_SIZE * stride; } @@ -652,7 +651,7 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, break; } int reduce_idx = input_idx; - int idx_x = reduce_index_calculator(reduce_idx); + int idx_x = reduce_index_calculator.Get(reduce_idx); input_reg[i] = input[idx_x]; input_idx += stride; } @@ -662,7 +661,8 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, if (input_idx >= reduce_num) { break; } - reduce_var = reducer(reduce_var, transformer(input_reg[i])); + reduce_var = + reducer(reduce_var, static_cast(transformer(input_reg[i]))); input_idx += stride; } } @@ -677,63 +677,56 @@ __device__ void ReduceAny(const Tx* x, Ty* y, ReduceOp reducer, // 3. 
reduce in block x reduce_var = BlockXReduce(reduce_var, reducer); if (left_idx < left_num && threadIdx.x == 0) { - y[blockIdx.y * left_num + left_idx] = reduce_var; + y[blockIdx.y * left_num + left_idx] = static_cast(reduce_var); } } else { if (left_idx < left_num && threadIdx.y == 0) { - y[blockIdx.y * left_num + left_idx] = reduce_var; + y[blockIdx.y * left_num + left_idx] = static_cast(reduce_var); } } } // module function designed for global function -template +template __device__ void ReduceModule(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, int reduce_num, - int left_num, int blocking_size, int reduce_type, - bool reduce_lastdim, + TransformOp transformer, MPType init, + int reduce_num, int left_num, int blocking_size, + int reduce_type, bool reduce_lastdim, const IndexCalculator& reduce_index_calculator, const IndexCalculator& left_index_calculator) { - if (reduce_type == ReduceType::kReduceLastDim) { - ReduceAny( + if (reduce_type == ReduceType::kReduceLastDim || + reduce_type == ReduceType::kReduceAny) { + ReduceAny( x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim, - [&](int idx) { return idx; }, - [&](int idx) { return idx * reduce_num; }); - + reduce_index_calculator, left_index_calculator); // reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1 } else if (reduce_type == ReduceType::kReduceHigherDim) { - ReduceHigherDim( + ReduceHigherDim( x, y, reducer, transformer, init, reduce_num, left_num, blocking_size); - - // reduce_rank >= 2 - } else { - ReduceAny( - x, y, reducer, transformer, init, reduce_num, left_num, reduce_lastdim, - [&](int idx) { return reduce_index_calculator.Get(idx); }, - [&](int idx) { return left_index_calculator.Get(idx); }); } } -template +template __global__ void ReduceKernelFunction(const Tx* x, Ty* y, ReduceOp reducer, - TransformOp transformer, Ty init, + TransformOp transformer, MPType init, int reduce_num, int left_num, int blocking_size, int reduce_type, bool reduce_lastdim, IndexCalculator reduce_index_calculator, IndexCalculator left_index_calculator) { - ReduceModule( + ReduceModule( x, y, reducer, transformer, init, reduce_num, left_num, blocking_size, reduce_type, reduce_lastdim, reduce_index_calculator, left_index_calculator); } -template +template static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, - const ReduceOp& reducer, Ty init, + const ReduceOp& reducer, MPType init, gpuStream_t stream, ReduceConfig config) { using TransformOp = typename ReduceOp::Transformer; - int reduce_rank = config.reduce_strides.size(); int left_rank = config.left_strides.size(); auto reduce_index_calculator = IndexCalculator( @@ -741,7 +734,7 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, auto left_index_calculator = IndexCalculator( left_rank, config.left_dim, config.left_strides, config.x_strides); - ReduceKernelFunction<<>>( x_data, config.output_data, reducer, TransformOp(config.reduce_num), init, config.reduce_num, config.left_num, config.blocking_size, @@ -759,10 +752,11 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data, grid = dim3(config.grid.x, 1, config.grid.z); } - ReduceKernelFunction><<>>( + ReduceKernelFunction< + Ty, Ty, MPType, ReduceOp, + detail::IdentityFunctor><<>>( config.output_data, y_data, reducer, - detail::IdentityFunctor(config.grid.y), init, config.grid.y, + detail::IdentityFunctor(config.grid.y), init, config.grid.y, config.left_num, config.grid.y, ReduceType::kReduceHigherDim, config.reduce_lastdim, reduce_index_calculator, 
left_index_calculator); } @@ -793,11 +787,12 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, } config.SetOutputData(y_data, x.place(), &tmp); - - using TransformOp = typename ReduceOp::Transformer; - auto reducer = ReduceOp(); - // launch CUB::Reduce - if (config.reduce_type == static_cast(ReduceType::kReduceAll)) { + bool use_cub_reduce = (config.left_num == 1) && + (!std::is_same::value); + if (use_cub_reduce) { + // launch CUB::Reduce + using TransformOp = typename ReduceOp::Transformer; + auto reducer = ReduceOp(); cub::TransformInputIterator trans_x( x_data, TransformOp(config.reduce_num)); size_t temp_storage_bytes = 0; @@ -815,7 +810,9 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y, return; } - LaunchReduceKernel>( + using MPType = typename details::MPTypeTrait::Type; + auto reducer = ReduceOp(); + LaunchReduceKernel>( x_data, y_data, reducer, reducer.initial(), stream, config); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 1c36cebe70a77e..af01b71adb78e3 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -591,7 +591,6 @@ class ReduceGradOp : public framework::OperatorWithKernel { (in_dtype >= 0) ? static_cast(in_dtype) : OperatorWithKernel::IndicateVarDataType( ctx, framework::GradVarName("Out")); - #ifdef PADDLE_WITH_MKLDNN auto CanMKLDNNReduceGradBeUsed = [&]() { auto dx_dims = ctx.Input("X")->dims(); diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc new file mode 100644 index 00000000000000..834b63f199e37d --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op_npu.cc @@ -0,0 +1,101 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the Licnse. 
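One detail of the reduce_op.cu.h changes above worth spelling out: the accumulator is no longer the output type Ty but the MPType obtained from MPTypeTrait (via the newly included amp/fp16_type_traits.h), so float16 inputs are accumulated in float and only the final value is cast back to Ty. A minimal sketch of the mapping, assuming the trait is specialized for platform::float16 in the same way as the MPTypeTrait added to compute_primitives.h:

    // Sketch only; assumes <type_traits> plus the fp16_type_traits header.
    using FP16Acc = typename details::MPTypeTrait<platform::float16>::Type;
    static_assert(std::is_same<FP16Acc, float>::value,
                  "float16 reductions accumulate in float");
    using F64Acc = typename details::MPTypeTrait<double>::Type;
    static_assert(std::is_same<F64Acc, double>::value,
                  "all other types accumulate in themselves");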
*/ + +#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +class ReduceProdNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + auto dims = ctx.Attr>("dim"); + bool keep_dim = ctx.Attr("keep_dim"); + bool reduce_all = ctx.Attr("reduce_all"); + int out_dtype = ctx.Attr("out_dtype"); + + auto place = ctx.GetPlace(); + + framework::Tensor cast_out(x->type()); + cast_out.Resize(out->dims()); + cast_out.mutable_data(place); + + auto cast_out_dtype = x->type(); + if (out_dtype != -1) { + cast_out_dtype = static_cast(out_dtype); + } + + if (x->type() != cast_out_dtype) { + if (cast_out_dtype == framework::proto::VarType::FP32) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP16) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT16) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT32) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::INT64) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::FP64) { + out->mutable_data(place); + } else if (cast_out_dtype == framework::proto::VarType::BOOL) { + out->mutable_data(place); + } + } else { + out->ShareDataWith(cast_out); + } + + framework::NPUAttributeMap attr_input = {{"axes", dims}, + {"keep_dims", keep_dim}}; + + if (reduce_all) { + std::vector dim_vec; + for (int i = 0; i < x->dims().size(); i++) { + dim_vec.push_back(i); + } + + attr_input = {{"axes", dim_vec}, {"keep_dims", keep_dim}}; + } + + auto stream = + ctx.template device_context() + .stream(); + + const auto& runner = + NpuOpRunner("ReduceProdD", {*x}, {cast_out}, attr_input); + runner.Run(stream); + + if (x->type() != cast_out_dtype) { + auto dst_dtype = ConvertToNpuDtype(cast_out_dtype); + const auto& runner_cast = + NpuOpRunner("Cast", {cast_out}, {*out}, + {{"dst_type", static_cast(dst_dtype)}}); + runner_cast.Run(stream); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_NPU_KERNEL( + reduce_prod, ops::ReduceProdNPUKernel, + ops::ReduceProdNPUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu index efbafe4aa8c3e0..27a29a5b095056 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cu @@ -11,72 +11,18 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
- -#include "paddle/fluid/operators/reduce_ops/cub_reduce.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" - -namespace paddle { -namespace operators { - -template -struct IdentityFunctor { - HOSTDEVICE explicit inline IdentityFunctor() {} - - template - HOSTDEVICE inline Tout operator()(const U& x) const { - return static_cast(x); - } -}; - -template -class ReduceSumKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - bool reduce_all = context.Attr("reduce_all"); - auto* input = context.Input("X"); - auto* output = context.Output("Out"); - auto out_dtype = context.Attr("out_dtype"); - - auto dims = context.Attr>("dim"); - bool keep_dim = context.Attr("keep_dim"); - - std::vector reduce_dims; - if (reduce_all) { - reduce_dims.resize(input->dims().size()); - for (int i = 0; i < reduce_dims.size(); ++i) reduce_dims[i] = i; - } else { - for (auto e : dims) { - reduce_dims.push_back(e >= 0 ? e : e + input->dims().size()); - } - } - - int reduce_num = 1; - for (int i = 0; i < reduce_dims.size(); ++i) { - reduce_num *= input->dims()[reduce_dims[i]]; - } - - auto stream = context.cuda_device_context().stream(); - if (out_dtype >= 0) { - framework::VisitDataTypeSmall( - static_cast(out_dtype), - TensorReduceFunctor( - *input, output, reduce_dims, static_cast(0.0), cub::Sum(), - stream)); - } else { - TensorReduce>( - *input, output, reduce_dims, static_cast(0), cub::Sum(), - IdentityFunctor(), stream); - } - } -}; - -} // namespace operators -} // namespace paddle - REGISTER_OP_CUDA_KERNEL( - reduce_sum, ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel, ops::ReduceSumKernel, - ops::ReduceSumKernel, - ops::ReduceSumKernel>, - ops::ReduceSumKernel>); + reduce_sum, ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + ops::ReduceCudaKernel, + paddle::operators::CustomSum>, + ops::ReduceCudaKernel, + paddle::operators::CustomSum>); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc new file mode 100644 index 00000000000000..aa84da10ad6531 --- /dev/null +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op_npu.cc @@ -0,0 +1,138 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/sequence_ops/sequence_mask_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SequenceMaskNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + auto* x = ctx.Input("X"); + auto* y = ctx.Output("Y"); + int maxlen = ctx.Attr("maxlen"); + + if (ctx.HasInput("MaxLenTensor")) { + auto max_len_tensor = ctx.Input("MaxLenTensor"); + PADDLE_ENFORCE_NOT_NULL(max_len_tensor, + platform::errors::InvalidArgument( + "Input(MaxLenTensor) should not be NULL." + "But received Input(MaxLenTensor) is NULL")); + framework::Tensor temp; + TensorCopySync(*max_len_tensor, platform::CPUPlace(), &temp); + maxlen = *temp.data(); + PADDLE_ENFORCE_GT( + maxlen, 0, + platform::errors::InvalidArgument( + "Input(MaxLenTensor) value should be greater than 0. But " + "received Input(MaxLenTensor) value = %d.", + maxlen)); + } + + if (maxlen < 0) { + auto x_numel = x->numel(); + std::vector x_vec; + framework::TensorToVector(*x, dev_ctx, &x_vec); + auto x_data = x_vec.data(); + maxlen = static_cast(*std::max_element(x_data, x_data + x_numel)); + } + auto y_dim = framework::vectorize(x->dims()); + y_dim.push_back(maxlen); + + Tensor cast_x; + cast_x.mutable_data(x->dims(), ctx.GetPlace()); + const auto& cast1_runner = + NpuOpRunner("Cast", {*x}, {cast_x}, + {{"dst_type", ConvertToNpuDtype(cast_x.type())}}); + cast1_runner.Run(dev_ctx.stream()); + + Tensor tmp; + tmp.mutable_data(framework::make_ddim({maxlen}), ctx.GetPlace()); + NpuOpRunner range_runner; + range_runner.SetType("Range"); + range_runner.AddInput(std::vector({0})); + range_runner.AddInput(std::vector({maxlen})); + range_runner.AddInput(std::vector({1})); + range_runner.AddOutput(tmp); + range_runner.Run(dev_ctx.stream()); + + Tensor expand_tmp; + expand_tmp.mutable_data(framework::make_ddim(y_dim), + ctx.GetPlace()); + const auto& expand_runner = + NpuOpRunner("ExpandD", {tmp}, {expand_tmp}, {{"shape", y_dim}}); + expand_runner.Run(dev_ctx.stream()); + + auto x_dims = framework::vectorize(x->dims()); + x_dims.push_back(1); + cast_x.Resize(framework::make_ddim({x_dims})); + Tensor x_tmp; + x_tmp.mutable_data(framework::make_ddim(y_dim), ctx.GetPlace()); + const auto& tile_runner = + NpuOpRunner("TileWithAxis", {cast_x}, {x_tmp}, + {{"axis", x->dims().size()}, {"tiles", maxlen}}); + tile_runner.Run(dev_ctx.stream()); + + Tensor y_tmp; + y_tmp.mutable_data(framework::make_ddim(y_dim), ctx.GetPlace()); + const auto& less_runner = + NpuOpRunner("Less", {expand_tmp, x_tmp}, {y_tmp}, {}); + less_runner.Run(dev_ctx.stream()); + + y->Resize(framework::make_ddim(y_dim)); + auto out_dtype = static_cast( + ctx.Attr("out_dtype")); + if (out_dtype == framework::proto::VarType::INT32) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::INT64) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::FP32) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::FP64) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::BOOL) { + y->mutable_data(ctx.GetPlace()); + } else if (out_dtype == framework::proto::VarType::UINT8) { + y->mutable_data(ctx.GetPlace()); + } else { + PADDLE_ENFORCE(false, + platform::errors::InvalidArgument( + "out_dtype only supporing int32, int64, fp32, 
fp64, " + "bool, uint8, but receive out_dtype is %d", + out_dtype)); + } + + const auto& cast2_runner = NpuOpRunner( + "Cast", {y_tmp}, {*y}, {{"dst_type", ConvertToNpuDtype(out_dtype)}}); + cast2_runner.Run(dev_ctx.stream()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + sequence_mask, ops::SequenceMaskNPUKernel, + ops::SequenceMaskNPUKernel, + ops::SequenceMaskNPUKernel, + ops::SequenceMaskNPUKernel); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.h b/paddle/fluid/operators/softmax_with_cross_entropy_op.h index 74316841a13b17..29528ae0d29925 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.h +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.h @@ -111,15 +111,12 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { const Tensor* labels = context.Input("Label"); Tensor* logit_grad = context.Output(framework::GradVarName("Logits")); - const Tensor* softmax = context.Input("Softmax"); const bool use_softmax = context.Attr("use_softmax"); - if (logit_grad != softmax || !use_softmax) { framework::TensorCopy(*softmax, context.GetPlace(), context.device_context(), logit_grad); } - const bool soft_label = context.Attr("soft_label"); auto ignore_index = context.Attr("ignore_index"); @@ -133,7 +130,6 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { logit_grad_2d.ShareDataWith(*logit_grad).Resize({n, d}); labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n}); out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim}); - auto out_grad_mat = framework::EigenMatrix::From(out_grad_2d); auto logit_grad_mat = framework::EigenMatrix::From(logit_grad_2d); auto& place = *context.template device_context() @@ -147,9 +143,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { logit_grad_mat.device(place) = out_grad_mat.broadcast(Eigen::DSizes(1, axis_dim)) * logit_grad_mat; - } - // use_softmax step2 - else { + } else { + // use_softmax step2 const int64_t* label_data = labels->data(); T* logit_grad_data = logit_grad->data(); const T* out_grad_data = out_grad->data(); @@ -180,7 +175,6 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel { } return; } - // for use_softmax=False, continue if (soft_label) { diff --git a/paddle/fluid/operators/squared_l2_norm_op_npu.cc b/paddle/fluid/operators/squared_l2_norm_op_npu.cc new file mode 100644 index 00000000000000..fb4d8fefda7a7f --- /dev/null +++ b/paddle/fluid/operators/squared_l2_norm_op_npu.cc @@ -0,0 +1,99 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
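For the SequenceMask NPU kernel above, the Range -> ExpandD -> TileWithAxis -> Less chain reproduces the usual sequence_mask semantics y[i][j] = (j < x[i]) before the final Cast to out_dtype. A small worked example of what the kernel should produce for x = [2, 4] with maxlen = 4:

    j          :  0  1  2  3
    x[0] = 2   :  1  1  0  0
    x[1] = 4   :  1  1  1  1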
*/ + +#include "paddle/fluid/operators/squared_l2_norm_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class SquaredL2NormNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *out = context.Output("Out"); + + auto place = context.GetPlace(); + auto stream = + context.template device_context() + .stream(); + + std::vector axis; + for (int i = 0; i < x->dims().size(); ++i) { + axis.push_back(i); + } + out->mutable_data(place); + const auto &runner = NpuOpRunner("SquareSumV1", {*x}, {*out}, + {{"axis", axis}, {"keep_dims", false}}); + runner.Run(stream); + } +}; + +template +class SquaredL2NormGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *x = context.Input("X"); + auto *x_grad = context.Output(framework::GradVarName("X")); + auto *out_grad = context.Input(framework::GradVarName("Out")); + + PADDLE_ENFORCE_EQ( + out_grad->numel(), 1, + platform::errors::InvalidArgument( + "Input(GRAD@Out) of SquaredL2NormGradOP should be a scalar.")); + + auto place = context.GetPlace(); + auto stream = + context.template device_context() + .stream(); + + // broadcast out_grad + Tensor broadcasted_out_grad; + broadcasted_out_grad.mutable_data(x_grad->dims(), place); + const auto &broadcast_runner = + NpuOpRunner("BroadcastToD", {*out_grad}, {broadcasted_out_grad}, + {{"shape", framework::vectorize(x_grad->dims())}}); + broadcast_runner.Run(stream); + // mul x + Tensor tmp_x_grad; + tmp_x_grad.mutable_data(x_grad->dims(), place); + const auto &mul_x_runner = + NpuOpRunner("Mul", {broadcasted_out_grad, *x}, {tmp_x_grad}, {}); + mul_x_runner.Run(stream); + // mul coefficient:2 + Tensor coefficient; + coefficient.mutable_data({1}, place); + FillNpuTensorWithConstant(&coefficient, static_cast(2.0)); + x_grad->mutable_data(place); + const auto &mul_coefficient_runner = + NpuOpRunner("Mul", {tmp_x_grad, coefficient}, {*x_grad}, {}); + mul_coefficient_runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_NPU_KERNEL( + squared_l2_norm, + ops::SquaredL2NormNPUKernel); +REGISTER_OP_NPU_KERNEL( + squared_l2_norm_grad, + ops::SquaredL2NormGradNPUKernel); diff --git a/paddle/fluid/operators/tile_op.h b/paddle/fluid/operators/tile_op.h old mode 100755 new mode 100644 index 1fb0fa6ce5176f..260cbc23687313 --- a/paddle/fluid/operators/tile_op.h +++ b/paddle/fluid/operators/tile_op.h @@ -17,40 +17,12 @@ limitations under the License. */ #include #include -#include -#include -#include -#include -#include -#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #define MAX_RANK_SUPPORTED 6 -// 1. BOOST_PP_REPEAT macro represents a fast horizontal repetition construct. -// Usage: BOOST_PP_REPEAT(count, macro, data). -// This macro expands to the sequence: -// macro(z, 0, data) macro(z, 1, data) ... macro(z, count - 1, data). -// 2. As for our case, count = MAX_RANK_SUPPORTED(which is 6). -// So the range of n is 0-5(which is count-1). -// We want to generate case 1-6 instead of case 0-5. -// So we need to change n to n + 1. 
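The SquaredL2NormGrad NPU kernel above is easier to follow with the derivation written out: Out = sum_i x_i^2, so dOut/dx_i = 2 * x_i and dX = 2 * x * dOut, where the scalar dOut is first broadcast to x's shape. That is exactly what the BroadcastToD, Mul, and multiply-by-2 runner calls compute, for example:

    // x = [3, -1]            -> Out = 3*3 + (-1)*(-1) = 10
    // dOut = [1] (scalar)    -> broadcast to [1, 1]
    // tmp  = dOut * x        =  [3, -1]
    // dX   = tmp * 2         =  [6, -2]   ==  2 * x * dOut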
-#define TILE_TEMPLATE(z, n, data) \ - case n + 1: { \ - Tile(context); \ - break; \ - } -#define REP_TILE_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_TEMPLATE, ~) -#define COND(n) BOOST_PP_GREATER_EQUAL(n, BOOST_PP_MOD(n, MAX_RANK_SUPPORTED)) -#define TILE_GRAD_CASE(n) \ - case n + 1: { \ - TileBackward(context, reshape_dims_vec, reduce_dims_vec); \ - break; \ - } -#define TILE_GRAD_TEMPLATE(z, n, data) BOOST_PP_IF(COND(n), TILE_GRAD_CASE(n), ) -#define REP_TILE_GRAD_TEMPLATE(n) BOOST_PP_REPEAT(n, TILE_GRAD_TEMPLATE, ~) namespace paddle { namespace operators { @@ -60,7 +32,8 @@ inline std::vector get_repeat_times( auto* repeat_tensor = ctx.Input("RepeatTimes"); auto* repeat_data = repeat_tensor->data(); framework::Tensor cpu_repeat_tensor; - if (platform::is_gpu_place(repeat_tensor->place())) { + if (platform::is_gpu_place(repeat_tensor->place()) || + platform::is_npu_place(repeat_tensor->place())) { TensorCopySync(*repeat_tensor, platform::CPUPlace(), &cpu_repeat_tensor); repeat_data = cpu_repeat_tensor.data(); } @@ -76,7 +49,8 @@ inline std::vector get_repeat_times( std::vector vec_repeat_times; for (size_t i = 0; i < list_repeat_times_tensor.size(); ++i) { auto tensor = list_repeat_times_tensor[i]; - if (platform::is_gpu_place(tensor->place())) { + if (platform::is_gpu_place(tensor->place()) || + platform::is_npu_place(tensor->place())) { framework::Tensor temp; TensorCopySync(*tensor, platform::CPUPlace(), &temp); vec_repeat_times.push_back(*temp.data()); @@ -130,7 +104,26 @@ class TileKernel : public framework::OpKernel { "must be less than or equal to %d, but the value received is %d.", MAX_RANK_SUPPORTED, repeat_times_size)); rank = std::max(rank, repeat_times_size); - switch (rank) { REP_TILE_TEMPLATE(MAX_RANK_SUPPORTED) } + switch (rank) { + case 1: + Tile<1>(context); + break; + case 2: + Tile<2>(context); + break; + case 3: + Tile<3>(context); + break; + case 4: + Tile<4>(context); + break; + case 5: + Tile<5>(context); + break; + case 6: + Tile<6>(context); + break; + } } protected: @@ -251,7 +244,24 @@ class TileGradKernel : public framework::OpKernel { "to %d, but the value received is %d.", MAX_RANK_SUPPORTED, dims)); switch (dims) { - REP_TILE_GRAD_TEMPLATE(MAX_RANK_SUPPORTED) + case 1: + TileBackward<1>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 2: + TileBackward<2>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 3: + TileBackward<3>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 4: + TileBackward<4>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 5: + TileBackward<5>(context, reshape_dims_vec, reduce_dims_vec); + break; + case 6: + TileBackward<6>(context, reshape_dims_vec, reduce_dims_vec); + break; default: PADDLE_THROW(platform::errors::InvalidArgument( "Only support tensor with rank being between 1 and 6. But " diff --git a/paddle/fluid/operators/tile_op_npu.cc b/paddle/fluid/operators/tile_op_npu.cc new file mode 100644 index 00000000000000..c85a1cbc671af1 --- /dev/null +++ b/paddle/fluid/operators/tile_op_npu.cc @@ -0,0 +1,118 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/tile_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +template +class TileNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto rank = context.Input("X")->dims().size(); + PADDLE_ENFORCE_GE( + rank, 1, platform::errors::InvalidArgument( + "The rank of the input 'x' for tile op must be a positive " + "integer, but the value received is %d.", + rank)); + PADDLE_ENFORCE_LE( + rank, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The rank of the input 'x' for tile op " + "must be less than or equal to %d, but the value received is %d.", + MAX_RANK_SUPPORTED, rank)); + auto repeat_times = get_repeat_times(context); + int repeat_times_size = repeat_times.size(); + PADDLE_ENFORCE_GE( + repeat_times_size, 1, + platform::errors::InvalidArgument( + "The number of elements of the input 'repeat_times' for tile " + "op must be positive, but the value received is %d.", + repeat_times_size)); + PADDLE_ENFORCE_LE( + repeat_times_size, MAX_RANK_SUPPORTED, + platform::errors::InvalidArgument( + "The number of elements of the input 'repeat_times' for tile op " + "must be less than or equal to %d, but the value received is %d.", + MAX_RANK_SUPPORTED, repeat_times_size)); + rank = std::max(rank, repeat_times_size); + Tile(context); + } + + protected: + void Tile(const framework::ExecutionContext& context) const { + auto* in0 = context.Input("X"); + + auto in_dims = in0->dims(); + auto repeat_times = get_repeat_times(context); + for (size_t i = 0; i < repeat_times.size(); ++i) { + PADDLE_ENFORCE_GT( + repeat_times[i], 0, + platform::errors::InvalidArgument( + "All elements of the input 'repeat_times' for tile op must " + "be positive integers, but the value received is %d.", + repeat_times[i])); + } + auto vec_in_dims = framework::vectorize(in_dims); + if (repeat_times.size() < vec_in_dims.size()) { + int diff = vec_in_dims.size() - repeat_times.size(); + repeat_times.insert(repeat_times.begin(), diff, 1); + } else { + int diff = repeat_times.size() - vec_in_dims.size(); + vec_in_dims.insert(vec_in_dims.begin(), diff, 1); + } + PADDLE_ENFORCE_EQ( + repeat_times.size(), vec_in_dims.size(), + platform::errors::InvalidArgument( + "The rank (%d) of the input 'x' and the rank (%d) of the input " + "'repeat_times' for tile op must match after promotion.", + vec_in_dims.size(), repeat_times.size())); + auto* out0 = context.Output("Out"); + + framework::DDim new_in_dims = framework::make_ddim(vec_in_dims); + framework::DDim out_dims(new_in_dims); + + for (size_t i = 0; i < repeat_times.size(); ++i) { + out_dims[i] *= repeat_times[i]; + } + + out0->Resize(out_dims); + out0->mutable_data(context.GetPlace()); + + std::vector temp(repeat_times.size(), 1); + if (repeat_times == temp) { + framework::TensorCopy( + *in0, context.GetPlace(), + context.template device_context(), out0); + return; + } + + const auto& runner = + NpuOpRunner("TileD", {*in0}, {*out0}, {{"multiples", repeat_times}}); + auto stream = + context.template device_context() + 
.stream(); + runner.Run(stream); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL( + tile, ops::TileNPUKernel, + ops::TileNPUKernel, + ops::TileNPUKernel); diff --git a/paddle/fluid/operators/top_k_v2_op_npu.cc b/paddle/fluid/operators/top_k_v2_op_npu.cc new file mode 100755 index 00000000000000..e536055013fb88 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op_npu.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/top_k_v2_op.h" +#include +#include +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { +// NOTE(Ruibiao): the Ascend TopKV2 operator used in this kernel +// may lead to large accuracy error for float32 data +template +class TopkV2NPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* input = context.Input("X"); + auto* k_tensor = context.Input("K"); + auto* out = context.Output("Out"); + auto* indices = context.Output("Indices"); // type: INT64 + + int32_t k = static_cast(context.Attr("k")); + int axis = static_cast(context.Attr("axis")); + const bool sorted = static_cast(context.Attr("sorted")); + const bool largest = static_cast(context.Attr("largest")); + + if (axis < 0) { + axis += input->dims().size(); + } + + if (k_tensor != nullptr) { + std::vector v_tmp(1); + TensorToVector( + *k_tensor, + context.template device_context(), + &v_tmp); + k = static_cast(v_tmp[0]); + } + + framework::DDim output_dims = input->dims(); + output_dims[axis] = k; + + out->Resize(output_dims); + indices->Resize(output_dims); + + out->mutable_data(context.GetPlace()); + indices->mutable_data(context.GetPlace()); + + framework::Tensor indices_int32(framework::proto::VarType::INT32); + indices_int32.Resize(output_dims); + indices_int32.mutable_data(context.GetPlace()); + + auto npu_stream = + context.template device_context() + .stream(); + + NpuOpRunner npu_op_runner_topkv2; + npu_op_runner_topkv2.SetType("TopKV2") + .AddInput(*input) + .AddInput(std::vector{k}) + .AddOutput(*out) + .AddOutput(indices_int32) + .AddAttr("sorted", sorted) + .AddAttr("dim", axis) + .AddAttr("largest", largest) + .Run(npu_stream); + + // Cast 'indices_int32' to 'indices', from INT32 to INT64 + auto dst_dtype = ConvertToNpuDtype(indices->type()); + const auto& npu_op_runner_cast = + NpuOpRunner("Cast", {indices_int32}, {*indices}, + {{"dst_type", static_cast(dst_dtype)}}); + npu_op_runner_cast.Run(npu_stream); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_NPU_KERNEL(top_k_v2, ops::TopkV2NPUKernel, + ops::TopkV2NPUKernel, + ops::TopkV2NPUKernel, + ops::TopkV2NPUKernel); diff --git a/paddle/fluid/operators/unfold_op.cc b/paddle/fluid/operators/unfold_op.cc index 0a36b6ef840887..5c0eb64993b556 100644 --- a/paddle/fluid/operators/unfold_op.cc +++ 
b/paddle/fluid/operators/unfold_op.cc @@ -107,6 +107,42 @@ class UnfoldOp : public framework::OperatorWithKernel { "But recieved dims(strides: %u) != dims(dilations: %u).", strides.size(), dilations.size())); + // check kernel_sizes + PADDLE_ENFORCE_GT(kernel_sizes[0], 0, + platform::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], kernel_sizes[1])); + PADDLE_ENFORCE_GT(kernel_sizes[1], 0, + platform::errors::InvalidArgument( + "The `kernel_sizes` should be greater than zero, " + "but recieved kernel_height: %d kernel_width: %d.", + kernel_sizes[0], kernel_sizes[1])); + // check strides + PADDLE_ENFORCE_GT(strides[0], 0, + platform::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], strides[1])); + PADDLE_ENFORCE_GT(strides[1], 0, + platform::errors::InvalidArgument( + "The `strides` should be greater than zero, " + "but recieved strides_height: %d strides_width: %d.", + strides[0], strides[1])); + // check dilations + PADDLE_ENFORCE_GT( + dilations[0], 0, + platform::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], dilations[1])); + PADDLE_ENFORCE_GT( + dilations[1], 0, + platform::errors::InvalidArgument( + "The `dilations` should be greater than zero, " + "but recieved dilations_height: %d dilations_width: %d.", + dilations[0], dilations[1])); + std::vector out_dims; out_dims.push_back(in_dims[0]); diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index e90eefd72d4ce2..f5b51da3d85831 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -199,6 +199,27 @@ class WarpCTCKernel : public framework::OpKernel { sequence_width = logits->dims()[2]; max_sequence_length = logits->dims()[0]; + PADDLE_ENFORCE_GT(max_sequence_length, 0, + platform::errors::InvalidArgument( + "The first dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + max_sequence_length)); + + PADDLE_ENFORCE_GT(num_sequences, 0, + platform::errors::InvalidArgument( + "The second dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + num_sequences)); + + PADDLE_ENFORCE_GT(sequence_width, 0, + platform::errors::InvalidArgument( + "The third dimension of Input(Logits) should be " + "greater than zero " + "but received %d. ", + sequence_width)); + auto* logits_length = ctx.Input("LogitsLength"); auto* labels_length = ctx.Input("LabelLength"); framework::Tensor logits_length_cpu; @@ -229,6 +250,13 @@ class WarpCTCKernel : public framework::OpKernel { logits_lod = framework::ToAbsOffset(logits->lod())[0]; auto logits_dims = logits->dims(); + PADDLE_ENFORCE_GT(logits_dims[0], 0, + platform::errors::InvalidArgument( + "The first dimension of Input(Logits) should be " + "greater than zero " + "but received %d. 
", + logits_dims[0])); + PADDLE_ENFORCE_EQ( logits_dims[0], static_cast(logits_lod.back()), platform::errors::InvalidArgument( diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index efd25bc8929409..97c81568e673e8 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -70,7 +70,7 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags) if(WITH_XPU) cc_library(xpu_info SRCS xpu/xpu_info.cc DEPS gflags glog enforce xpulib) -cc_library(xpu_op_list SRCS xpu/xpu_op_list.cc DEPS gflags glog enforce xpulib) +cc_library(xpu_op_list SRCS xpu/xpu_op_list.cc DEPS gflags glog enforce xpulib device_context) endif() if(WITH_ASCEND) diff --git a/paddle/fluid/platform/cudnn_workspace_helper.cc b/paddle/fluid/platform/cudnn_workspace_helper.cc index c4e71c86f9e750..bb0e9a226d1500 100644 --- a/paddle/fluid/platform/cudnn_workspace_helper.cc +++ b/paddle/fluid/platform/cudnn_workspace_helper.cc @@ -15,13 +15,14 @@ #include "paddle/fluid/platform/cudnn_workspace_helper.h" #include -#include "boost/lexical_cast.hpp" +#include + namespace paddle { namespace platform { static int GetDefaultConvWorkspaceSizeLimitMBImpl() { const char *env_str = std::getenv("FLAGS_conv_workspace_size_limit"); - return env_str ? boost::lexical_cast(std::string(env_str)) + return env_str ? std::stoi(std::string(env_str)) : kDefaultConvWorkspaceSizeLimitMB; } diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index 93fc56ab203b60..a79ab22743d166 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -120,6 +120,7 @@ class MemEvent { class CudaEvent { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + public: CudaEvent() { #ifdef PADDLE_WITH_HIP @@ -129,7 +130,7 @@ class CudaEvent { #endif } - CudaEvent(unsigned int flags) : flags_(flags) { + explicit CudaEvent(unsigned int flags) : flags_(flags) { #ifdef PADDLE_WITH_HIP hipEventCreateWithFlags(&event_, flags_); #else @@ -137,7 +138,15 @@ class CudaEvent { #endif } - void Record(paddle::platform::stream::CUDAStream& stream) { + ~CudaEvent() { +#ifdef PADDLE_WITH_HIP + hipEventDestroy(event_); +#else + cudaEventDestroy(event_); +#endif + } + + void Record(const paddle::platform::stream::CUDAStream& stream) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event_, stream.raw_stream())); #else diff --git a/paddle/fluid/platform/fast_divmod.h b/paddle/fluid/platform/fast_divmod.h index c6c22bb2f9203b..02f9d5441281c1 100644 --- a/paddle/fluid/platform/fast_divmod.h +++ b/paddle/fluid/platform/fast_divmod.h @@ -20,7 +20,7 @@ limitations under the License. */ #define INT_BITS 32 namespace paddle { -namespace operators { +namespace platform { template struct alignas(sizeof(T) * Size) CudaAlignedVector { @@ -65,5 +65,39 @@ struct FastDivMod { uint32_t multiplier; }; -} // namespace operators +/* +* Only the address of input data is the multiplier of 1,2,4, vectorized load +* with corresponding multiplier-value is possible. Moreover, the maximum length +* of vectorized load is 128 bits once. Hence, valid length of vectorized load +* shall be determined under both former constraints. 
+*/ +template +int GetVectorizedSize(const T *pointer) { + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + uint64_t address = reinterpret_cast(pointer); + constexpr int vec8 = + std::alignment_of>::value; // NOLINT + constexpr int vec4 = + std::alignment_of>::value; // NOLINT + constexpr int vec2 = + std::alignment_of>::value; // NOLINT + if (address % vec8 == 0) { + /* + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ + return std::min(4, valid_vec_size); + } else if (address % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +} // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/xpu/xpu1_op_list.h b/paddle/fluid/platform/xpu/xpu1_op_list.h index 131525718cac75..cdd60a856fbc90 100644 --- a/paddle/fluid/platform/xpu/xpu1_op_list.h +++ b/paddle/fluid/platform/xpu/xpu1_op_list.h @@ -55,25 +55,51 @@ XPUOpMap& get_kl1_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"affine_channel_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"assign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace())})}, {"batch_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"batch_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"cast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"clip_by_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"coalesce_tensor", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"c_reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"c_allreduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"broadcast", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"concat", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"concat_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalor", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicaland", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"logicalnot", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalor", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicaland", 
XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"logicalnot", XPUKernelSet({pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT16, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"depthwise_conv2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"depthwise_conv2d_grad", @@ -116,7 +142,11 @@ XPUOpMap& get_kl1_ops() { {"elementwise_min_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"fill_constant", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"gather", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gather_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"gaussian_random", @@ -140,7 +170,11 @@ XPUOpMap& get_kl1_ops() { {"layer_norm", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"layer_norm_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"load", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"load", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_loss", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"log_loss_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -158,15 +192,20 @@ XPUOpMap& get_kl1_ops() { {"accuracy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"mul_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"one_hot", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, + {"one_hot_v2", XPUKernelSet({pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace())})}, {"sgd", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"adam", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"lamb", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"pool2d", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"pool2d_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"range", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"range", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_sum_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -175,30 +214,67 @@ XPUOpMap& get_kl1_ops() { {"reduce_max", 
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"reduce_max_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"reshape2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"reshape2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"roi_align_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"scale", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"shape", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"shape", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"sign", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"slice", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace())})}, {"slice_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"softmax_with_cross_entropy", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"squeeze_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"squeeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"squeeze2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + 
pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"stack", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"sum", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"top_k", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, @@ -212,12 +288,36 @@ XPUOpMap& get_kl1_ops() { XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"uniform_random", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"unsqueeze_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, - {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, + {"unsqueeze2", XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"unsqueeze2_grad", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()), + pOpKernelType(vartype::INT64, XPUPlace()), + pOpKernelType(vartype::INT32, XPUPlace()), + pOpKernelType(vartype::BOOL, XPUPlace()), + pOpKernelType(vartype::INT8, XPUPlace()), + pOpKernelType(vartype::UINT8, XPUPlace()), + pOpKernelType(vartype::FP32, XPUPlace())})}, {"momuntem", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})} // AddMore }; diff --git a/paddle/fluid/platform/xpu/xpu_op_list.cc b/paddle/fluid/platform/xpu/xpu_op_list.cc index b3349407942bd1..0c10436f397898 100644 --- a/paddle/fluid/platform/xpu/xpu_op_list.cc +++ b/paddle/fluid/platform/xpu/xpu_op_list.cc @@ -9,7 +9,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #ifdef PADDLE_WITH_XPU +#include #include +#include #include "paddle/fluid/platform/xpu/xpu1_op_list.h" #include "paddle/fluid/platform/xpu/xpu2_op_list.h" @@ -19,7 +21,7 @@ limitations under the License. 
*/ namespace paddle { namespace platform { -bool is_xpu_support_op(std::string op_name, const pOpKernelType& type) { +bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type) { auto& ops = get_kl1_ops(); auto v = get_xpu_version(BOOST_GET_CONST(platform::XPUPlace, type.place_).device); @@ -34,6 +36,45 @@ bool is_xpu_support_op(std::string op_name, const pOpKernelType& type) { return false; } +// ops_string contains op_list(e.g., 'mul,mul_grad'), parse the op string and +// insert op to op set +static void tokenize(const std::string& ops, char delim, + std::unordered_set* op_set) { + std::string::size_type beg = 0; + for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos; + ++end) { + op_set->insert(ops.substr(beg, end - beg)); + beg = end + 1; + } + + op_set->insert(ops.substr(beg)); +} + +bool is_in_xpu_black_list(const std::string& op_name) { + static bool inited = false; + static std::unordered_set xpu_black_list; + static std::mutex s_mtx; + if (!inited) { + std::lock_guard guard(s_mtx); + if (!inited) { + if (std::getenv("XPU_BLACK_LIST") != nullptr) { + std::string ops(std::getenv("XPU_BLACK_LIST")); + tokenize(ops, ',', &xpu_black_list); + } + inited = true; + VLOG(3) << "XPU Black List: "; + for (auto iter = xpu_black_list.begin(); iter != xpu_black_list.end(); + ++iter) { + VLOG(3) << *iter << " "; + } + } + } + if (xpu_black_list.find(op_name) != xpu_black_list.end()) { + return true; + } + return false; +} + } // namespace platform } // namespace paddle #endif diff --git a/paddle/fluid/platform/xpu/xpu_op_list.h b/paddle/fluid/platform/xpu/xpu_op_list.h index 487bc8ac48b66f..705f701e13634a 100644 --- a/paddle/fluid/platform/xpu/xpu_op_list.h +++ b/paddle/fluid/platform/xpu/xpu_op_list.h @@ -20,7 +20,8 @@ namespace platform { using pOpKernelType = paddle::framework::OpKernelType; -bool is_xpu_support_op(std::string op_name, const pOpKernelType& type); +bool is_xpu_support_op(const std::string& op_name, const pOpKernelType& type); +bool is_in_xpu_black_list(const std::string& op_name); } // namespace platform } // namespace paddle diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index f362808a4b9528..b8774f429632e2 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,7 @@ # Adapt to custom op mechanism: Include the header files related to the data type # to avoid exposing the path of the underlying file include_directories(${PADDLE_SOURCE_DIR}/paddle/fluid/platform) +include_directories(${PADDLE_SOURCE_DIR}/paddle/utils) set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass pass_builder parallel_executor profiler layer tracer engine scope_pool @@ -124,23 +125,20 @@ if(WITH_PYTHON) set(impl_file ${CMAKE_SOURCE_DIR}/paddle/fluid/pybind/op_function_impl.h) set(tmp_impl_file ${impl_file}.tmp) + set(OP_IMPL_DEPS op_function_generator) if(WIN32) if("${CMAKE_GENERATOR}" STREQUAL "Ninja") - set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}") + set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}") else() - set(op_function_generator_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") + set(op_impl_path "${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}") endif() - file(TO_NATIVE_PATH ${op_function_generator_path} op_function_generator_path) - file(TO_NATIVE_PATH ${impl_file} impl_file) - file(TO_NATIVE_PATH ${tmp_impl_file} tmp_impl_file) file(WRITE 
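The new is_in_xpu_black_list helper above parses the XPU_BLACK_LIST environment variable once (a comma-separated op list such as mul,mul_grad) into a cached set. A minimal usage sketch; how the framework reacts to a black-listed op is an assumption here, since the patch only adds the query:

    import os

    # Hypothetical op names; set the variable before the first query, because the
    # parsed set is cached after the first call.
    os.environ["XPU_BLACK_LIST"] = "concat,pool2d"

    import paddle
    paddle.set_device("xpu")  # XPU build assumed; black-listed ops are expected to
                              # fall back to their CPU kernels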
${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat "" "set build_times=1\n" ":retry\n" "ECHO op_function_generator run %build_times% time\n" - "if exist ${tmp_impl_file} del ${tmp_impl_file}\n" "taskkill /f /im op_function_generator.exe 2>NUL\n" - "${op_function_generator_path}\\op_function_generator.exe ${tmp_impl_file}\n" + "${op_impl_path}/op_function_generator.exe ${tmp_impl_file}\n" "if %ERRORLEVEL% NEQ 0 (\n" " set /a build_times=%build_times%+1\n" " if %build_times% GEQ 10 (\n" @@ -151,63 +149,61 @@ if(WITH_PYTHON) ")\n" "exit /b 0") - add_custom_command(TARGET op_function_generator POST_BUILD - COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat - COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} - COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" - ) - if(${CBLAS_PROVIDER} STREQUAL MKLML) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_LIB} ${op_function_generator_path} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_function_generator_path} - ) + ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/libiomp5md.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${op_impl_path} + DEPENDS mklml) + list(APPEND OP_IMPL_DEPS ${op_impl_path}/libiomp5md.dll) else(${CBLAS_PROVIDER} STREQUAL EXTERN_OPENBLAS) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_function_generator_path} - ) + ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/openblas.dll + COMMAND ${CMAKE_COMMAND} -E copy ${OPENBLAS_SHARED_LIB} ${op_impl_path} + DEPENDS extern_openblas) + list(APPEND OP_IMPL_DEPS ${op_impl_path}/openblas.dll) endif() if(WITH_MKLDNN) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_function_generator_path} - ) + ADD_CUSTOM_COMMAND(OUTPUT ${op_impl_path}/mkldnn.dll + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${op_impl_path} + DEPENDS mkldnn) + list(APPEND OP_IMPL_DEPS ${op_impl_path}/mkldnn.dll) endif() + + add_custom_command(OUTPUT ${impl_file} + COMMAND ${CMAKE_BINARY_DIR}/paddle/fluid/pybind/op_function_generator_retry.bat + COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} + COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" + DEPENDS ${OP_IMPL_DEPS}) else(WIN32) # If there are no *.so in /usr/lib or LD_LIBRARY_PATH, # copy these *.so to current directory and append current directory to # LD_LIBRARY_PATH. This is different with Windows platformm, which search # *.dll in current directory automatically. - add_custom_command(TARGET op_function_generator - POST_BUILD + if(WITH_MKLML) + ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so + COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS mklml) + list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libiomp5.so) + endif() + if(WITH_MKLDNN) + ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0 + COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS mkldnn) + list(APPEND OP_IMPL_DEPS ${CMAKE_CURRENT_BINARY_DIR}/libdnnl.so.0) + endif() + add_custom_command(OUTPUT ${impl_file} COMMAND ${CMAKE_COMMAND} -E env "LD_LIBRARY_PATH=$ENV{LD_LIBRARY_PATH}:." 
"${CMAKE_CURRENT_BINARY_DIR}/op_function_generator" "${tmp_impl_file}" COMMAND ${CMAKE_COMMAND} -E copy_if_different ${tmp_impl_file} ${impl_file} COMMENT "copy_if_different ${tmp_impl_file} to ${impl_file}" - VERBATIM - ) - if(WITH_MKL) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} - COMMAND ${CMAKE_COMMAND} -E copy ${MKLML_SHARED_IOMP_LIB} ${CMAKE_CURRENT_BINARY_DIR} - ) - endif(WITH_MKL) - if(WITH_MKLDNN) - add_custom_command(TARGET op_function_generator - PRE_LINK - COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_SHARED_LIB} ${CMAKE_CURRENT_BINARY_DIR} - ) - endif(WITH_MKLDNN) + DEPENDS ${OP_IMPL_DEPS} + VERBATIM) endif(WIN32) + add_custom_target(op_function_generator_cmd ALL DEPENDS ${impl_file}) cc_library(paddle_pybind SHARED - SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + SRCS ${PYBIND_SRCS} + DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + if(NOT APPLE AND NOT WIN32) target_link_libraries(paddle_pybind rt) endif(NOT APPLE AND NOT WIN32) @@ -218,5 +214,5 @@ if(WITH_PYTHON) get_property (os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) target_link_libraries(paddle_pybind ${os_dependency_modules}) - add_dependencies(paddle_pybind op_function_generator) + add_dependencies(paddle_pybind op_function_generator_cmd) endif(WITH_PYTHON) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index b58e9050402bb7..040ae26213f5f5 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -42,6 +42,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_rank_table.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/new_exec.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" @@ -1631,7 +1632,13 @@ All parameter, weight, gradient are variables in Paddle. .def("__repr__", string::to_string) .def("__str__", string::to_string); #ifdef PADDLE_WITH_XPU + py::enum_(m, "XPUVersion", py::arithmetic()) + .value("XPU1", platform::XPUVersion::XPU1) + .value("XPU2", platform::XPUVersion::XPU2) + .export_values(); m.def("get_xpu_device_count", platform::GetXPUDeviceCount); + m.def("get_xpu_device_version", + [](int device_id) { return platform::get_xpu_version(device_id); }); #endif py::class_(m, "CPUPlace", R"DOC( @@ -1935,6 +1942,34 @@ All parameter, weight, gradient are variables in Paddle. 
fetch_vars); }); + py::class_(m, "InterpreterCore") + .def(py::init()) + .def("run", + [](InterpreterCore &self, + const std::unordered_map &input_dict, + std::vector vec_fetch_name) { + pybind11::gil_scoped_release release; + std::vector vec_tensor; + std::vector vec_name; + + for (auto &item : input_dict) { + framework::LoDTensor t; + SetTensorFromPyArray( + &t, item.second, platform::CPUPlace(), false); + vec_name.push_back(item.first); + vec_tensor.push_back(t); + } + + std::vector vec_out; + self.run(vec_name, vec_tensor, vec_fetch_name, &vec_out); + std::vector vec_ret; + for (size_t i = 0; i < vec_out.size(); ++i) { + vec_ret.push_back(TensorToPyArray(vec_out[i], true)); + } + return vec_ret; + }); + m.def("init_gflags", framework::InitGflags); m.def("init_glog", framework::InitGLOG); m.def("load_op_meta_info_and_register_op", diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index a465f5909a7c6e..9667e18bc6a1e3 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -1,7 +1,8 @@ cc_library(stringpiece SRCS piece.cc DEPS flags) cc_library(pretty_log SRCS pretty_log.cc DEPS flags) -cc_library(string_helper SRCS string_helper.cc DEPS boost flags) +cc_library(string_helper SRCS string_helper.cc DEPS flags) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) cc_test(split_test SRCS split_test.cc) +cc_test(string_helper_test SRCS string_helper_test.cc DEPS string_helper) diff --git a/paddle/fluid/string/string_helper.cc b/paddle/fluid/string/string_helper.cc index 8731e8fca8a5c4..141ac2ba47c5b9 100644 --- a/paddle/fluid/string/string_helper.cc +++ b/paddle/fluid/string/string_helper.cc @@ -88,6 +88,11 @@ inline int str_to_float(const char* str, float* v) { return index; } +bool ends_with(std::string const& input, std::string const& test) { + if (test.size() > input.size()) return false; + return std::equal(test.rbegin(), test.rend(), input.rbegin()); +} + // A helper class for reading lines from file. // A line buffer is maintained. It // doesn't need to know the maximum possible length of a line. @@ -100,7 +105,7 @@ char* LineFileReader::getdelim(FILE* f, char delim) { _buffer[--ret] = 0; } - _length = (size_t)ret; + _length = static_cast(ret); return _buffer; } else { _length = 0; diff --git a/paddle/fluid/string/string_helper.h b/paddle/fluid/string/string_helper.h index f7387e877af2cd..37b713766dd558 100644 --- a/paddle/fluid/string/string_helper.h +++ b/paddle/fluid/string/string_helper.h @@ -21,7 +21,6 @@ #include #include -#include "boost/lexical_cast.hpp" #include "glog/logging.h" namespace paddle { @@ -38,6 +37,7 @@ void format_string_append(std::string& str, const char* fmt, // NOLINT CHECK_GE(len, 0); size_t oldlen = str.length(); str.resize(oldlen + len + 1); + CHECK(snprintf(&str[oldlen], (size_t)len + 1, fmt, args...) == // NOLINT len); str.resize(oldlen + len); @@ -69,6 +69,9 @@ std::string erase_spaces(const std::string& str); int str_to_float(const char* str, float* v); +// checks whether the test string is a suffix of the input string. 
+bool ends_with(std::string const& input, std::string const& test); + // split string by delim template std::vector split_string(const std::string& str, const std::string& delim) { @@ -134,7 +137,9 @@ std::string join_strings(const Container& strs, char delim) { str += delim; } - str += boost::lexical_cast(elem); + std::stringstream ss; + ss << elem; + str += ss.str(); ++i; } @@ -151,7 +156,9 @@ std::string join_strings(const Container& strs, const std::string& delim) { str += delim; } - str += boost::lexical_cast(elem); + std::stringstream ss; + ss << elem; + str += ss.str(); ++i; } diff --git a/paddle/fluid/string/string_helper_test.cc b/paddle/fluid/string/string_helper_test.cc new file mode 100644 index 00000000000000..4796bf7507aba7 --- /dev/null +++ b/paddle/fluid/string/string_helper_test.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/string/string_helper.h" + +#include + +#include "gtest/gtest.h" + +TEST(StringHelper, EndsWith) { + std::string input("hello world"); + std::string test1("world"); + std::string test2("helloworld"); + std::string test3("hello world hello world"); + + EXPECT_TRUE(paddle::string::ends_with(input, test1)); + EXPECT_TRUE(paddle::string::ends_with(input, input)); + + EXPECT_FALSE(paddle::string::ends_with(input, test2)); + EXPECT_FALSE(paddle::string::ends_with(input, test3)); +} + +TEST(StringHelper, FormatStringAppend) { + std::string str("hello"); + char fmt[] = "%d"; + + paddle::string::format_string_append(str, fmt, 10); + EXPECT_EQ(str, "hello10"); +} + +TEST(StringHelper, JoinStrings) { + std::vector v; + v.push_back("hello"); + v.push_back("world"); + + std::string result = paddle::string::join_strings(v, ' '); + EXPECT_EQ(result, "hello world"); + + result = paddle::string::join_strings(v, '\n'); + EXPECT_EQ(result, "hello\nworld"); + + result = paddle::string::join_strings(v, ','); + EXPECT_EQ(result, "hello,world"); + + result = paddle::string::join_strings(v, " new "); + EXPECT_EQ(result, "hello new world"); +} diff --git a/paddle/utils/any.h b/paddle/utils/any.h new file mode 100644 index 00000000000000..ec803647c11f7e --- /dev/null +++ b/paddle/utils/any.h @@ -0,0 +1,232 @@ +//This file copy from boost/any.hpp and boost version: 1.41.0 +//Modified the following points: +//1. modify namespace from boost::any to paddle::any +//2. remove the depending boost header files +//3. remove/modify some macro + +// See http://www.boost.org/libs/any for Documentation. 
+ +#ifndef PADDLE_ANY_INCLUDED +#define PADDLE_ANY_INCLUDED + +// what: variant type boost::any +// who: contributed by Kevlin Henney, +// with features contributed and bugs found by +// Ed Brey, Mark Rodgers, Peter Dimov, and James Curran +// when: July 2001 +// where: tested with BCC 5.5, MSVC 6.0, and g++ 2.95 + +#include +#include +#include + +// See boost/python/type_id.hpp +// TODO: add BOOST_TYPEID_COMPARE_BY_NAME to config.hpp +# if (defined(__GNUC__) && __GNUC__ >= 3) \ + || defined(_AIX) \ + || ( defined(__sgi) && defined(__host_mips)) \ + || (defined(__hpux) && defined(__HP_aCC)) \ + || (defined(linux) && defined(__INTEL_COMPILER) && defined(__ICC)) +# define BOOST_AUX_ANY_TYPE_ID_NAME +#include +# endif + +namespace paddle +{ + class any + { + public: // structors + + any() + : content(0) + { + } + + template + any(const ValueType & value) + : content(new holder(value)) + { + } + + any(const any & other) + : content(other.content ? other.content->clone() : 0) + { + } + + ~any() + { + delete content; + } + + public: // modifiers + + any & swap(any & rhs) + { + std::swap(content, rhs.content); + return *this; + } + + template + any & operator=(const ValueType & rhs) + { + any(rhs).swap(*this); + return *this; + } + + any & operator=(any rhs) + { + rhs.swap(*this); + return *this; + } + + public: // queries + + bool empty() const + { + return !content; + } + + const std::type_info & type() const + { + return content ? content->type() : typeid(void); + } + + public: // types (public so any_cast can be non-friend) + + class placeholder + { + public: // structors + + virtual ~placeholder() + { + } + + public: // queries + + virtual const std::type_info & type() const = 0; + + virtual placeholder * clone() const = 0; + + }; + + template + class holder : public placeholder + { + public: // structors + + holder(const ValueType & value) + : held(value) + { + } + + public: // queries + + virtual const std::type_info & type() const + { + return typeid(ValueType); + } + + virtual placeholder * clone() const + { + return new holder(held); + } + + public: // representation + + ValueType held; + + private: // intentionally left unimplemented + holder & operator=(const holder &); + }; + + public: // representation (public so any_cast can be non-friend) + + placeholder * content; + + }; + + class bad_any_cast : public std::bad_cast + { + public: + virtual const char * what() const throw() + { + return "paddle::bad_any_cast: " + "failed conversion using paddle::any_cast"; + } + }; + + template + ValueType * any_cast(any * operand) + { + return operand && +#ifdef BOOST_AUX_ANY_TYPE_ID_NAME + std::strcmp(operand->type().name(), typeid(ValueType).name()) == 0 +#else + operand->type() == typeid(ValueType) +#endif + ? &static_cast *>(operand->content)->held + : 0; + } + + template + inline const ValueType * any_cast(const any * operand) + { + return any_cast(const_cast(operand)); + } + + template + ValueType any_cast(any & operand) + { + typedef typename std::remove_reference::type nonref; + + // If 'nonref' is still reference type, it means the user has not + // specialized 'remove_reference'. 
+ + // Please use BOOST_BROKEN_COMPILER_TYPE_TRAITS_SPECIALIZATION macro + // to generate specialization of remove_reference for your class + // See type traits library documentation for details + static_assert(!std::is_reference::value, "!std::is_reference::value"); + + nonref * result = any_cast(&operand); + if(!result) + throw bad_any_cast(); + return *result; + } + + template + inline ValueType any_cast(const any & operand) + { + typedef typename std::remove_reference::type nonref; + + // The comment in the above version of 'any_cast' explains when this + // assert is fired and what to do. + static_assert(!std::is_reference::value, "!std::is_reference::value"); + + return any_cast(const_cast(operand)); + } + + // Note: The "unsafe" versions of any_cast are not part of the + // public interface and may be removed at any time. They are + // required where we know what type is stored in the any and can't + // use typeid() comparison, e.g., when our types may travel across + // different shared libraries. + template + inline ValueType * unsafe_any_cast(any * operand) + { + return &static_cast *>(operand->content)->held; + } + + template + inline const ValueType * unsafe_any_cast(const any * operand) + { + return unsafe_any_cast(const_cast(operand)); + } +} + +// Copyright Kevlin Henney, 2000, 2001, 2002. All rights reserved. +// +// Distributed under the Boost Software License, Version 1.0. (See +// accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) + +#endif diff --git a/python/paddle/autograd/backward_mode.py b/python/paddle/autograd/backward_mode.py index 6efbe777d537ca..36ca048c51210f 100644 --- a/python/paddle/autograd/backward_mode.py +++ b/python/paddle/autograd/backward_mode.py @@ -14,6 +14,7 @@ from paddle.fluid import core from paddle.fluid import framework +from paddle.fluid.backward import gradients_with_optimizer import paddle __all__ = [] diff --git a/python/paddle/distributed/elastic.py b/python/paddle/distributed/elastic.py index 3e4fea5e6f34d7..e6f21f6603d8da 100644 --- a/python/paddle/distributed/elastic.py +++ b/python/paddle/distributed/elastic.py @@ -37,6 +37,9 @@ def scale_np(self, np): return True return False + def clean(self): + self.etcd.delete_prefix(self.prefix) + def close(self): self.etcd.close() @@ -53,13 +56,6 @@ def close(self): args = parser.parse_args() server = args.elastic_server or os.getenv('PADDLE_ELASTIC_SERVER') - # compatible with kuberntes service discovery - if not server and os.getenv( - 'PADDLE_ELASTIC_ETCD_SERVICE_HOST') and os.getenv( - 'PADDLE_ELASTIC_ETCD_SERVICE_PORT'): - server = '{}:{}'.format( - os.getenv('PADDLE_ELASTIC_ETCD_SERVICE_HOST'), - os.getenv('PADDLE_ELASTIC_ETCD_SERVICE_PORT')) name = args.job_id or os.getenv('PADDLE_ELASTIC_JOB_ID') np = args.np or int(os.getenv('PADDLE_ELASTIC_NP', 0)) @@ -69,6 +65,9 @@ def close(self): if args.action == "scale": cmd.scale_np(np) + if args.action == "clean": + cmd.clean() + print("action {} done".format(args.action)) cmd.close() diff --git a/python/paddle/distributed/fleet/elastic/__init__.py b/python/paddle/distributed/fleet/elastic/__init__.py new file mode 100644 index 00000000000000..1ac81729d5430a --- /dev/null +++ b/python/paddle/distributed/fleet/elastic/__init__.py @@ -0,0 +1,74 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import signal +import os, sys + +from .manager import ElasticManager +from .manager import ElasticStatus +from .manager import ELASTIC_EXIT_CODE +from .collective import CollectiveLauncher + +from paddle.distributed.fleet.launch_utils import DistributeMode + + +def enable_elastic(args, distribute_mode): + if distribute_mode != DistributeMode.COLLECTIVE: + return False + + if not args.elastic_server and not os.getenv('PADDLE_ELASTIC_SERVER'): + return False + + if not args.job_id and not os.getenv('PADDLE_ELASTIC_JOB_ID'): + return False + + if not args.np and not int(os.getenv('PADDLE_ELASTIC_NP', 0)): + return False + + return True + + +def launch_elastic(args, distribute_mode): + + elastic = ElasticManager(args) + + signal.signal(signal.SIGTERM, elastic.signal_handler) + signal.signal(signal.SIGABRT, elastic.signal_handler) + signal.signal(signal.SIGINT, elastic.signal_handler) + + while True: + + # wait for all nodes ready to run + elastic.wait() + + # run self with specified launcher + elastic.run(CollectiveLauncher) + + # keep wathing the health status of self and being notified for other's failure + ret = elastic.watch() + if ret == ElasticStatus.COMPLETED: + break + if ret == ElasticStatus.HOLD: + continue + if ret == ElasticStatus.EXIT: + break + if ret == ElasticStatus.ERROR: + sys.exit(3) + if ret == ElasticStatus.RESTART: + sys.exit(ELASTIC_EXIT_CODE) + + if int(elastic.sigint) > 0: + sys.exit(128 + int(elastic.sigint)) + else: + sys.exit(0) diff --git a/python/paddle/distributed/fleet/elastic/collective.py b/python/paddle/distributed/fleet/elastic/collective.py new file mode 100644 index 00000000000000..94fe6a54b5809b --- /dev/null +++ b/python/paddle/distributed/fleet/elastic/collective.py @@ -0,0 +1,93 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.distributed.fleet import launch_utils +import paddle.distributed.fleet.cloud_utils as cloud_utils +import paddle.distributed.fleet.ascend_utils as ascend_utils + +from paddle.distributed.fleet.launch_utils import * + +from paddle.distributed.fleet.elastic.manager import LauncherInterface + + +class CollectiveLauncher(LauncherInterface): + def __init__(self, args): + self.args = args + self.procs = [] + + def launch(self): + logger.info("collective lauchner launch ...") + args = self.args + # parse arguments, used for cloud-single-machine and local + (device_mode, + devices_per_proc) = launch_utils.get_device_proc_info(args) + trainers_num = cloud_utils.get_trainers_num() + logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}". 
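enable_elastic above only switches a collective job into elastic mode when a server endpoint, a job id and a target np are all supplied, either as command-line flags or as environment variables. A minimal sketch of the environment; the concrete values are placeholders, not taken from the patch:

    import os

    os.environ["PADDLE_ELASTIC_SERVER"] = "127.0.0.1:2379"  # etcd endpoint, also --elastic_server
    os.environ["PADDLE_ELASTIC_JOB_ID"] = "job-0"           # also --job_id
    os.environ["PADDLE_ELASTIC_NP"] = "2"                    # also --np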
+ format(trainers_num, device_mode, devices_per_proc)) + + cluster = None + pod = None + + start_port = 6170 + if os.environ.get('FLAGS_START_PORT') is not None: + start_port = os.environ.get('FLAGS_START_PORT') + if cloud_utils.use_paddlecloud() and trainers_num != 1: + cluster, pod = cloud_utils.get_cloud_cluster( + args.ips, device_mode, devices_per_proc, start_port) + logger.debug("get cluster from cloud:{}".format(cluster)) + elif device_mode == DeviceMode.ASCEND_NPU: + # for ascend + cluster, pod = ascend_utils.get_cloud_cluster( + rank_table_file=os.getenv("RANK_TABLE_FILE", None), + device_mode=device_mode, + start_port=start_port) + else: + # trainers_num = 1 or not use paddlecloud ips="a,b" + cluster, pod = paddle.distributed.fleet.launch.get_cluster_from_args( + args, device_mode, devices_per_proc) + logger.debug("get cluster from args:{}".format(cluster)) + + global_envs = copy.copy(os.environ.copy()) + self.gloo_rendezvous_dir = tempfile.mkdtemp() + # add gloo env + global_envs["PADDLE_WITH_GLOO"] = str( + os.getenv("PADDLE_WITH_GLOO", "0")) + global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" + global_envs["PADDLE_GLOO_FS_PATH"] = self.gloo_rendezvous_dir + + self.procs = start_local_trainers( + cluster, + pod, + training_script=args.training_script, + training_script_args=args.training_script_args, + log_dir=args.log_dir, + envs=global_envs) + + for idx, proc in enumerate(self.procs): + logger.info("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) + + def stop(self): + logger.info("collective lauchner stop ...") + if not self._terminate_procs(): + logger.error("kill process failed") + if os.path.exists(self.gloo_rendezvous_dir): + shutil.rmtree(self.gloo_rendezvous_dir) + + def watch(self): + logger.debug("collective lauchner watch ...") + for p in self.procs: + if p.log_fn and p.local_rank == 0: + pull_worker_log(p) + ret = self._check_procs() + return ret diff --git a/python/paddle/distributed/fleet/elastic.py b/python/paddle/distributed/fleet/elastic/manager.py similarity index 100% rename from python/paddle/distributed/fleet/elastic.py rename to python/paddle/distributed/fleet/elastic/manager.py diff --git a/python/paddle/distributed/fleet/launch.py b/python/paddle/distributed/fleet/launch.py index f407892e79acf6..bc7942826e1eaa 100644 --- a/python/paddle/distributed/fleet/launch.py +++ b/python/paddle/distributed/fleet/launch.py @@ -69,17 +69,13 @@ import paddle import paddle.fluid as fluid from paddle.distributed.fleet import launch_utils -import signal # TODO(danleifeng): Don't import * from a module from paddle.distributed.fleet.launch_utils import * import paddle.distributed.fleet.cloud_utils as cloud_utils import paddle.distributed.fleet.ascend_utils as ascend_utils -from paddle.distributed.fleet.elastic import ElasticManager -from paddle.distributed.fleet.elastic import LauncherInterface -from paddle.distributed.fleet.elastic import ElasticStatus -from paddle.distributed.fleet.elastic import ELASTIC_EXIT_CODE +from paddle.distributed.fleet.elastic import enable_elastic, launch_elastic __all__ = [] @@ -235,76 +231,71 @@ def get_cluster_from_args(args, device_mode, devices_per_proc): devices_per_proc) -class CollectiveLauncher(LauncherInterface): - def __init__(self, args): - self.args = args - self.procs = [] +def launch_collective(args): + # parse arguments, used for cloud-single-machine and local + (device_mode, devices_per_proc) = launch_utils.get_device_proc_info(args) + trainers_num = cloud_utils.get_trainers_num() + logger.debug("parsed from args 
trainerss_num:{} mode:{} devices:{}".format( + trainers_num, device_mode, devices_per_proc)) + + cluster = None + pod = None + + start_port = 6170 + if os.environ.get('FLAGS_START_PORT') is not None: + start_port = os.environ.get('FLAGS_START_PORT') + if cloud_utils.use_paddlecloud() and trainers_num != 1: + cluster, pod = cloud_utils.get_cloud_cluster( + args.ips, device_mode, devices_per_proc, start_port) + logger.debug("get cluster from cloud:{}".format(cluster)) + elif device_mode == DeviceMode.ASCEND_NPU: + # for ascend + cluster, pod = ascend_utils.get_cloud_cluster( + rank_table_file=os.getenv("RANK_TABLE_FILE", None), + device_mode=device_mode, + start_port=start_port) + else: + # trainers_num = 1 or not use paddlecloud ips="a,b" + cluster, pod = get_cluster_from_args(args, device_mode, + devices_per_proc) + logger.debug("get cluster from args:{}".format(cluster)) + + global_envs = copy.copy(os.environ.copy()) + gloo_rendezvous_dir = tempfile.mkdtemp() + # add gloo env + global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0")) + global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" + global_envs["PADDLE_GLOO_FS_PATH"] = gloo_rendezvous_dir + + procs = start_local_trainers( + cluster, + pod, + training_script=args.training_script, + training_script_args=args.training_script_args, + log_dir=args.log_dir, + envs=global_envs) + + for idx, proc in enumerate(procs): + print("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) - def launch(self): - logger.info("collective lauchner launch ...") - args = self.args - # parse arguments, used for cloud-single-machine and local - (device_mode, - devices_per_proc) = launch_utils.get_device_proc_info(args) - trainers_num = cloud_utils.get_trainers_num() - logger.debug("parsed from args trainerss_num:{} mode:{} devices:{}". 
- format(trainers_num, device_mode, devices_per_proc)) + while True: + try: + alive = watch_local_trainers(procs, cluster.trainers_nranks()) - cluster = None - pod = None + if not alive: + logger.info("Local processes completed.") + logger.debug("POD info:{}".format(pod)) + break - start_port = 6170 - if os.environ.get('FLAGS_START_PORT') is not None: - start_port = os.environ.get('FLAGS_START_PORT') - if cloud_utils.use_paddlecloud() and trainers_num != 1: - cluster, pod = cloud_utils.get_cloud_cluster( - args.ips, device_mode, devices_per_proc, start_port) - logger.debug("get cluster from cloud:{}".format(cluster)) - elif device_mode == DeviceMode.ASCEND_NPU: - # for ascend - cluster, pod = ascend_utils.get_cloud_cluster( - rank_table_file=os.getenv("RANK_TABLE_FILE", None), - device_mode=device_mode, - start_port=start_port) - else: - # trainers_num = 1 or not use paddlecloud ips="a,b" - cluster, pod = get_cluster_from_args(args, device_mode, - devices_per_proc) - logger.debug("get cluster from args:{}".format(cluster)) - - global_envs = copy.copy(os.environ.copy()) - self.gloo_rendezvous_dir = tempfile.mkdtemp() - # add gloo env - global_envs["PADDLE_WITH_GLOO"] = str( - os.getenv("PADDLE_WITH_GLOO", "0")) - global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" - global_envs["PADDLE_GLOO_FS_PATH"] = self.gloo_rendezvous_dir - - self.procs = start_local_trainers( - cluster, - pod, - training_script=args.training_script, - training_script_args=args.training_script_args, - log_dir=args.log_dir, - envs=global_envs) - - for idx, proc in enumerate(self.procs): - logger.info("launch proc_id:{} idx:{}".format(proc.proc.pid, idx)) - - def stop(self): - logger.info("collective lauchner stop ...") - if not self._terminate_procs(): - logger.error("kill process failed") - if os.path.exists(self.gloo_rendezvous_dir): - shutil.rmtree(self.gloo_rendezvous_dir) - - def watch(self): - logger.debug("collective lauchner watch ...") - for p in self.procs: - if p.log_fn and p.local_rank == 0: - pull_worker_log(p) - ret = self._check_procs() - return ret + time.sleep(3) + + except: + logger.warning("Terminating... 
exit") + terminate_local_procs(procs) + exit(1) + + if os.path.exists(gloo_rendezvous_dir): + shutil.rmtree(gloo_rendezvous_dir) def launch_ps(args, distribute_mode): @@ -399,42 +390,15 @@ def launch(): _print_arguments(args) distribute_mode = which_distributed_mode(args) - # TODO(kuizhiqing) support ps later - if not distribute_mode == DistributeMode.COLLECTIVE: - launch_ps(args, distribute_mode) - return - - elastic = ElasticManager(args) - - signal.signal(signal.SIGTERM, elastic.signal_handler) - signal.signal(signal.SIGABRT, elastic.signal_handler) - signal.signal(signal.SIGINT, elastic.signal_handler) - while True: + if enable_elastic(args, distribute_mode): + launch_elastic(args, distribute_mode) + return - # wait for all nodes ready to run - elastic.wait() - - # run self with specified launcher - elastic.run(CollectiveLauncher) - - # keep wathing the health status of self and being notified for other's failure - ret = elastic.watch() - if ret == ElasticStatus.COMPLETED: - break - if ret == ElasticStatus.HOLD: - continue - if ret == ElasticStatus.EXIT: - break - if ret == ElasticStatus.ERROR: - sys.exit(3) - if ret == ElasticStatus.RESTART: - sys.exit(ELASTIC_EXIT_CODE) - - if int(elastic.sigint) > 0: - sys.exit(128 + int(elastic.sigint)) + if distribute_mode == DistributeMode.COLLECTIVE: + launch_collective(args) else: - sys.exit(0) + launch_ps(args, distribute_mode) if __name__ == "__main__": diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index 6ead643df6c1b8..e114670440c065 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -307,6 +307,17 @@ def get_cluster(node_ips, node_ip, trainer_endpoints, device_mode, def terminate_local_procs(procs): + # try to terminate process by group, this happend in multiprocess senario in user process + if os.name != 'nt': + for p in procs: + if p.proc.poll() is None: + os.killpg(os.getpgid(p.proc.pid), signal.SIGTERM) + if p.log_fn: + p.log_fn.close() + logger.info("terminate process group gid:{}".format(p.proc.pid)) + + time.sleep(1) + for p in procs: if p.proc.poll() is None: p.proc.terminate() @@ -583,19 +594,19 @@ def watch_local_trainers(procs, nranks): except KeyboardInterrupt: logger.warning("KeyboardInterrupt, exit") terminate_local_procs(procs) - raise + return except SystemExit: logger.error( "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". format(nranks, error_rank)) terminate_local_procs(procs) - raise + return except: logger.error( "ABORT!!! Out of all {} trainers, the trainer process with rank={} was aborted. Please check its log.". 
format(nranks, error_rank)) terminate_local_procs(procs) - raise + return return alive diff --git a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py index 2205f79ef4633f..c923624651c6ae 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/raw_program_optimizer.py @@ -217,9 +217,13 @@ def _allreduce_fusion_program(self): block = self.main_program.global_block() ring_id = self.global_ring_id param_grads = [] + first_backward_idx = -1 # find all grad params - for op in reversed(block.ops): + for idx, op in enumerate(block.ops): + if first_backward_idx == -1 and \ + is_backward_op(op): + first_backward_idx = idx if is_backward_op(op) and \ OP_ROLE_VAR_KEY in op.attr_names: op_role_var = op.attr(OP_ROLE_VAR_KEY) @@ -234,70 +238,100 @@ def _allreduce_fusion_program(self): grad = block.var(grad_name) if param.is_distributed: continue - param_grads.append(grad) + param_grads.append((param, grad)) + + outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx, + block) - segments = [] + # structure of grad_param_segments is + # [([grad0, grad1], [param0, param1]), ([grad2, grad3], [param2, param3])] + # each entry of the list is a tuple stores the grads segment list and + # the corresponding params segment list + grad_param_segments = [] last_dtype = None # split the grad based on dtype and fused size - for var in param_grads: - if len(segments) == 0 \ - or len(segments[-1]) == self.fuse_grad_size_in_num \ - or var.dtype != last_dtype: - segments.append([var]) - last_dtype = var.dtype + for param, grad in param_grads: + if len(grad_param_segments) == 0 \ + or len(grad_param_segments[-1][0]) == self.fuse_grad_size_in_num \ + or grad.dtype != last_dtype: + grad_param_segments.append(([grad], [param])) + last_dtype = grad.dtype else: - segments[-1].append(var) + grad_param_segments[-1][0].append(grad) + grad_param_segments[-1][1].append(param) - fused_vars = [] - for idx, op in enumerate(block.ops): - if is_optimizer_op(op): - for segment in segments: - # insert coalesce tensor - tmp_var = block.create_var( - name=unique_name.generate('FusedOutput_{}'.format( - segment[0].name)), - dtype=segment[0].dtype, - persistable=True, - stop_gradient=True) - fused_vars.append(tmp_var) - block._insert_op_without_sync( - idx, - type="coalesce_tensor", - inputs={"Input": segment}, - outputs={"Output": segment, - "FusedOutput": tmp_var}, - attrs={ - "copy_data": True, - "use_align": True, - "dtype": segment[0].dtype, - OP_ROLE_KEY: OpRole.Backward - }) - break + if len(grad_param_segments) == 0: + return - # insert the allreduce_sum op - for idx, op in enumerate(block.ops): - if is_optimizer_op(op): - for fused_var in fused_vars: - block._insert_op_without_sync( - idx, - type='c_allreduce_sum', - inputs={'X': fused_var}, - outputs={'Out': fused_var}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': self.calc_comm_same_stream, - OP_ROLE_KEY: OpRole.Backward - }) - if not self.calc_comm_same_stream: - block._insert_op_without_sync( - idx, - type='c_sync_calc_stream', - inputs={'X': fused_var}, - outputs={'Out': fused_var}, - attrs={OP_ROLE_KEY: OpRole.Backward}) - break + fused_vars = [None] * len(grad_param_segments) + for i in range(len(grad_param_segments) - 1, -1, -1): + # travers the grad_param_segments in backward + # not to use reversed since needs the absolute index value + grad_segment, param_segment = 
grad_param_segments[i] + # insert coalesce tensor + fused_var = block.create_var( + name=unique_name.generate('FusedOutput_{}'.format(grad_segment[ + 0].name)), + dtype=grad_segment[0].dtype, + persistable=False, + stop_gradient=True) + fused_vars[i] = fused_var + after_idx = outputs_name_to_idx[grad_segment[-1]][1] + block._insert_op_without_sync( + after_idx + 1, + type='c_allreduce_sum', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': self.calc_comm_same_stream, + OP_ROLE_KEY: OpRole.Backward + }) + if not self.calc_comm_same_stream: + block._insert_op_without_sync( + after_idx + 1, + type='c_sync_calc_stream', + inputs={'X': fused_var}, + outputs={'Out': fused_var}, + attrs={OP_ROLE_KEY: OpRole.Backward}) - if len(fused_vars) == 0: + # update the outputs_name_to_idx after insertion of sync/allreduce ops + outputs_name_to_idx = self.__get_ouputs_name_to_idx(first_backward_idx, + block) + # the before_idx is not guaranteed sorted, therefore we have to find the + # topology to insert the coalesce ops + pos_for_coalesce = {} + for i in range(len(grad_param_segments) - 1, -1, -1): + # We separate the insertion of coalesce op and the insertion of sync/allreduce op, + # since that the coalesce op's insertion may invalidate the outputs_name_to_idx + grad_segment, param_segment = grad_param_segments[i] + before_idx = len(block.ops) + for grad in outputs_name_to_idx: + before_idx = min(before_idx, outputs_name_to_idx[grad][0]) + pos_for_coalesce[i] = before_idx + + # insert the coalesce op based on the sorted before_idx + pos_for_coalesce = sorted( + pos_for_coalesce.items(), + key=lambda kv: (kv[1], kv[0]), + reverse=True) + for i, before_idx in pos_for_coalesce: + grad_segment, param_segment = grad_param_segments[i] + fused_var = fused_vars[i] + block._insert_op_without_sync( + before_idx, + type="coalesce_tensor", + inputs={"Input": param_segment}, + outputs={"Output": grad_segment, + "FusedOutput": fused_var}, + attrs={ + "copy_data": False, + "use_align": True, + "dtype": grad_segment[0].dtype, + OP_ROLE_KEY: OpRole.Backward + }) + + if self.calc_comm_same_stream: block._sync_with_cpp() return @@ -307,9 +341,31 @@ def _allreduce_fusion_program(self): block._insert_op_without_sync( idx, type='c_sync_comm_stream', - inputs={'X': fused_vars[0]}, - outputs={'Out': fused_vars[0]}, + inputs={'X': grad_segment[0]}, + outputs={'Out': grad_segment[0]}, attrs={'ring_id': ring_id, OP_ROLE_KEY: OpRole.Backward}) break block._sync_with_cpp() + + def __get_ouputs_name_to_idx(self, first_backward_idx, block): + # Each item of outputs_name_to_idx is a pair of idx. + # The first entry of this pair is the idx of the first op generates the grad, + # which is used to indicate the position to insert coalesce op. + # The second entry of this pair is the idx of the last op generates the grad, + # which is used to indicate the position to insert sync and allreduce op. 
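Reviewer note: the comment above describes the bookkeeping behind the fused-allreduce rewrite: for every gradient the pass records the index of the first op that writes it (where the coalesce_tensor op may go) and the index of the last op that writes it (after which c_allreduce_sum / c_sync_calc_stream are inserted). A minimal, framework-free sketch of that mapping, using plain dicts instead of Paddle blocks and ops (all names below are invented for illustration):

```python
def first_last_producer_index(ops):
    """Map each output name to (first_idx, last_idx) of the ops that produce it.

    `ops` is a list of dicts with an "outputs" key; it stands in for
    block.ops and op.output_arg_names in the real pass.
    """
    name_to_idx = {}
    for idx, op in enumerate(ops):
        for name in op["outputs"]:
            if name not in name_to_idx:
                # first and last producer coincide until the name shows up again
                name_to_idx[name] = (idx, idx)
            else:
                name_to_idx[name] = (name_to_idx[name][0], idx)
    return name_to_idx


ops = [
    {"outputs": ["x@GRAD"]},   # idx 0: first producer of x@GRAD
    {"outputs": ["y@GRAD"]},   # idx 1
    {"outputs": ["x@GRAD"]},   # idx 2: accumulated again -> last producer
]
mapping = first_last_producer_index(ops)
assert mapping["x@GRAD"] == (0, 2)  # coalesce before op 0, allreduce after op 2
assert mapping["y@GRAD"] == (1, 1)
```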
+ outputs_name_to_idx = {} + for idx in range(first_backward_idx, len(block.ops)): + op = block.ops[idx] + if is_optimizer_op(op): + break + for name in op.output_arg_names: + var = block.var(name) + if not outputs_name_to_idx.get(var): + # if the grad only be generated by one op + # the first idx and the last ids are identical + outputs_name_to_idx[var] = (idx, idx) + else: + outputs_name_to_idx[var] = (outputs_name_to_idx[var][0], + idx) + return outputs_name_to_idx diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index df775247c8c9e5..a5df9486da4656 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -84,27 +84,23 @@ def _enable_strategy(self, dist_strategy, context): dist_strategy.sharding = True dist_strategy.sharding_configs = {"segment_broadcast_MB": 32} - def minimize_impl(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): - # TODO: (JZ-LIANG) support multiple comm in future - # self._nrings = self.user_defined_strategy.nccl_comm_num - self._nrings_sharding = 1 - self._nrings_dp = 1 + def _get_sharding_segment_strategy(self): + """ get + self._sharding_segment_strategy + 1. if by_size: self._broadcast_MB + 2. if by_anchors: self._sharding_segment_anchors + self._backward_remain_anchors + self._forward_remain_anchors + """ + strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs + segment_strategy = str(sharding_configs["sharding_segment_strategy"]) - # segment - self._sharding_segment_strategy = str( - self.user_defined_strategy.sharding_configs[ - "sharding_segment_strategy"]) - if self._sharding_segment_strategy == "segment_broadcast_MB": - self._broadcast_MB = self.user_defined_strategy.sharding_configs[ - "segment_broadcast_MB"] + if segment_strategy == "segment_broadcast_MB": + self._broadcast_MB = sharding_configs["segment_broadcast_MB"] assert self._broadcast_MB > 0, "segment size should larger than zero !" - elif self._sharding_segment_strategy == "segment_anchors": - self._sharding_segment_anchors = self.user_defined_strategy.sharding_configs[ - "segment_anchors"] + elif segment_strategy == "segment_anchors": + self._sharding_segment_anchors = sharding_configs["segment_anchors"] assert len(self._sharding_segment_anchors ) > 0, "you should set the sharding segment anchors !" 
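For context on the `segment_broadcast_MB` strategy extracted above: sharding splits the program into segments and issues one broadcast per segment, drawing a segment boundary whenever the accumulated parameter volume crosses the configured threshold. A rough, self-contained sketch of that size-based grouping (the helper name, the sample parameters, and the 4-bytes-per-element assumption are illustrative, not Paddle's actual segmentation code):

```python
def split_by_broadcast_mb(param_sizes, broadcast_mb):
    """Group parameters into segments whose total size stays under broadcast_mb.

    param_sizes: list of (name, number_of_elements); fp32 elements assumed.
    Returns a list of segments, each a list of parameter names.
    """
    limit_bytes = broadcast_mb * 1024 * 1024
    segments, current, current_bytes = [], [], 0
    for name, numel in param_sizes:
        nbytes = numel * 4
        if current and current_bytes + nbytes > limit_bytes:
            segments.append(current)
            current, current_bytes = [], 0
        current.append(name)
        current_bytes += nbytes
    if current:
        segments.append(current)
    return segments


params = [("fc_0.w_0", 6_000_000), ("fc_0.b_0", 4_096), ("fc_1.w_0", 9_000_000)]
print(split_by_broadcast_mb(params, broadcast_mb=32))
# [['fc_0.w_0', 'fc_0.b_0'], ['fc_1.w_0']]
```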
self._backward_remain_anchors = self._sharding_segment_anchors[:] @@ -112,82 +108,104 @@ def minimize_impl(self, else: raise NotImplementedError( "the sharding segment strategy [{}] is not implemented".format( - str(self._sharding_segment_strategy))) + str(segment_strategy))) + self._sharding_segment_strategy = segment_strategy + + def _get_hybrid_degree(self): + """ get + self.hybrid_dp + self.sharding_degree + self.mp_degree + self.pp_degree + self.dp_degree + """ + strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs # parallelism - self.sharding_degree = int(self.user_defined_strategy.sharding_configs[ - "sharding_degree"]) - assert self.sharding_degree > 0, "sharding degree must be larger than zero" - self.mp_degree = int(self.user_defined_strategy.sharding_configs[ - "mp_degree"]) + sharding_degree = int(sharding_configs["sharding_degree"]) + mp_degree = int(sharding_configs["mp_degree"]) + pp_degree = int(sharding_configs["pp_degree"]) + dp_degree = int(sharding_configs['dp_degree']) + global_world_size = self.role_maker._worker_num() + + assert sharding_degree > 0, "sharding degree must be larger than zero" # pipeline setting # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline - self.pp_degree = int(self.user_defined_strategy.sharding_configs[ - "pp_degree"]) - if self.pp_degree > 1: - assert self.user_defined_strategy.pipeline == True - - self.dp_degree = int(self.user_defined_strategy.sharding_configs[ - 'dp_degree']) - assert self.role_maker._worker_num( - ) == self.mp_degree * self.sharding_degree * self.pp_degree * self.dp_degree, "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( - self.role_maker._worker_num(), - self.mp_degree, - self.sharding_degree, - self.pp_degree, - self.dp_degree, ) + if pp_degree > 1: + assert strategy.pipeline is True + + assert global_world_size == mp_degree * sharding_degree * pp_degree * dp_degree, \ + "global work size [{}], mp_degree [{}], sharding_degree [{}], pp_degree [{}], dp_degree [{}].".format( + global_world_size, mp_degree, sharding_degree, pp_degree, dp_degree) # FIXME (JZ-LIANG) deprecated hybrid_dp - if self.user_defined_strategy.sharding_configs["hybrid_dp"]: + if sharding_configs["hybrid_dp"]: logger.warning( - "[hybrid_dp] API setting is deprecated. Now when dp_degree >= 2, its will be in hybrid dp mode automatically" - ) - assert self.dp_degree >= 1 - if self.dp_degree > 1: - self.hybrid_dp = True - else: - self.hybrid_dp = False - - # NOTE (JZ-LIANG) - # there 2 kind of modes for gradient-merge and hybrid-dp in mixed parallism [sharding] and [pipeline]. - # we distinguish this two modes since the gm/hybrid-dp related allreduce should be insert in different place according different mode to have best performance: - # sharding: communication within node, and therefore should insert within backward segment to overlap with bw calc, conduct every micro step - # pipeline: communication accross nodes, and therefore should insert in update segemnt, conduct just once per global step - self.hybrid_dp_mode = None + "[hybrid_dp] API setting is deprecated. 
Now when " + "dp_degree >= 2, its will be in hybrid dp mode automatically") + assert dp_degree >= 1 + + self.hybrid_dp = True if dp_degree > 1 else False + self.sharding_degree = sharding_degree + self.mp_degree = mp_degree + self.pp_degree = pp_degree + self.dp_degree = dp_degree + + def _get_hybrid_dp_mode(self): + """ get + self.hybrid_dp_mode + self.gradient_merge_mode + self._gradient_merge_acc_step + self.pp_allreduce_in_optimize + """ + strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs + + # NOTE (JZ-LIANG) + # There 2 kind of modes for gradient-merge and hybrid-dp in mixed parallelism [sharding] and [pipeline]. + # We distinguish this two modes since the gm/hybrid-dp related allreduce should be insert in different place + # according different mode to have best performance: + # sharding: communication within node, and therefore should insert within backward segment + # to overlap with bw calc, conduct every micro step. + # pipeline: communication across nodes, and therefore should insert in update segment, + # conduct just once per global step. + dp_mode = None # dp here is the pure dp as the outest parallelism if self.hybrid_dp: - assert self.dp_degree > 1, "hybrid dp is on, but dp degree is [{}]".format( - self.dp_degree) if self.pp_degree > 1: - self.hybrid_dp_mode = "pp_hybrid_dp" + dp_mode = "pp_hybrid_dp" else: - assert self.sharding_degree > 1, "by now we only support five kind of hybrid dp: sharding_hybrid_dp, mp_sharding_hybrid_dp, pp_hybrid_dp, mp_sharding_pp_hybrid_dp, sharding_pp_hybrid_dp." - self.hybrid_dp_mode = "sharding_hybrid_dp" + assert self.sharding_degree > 1, \ + "by now we only support five kind of hybrid dp: sharding_hybrid_dp, " \ + "mp_sharding_hybrid_dp, pp_hybrid_dp, mp_sharding_pp_hybrid_dp, sharding_pp_hybrid_dp." 
+ dp_mode = "sharding_hybrid_dp" # gradient merge - self._gradient_merge_acc_step = int( - self.user_defined_strategy.sharding_configs[ - "gradient_merge_acc_step"]) - self.gradient_merge_mode = None + gm_mode = None + gm_acc_step = int(sharding_configs["gradient_merge_acc_step"]) if self.pp_degree <= 1: - self.gradient_merge_mode = "sharding_gm" + gm_mode = "sharding_gm" self._grad2merged_grad = dict() else: - self.gradient_merge_mode = "pp_gm" - self._gradient_merge_acc_step = self.user_defined_strategy.pipeline_configs[ - 'accumulate_steps'] - if self._gradient_merge_acc_step > 1: + gm_mode = "pp_gm" + gm_acc_step = strategy.pipeline_configs['accumulate_steps'] + if gm_acc_step > 1: logger.info("Gradient merge in [{}], acc step = [{}]".format( - self.gradient_merge_mode, self._gradient_merge_acc_step)) + gm_mode, gm_acc_step)) - # optimize offload - self.optimize_offload = self.user_defined_strategy.sharding_configs[ - "optimize_offload"] + self.hybrid_dp_mode = dp_mode + self.gradient_merge_mode = gm_mode + self._gradient_merge_acc_step = gm_acc_step # this feature is design for ascend, and should NOT be used in GPU training - self.pp_allreduce_in_optimize = self.user_defined_strategy.sharding_configs[ + self.pp_allreduce_in_optimize = sharding_configs[ "pp_allreduce_in_optimize"] + def _inner_opt_minimize(self, loss, startup_program, parameter_list, + no_grad_set): + pipeline_configs = self.user_defined_strategy.pipeline_configs + if self.inner_opt is None: raise ValueError( "self.inner_opt of ShardingOptimizer should not be None.") @@ -195,32 +213,29 @@ def minimize_impl(self, if self.pp_degree > 1: pp_optimizer = fluid.optimizer.PipelineOptimizer( self.inner_opt, self._gradient_merge_acc_step) - - strategy = self.user_defined_strategy - self.schedule_mode = strategy.pipeline_configs['schedule_mode'] - self.pp_rank_ = self.role_maker._worker_index() // ( - self.sharding_degree * self.mp_degree) % self.pp_degree - - pipeline_opt = dict() - pipeline_opt['schedule_mode'] = self.schedule_mode - pipeline_opt['micro_batch_size'] = strategy.pipeline_configs[ - 'micro_batch_size'] - pipeline_opt['local_rank'] = self.pp_rank_ - pipeline_opt['global_rank'] = self.role_maker._worker_index() - pipeline_opt['use_sharding'] = True - # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline - pipeline_opt['ring_id'] = 20 - pipeline_opt['global_ring_id'] = 3 - pipeline_opt['mp_degree'] = self.mp_degree - pipeline_opt['mp_rank'] = self.role_maker._worker_index( - ) % self.mp_degree - + self._pp_optimizer = pp_optimizer + + global_rank = self.role_maker._worker_index() + schedule_mode = pipeline_configs['schedule_mode'] + + pipeline_opt = { + 'schedule_mode': schedule_mode, + 'micro_batch_size': pipeline_configs['micro_batch_size'], + 'local_rank': self.pp_rank, + 'global_rank': global_rank, + 'use_sharding': True, + # TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline + 'ring_id': 20, + 'global_ring_id': 3, + 'mp_degree': self.mp_degree, + 'mp_rank': global_rank % self.mp_degree, + } main_program = loss.block.program main_program._pipeline_opt = pipeline_opt optimize_ops, params_grads, program_list, self.pipeline_pair, self.pp_ring_map = pp_optimizer.minimize( loss, startup_program, parameter_list, no_grad_set) - self.pp_degree = len(program_list) + assert self.pp_degree == len(program_list) else: optimize_ops, params_grads = self.inner_opt.minimize( loss, startup_program, parameter_list, no_grad_set) @@ -230,9 +245,8 @@ def minimize_impl(self, if 
self.pp_degree > 1: startup_program = startup_program._pipeline_opt['startup_program'] - #main_program = main_program._pipeline_opt['section_program']['program'] - print("pp_rank:", self.pp_rank_) - main_program = program_list[self.pp_rank_] + print("pp_rank:", self.pp_rank) + main_program = program_list[self.pp_rank] with open("main_%d" % self.role_maker._worker_index(), 'w') as f: f.writelines(str(main_program)) main_block = main_program.global_block() @@ -241,7 +255,6 @@ def minimize_impl(self, if main_block.has_var(param.name): new_params_grads.append((param, grad)) params_grads = new_params_grads - else: main_block = loss.block @@ -254,93 +267,106 @@ def minimize_impl(self, with open("main_%d" % self.role_maker._worker_index(), 'w') as f: f.writelines(str(main_program)) - # step0: _init_comm - self._init_comm() + return optimize_ops, params_grads - if self.sharding_degree > 1: + def _apply_sharding_pass(self, params_grads): + if self.sharding_degree == 1: return + + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() - # step1: build shard - self._build_shard(params_grads) + # step1: build shard + self._build_shard(params_grads) - # step2: split_program - self._split_program(main_block) + # step2: split_program + self._split_program(main_block) - # step3: add broadcast and reduce ops - self._add_broadcast_allreduce(main_block) - main_block._sync_with_cpp() - startup_block._sync_with_cpp() + # step3: add broadcast and reduce ops + self._add_broadcast_allreduce(main_block) + main_block._sync_with_cpp() + startup_block._sync_with_cpp() - main_block._sync_with_cpp() + # step4: remove unneeded ops and vars from block + self._prune_main_program(main_block) + self._prune_startup_program(startup_block) - # step4: remove unneeded ops and vars from block - self._prune_main_program(main_block) - self._prune_startup_program(startup_block) + def _insert_allreduce_for_pp(self): + if self.pp_degree == 1: return - if self.pp_degree > 1: - # sharding-pp related logic - # pp_optimizer._rename_gradient_var_name(main_block) - # crop ops - if self.sharding_degree > 1: - for idx, op in reversed(list(enumerate(main_block.ops))): - if is_update_op(op): - op_role_var = op.attr('op_role_var') - param_name = op_role_var[0] - if not self._shard.has_param(param_name): - main_block._remove_op(idx) - - for idx, op in reversed(list(enumerate(main_block.ops))): - if op.type != 'cast': continue - in_name = op.input_arg_names[0] - if in_name not in self._params: continue - #if self._shard.has_param(param_name): continue - if in_name not in main_block.vars: + strategy = self.user_defined_strategy + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() + + # sharding-pp related logic + # pp_optimizer._rename_gradient_var_name(main_block) + # crop ops + if self.sharding_degree > 1: + for idx, op in reversed(list(enumerate(main_block.ops))): + if is_update_op(op): + op_role_var = op.attr('op_role_var') + param_name = op_role_var[0] + if not self._shard.has_param(param_name): main_block._remove_op(idx) - accumulated_grad_names = pp_optimizer._accumulate_gradients( - main_block) - # accumulated_grad_names = sorted(accumulated_grad_names) - if self.pp_allreduce_in_optimize: - print("persistable FP32 grad: ") - print(accumulated_grad_names) - first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block, raise_error=self.user_defined_strategy.amp) - insert_reduce_ops( + for idx, op in 
reversed(list(enumerate(main_block.ops))): + if op.type != 'cast': continue + in_name = op.input_arg_names[0] + if in_name not in self._params: continue + #if self._shard.has_param(param_name): continue + if in_name not in main_block.vars: + main_block._remove_op(idx) + + accumulated_grad_names = self._pp_optimizer._accumulate_gradients( + main_block) + # accumulated_grad_names = sorted(accumulated_grad_names) + if self.pp_allreduce_in_optimize: + print("persistable FP32 grad: ") + print(accumulated_grad_names) + first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( + main_block, raise_error=strategy.amp) + insert_reduce_ops( + main_block, + first_optimize_op_index, + self.sharding_ring_id, + accumulated_grad_names, + self._shard, + core.op_proto_and_checker_maker.OpRole.Optimize, + use_calc_stream=True) + if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": + first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( + main_block, raise_error=strategy.amp) + if first_optimize_op_index >= 0: + insert_allreduce_ops( main_block, first_optimize_op_index, - self.sharding_ring_id, + self.dp_ring_id, accumulated_grad_names, - self._shard, core.op_proto_and_checker_maker.OpRole.Optimize, - use_calc_stream=True) - if self.hybrid_dp and self.hybrid_dp_mode == "pp_hybrid_dp": - first_optimize_op_index = get_first_check_finite_and_unscale_op_idx( - main_block, raise_error=self.user_defined_strategy.amp) - if first_optimize_op_index >= 0: - insert_allreduce_ops( - main_block, - first_optimize_op_index, - self.dp_ring_id, - accumulated_grad_names, - core.op_proto_and_checker_maker.OpRole.Optimize, - use_calc_stream=True, - user_defined_strategy=self.user_defined_strategy) + use_calc_stream=True, + user_defined_strategy=strategy) + def _adapt_amp_clip_without_sharding(self): + if self.sharding_degree > 1: return # if not use sharding, adapt amp/clip, for remain parallelism. # cast --> amp --> clip --> opt - if self.sharding_degree <= 1: - # FIXME(wangxi): mp should prune duplicated param_grads when calc - # amp inf_var & clip global_norm_var - # amp - FP16Utils.sync_amp_check_nan_inf( - main_block, [self.mp_ring_id, self.pp_ring_id]) + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() + + # FIXME(wangxi): mp should prune duplicated param_grads when calc + # amp inf_var & clip global_norm_var - # clip - gradientclip_helper = GradientClipHelper(None) - gradientclip_helper.sync_global_norm( - main_block, [self.mp_ring_id, self.pp_ring_id]) + FP16Utils.sync_amp_check_nan_inf(main_block, + [self.mp_ring_id, self.pp_ring_id]) - # step6: loss div dp_degree + gradientclip_helper = GradientClipHelper(None) + gradientclip_helper.sync_global_norm( + main_block, [self.mp_ring_id, self.pp_ring_id]) + + def _insert_loss_grad_scale_op(self): + main_block = self._main_program.global_block() + + # step6: loss div dp_degree global_dp_degree = self.sharding_degree * self.dp_degree assert int(global_dp_degree) == global_dp_degree if global_dp_degree > 1: @@ -348,18 +374,67 @@ def minimize_impl(self, main_block._sync_with_cpp() - # TODO(wangxi): add optimize offload - # opt offload should be enable while gradient merge is enable && acc_step is quite large (e.g. >> 100) - # sync its memcpy could not be overlap with calc, otherwise it will slower down training severely. 
- if self.optimize_offload: + def _apply_optimize_offload_pass(self): + strategy = self.user_defined_strategy + sharding_configs = strategy.sharding_configs + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() + + # optimize offload should be enable while gradient merge is enable and + # acc_step is quite large (e.g. >> 100). Since its memcpy could not be + # overlap with calc, otherwise it will slower down training severely. + if sharding_configs["optimize_offload"]: logger.info("Sharding with optimize offload !") offload_helper = OffloadHelper() offload_helper.offload(main_block, startup_block) offload_helper.offload_fp32param(main_block, startup_block) + def _dump_program_for_debug(self): + main_block = self._main_program.global_block() + startup_block = self._startup_program.global_block() + with open("start_sharding_%d" % self.role_maker._worker_index(), + 'w') as f: + f.writelines(str(startup_block.program)) + with open("main_sharding_%d" % self.role_maker._worker_index(), + 'w') as f: + f.writelines(str(main_block.program)) + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + # TODO: (JZ-LIANG) support multiple comm in future + # self._nrings = self.user_defined_strategy.nccl_comm_num + self._nrings_sharding = 1 + self._nrings_dp = 1 + + self._get_sharding_segment_strategy() + self._get_hybrid_degree() + self._get_hybrid_dp_mode() + + # config sharding & dp groups + self._build_groups() + + # inner optimize minimize + optimize_ops, params_grads = self._inner_opt_minimize( + loss, startup_program, parameter_list, no_grad_set) + + self._init_comm() + + self._apply_sharding_pass(params_grads) + + self._insert_allreduce_for_pp() + + self._adapt_amp_clip_without_sharding() + + # loss div dp_degree + self._insert_loss_grad_scale_op() + + self._apply_optimize_offload_pass() + # step6: (optional) sharding gradient merge - if self.gradient_merge_mode == "sharding_gm" and self._gradient_merge_acc_step > 1: - self._sharding_gradient_merge(main_block) + self._sharding_gradient_merge() # # check op dependecy # FIXME (JZ-LIANG) enable checking in future. 
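The `global_dp_degree = sharding_degree * dp_degree` scaling above exists because both sharding and pure data parallelism sum gradients with allreduce, so the loss gradient must be divided by the number of replicas to recover an average. A tiny NumPy check of that identity (world size and gradient values are made up):

```python
import numpy as np

dp_degree, sharding_degree = 2, 2
global_dp_degree = dp_degree * sharding_degree

# per-replica gradients of the same parameter
grads = [np.array([0.4, 0.8]), np.array([0.2, 0.6]),
         np.array([0.6, 0.2]), np.array([0.8, 0.4])]

# allreduce(sum) followed by the 1/global_dp_degree scaling ...
scaled = np.sum(grads, axis=0) / global_dp_degree
# ... equals the average gradient single-replica training would use
assert np.allclose(scaled, np.mean(grads, axis=0))
```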
@@ -367,20 +442,15 @@ def minimize_impl(self, # check_allreduce_sum(main_block, self._shard, self.sharding_ring_id, # self.dp_ring_id) - if self.hybrid_dp: - # NOTE(JZ-LIANG) ensure in both sharding_hybrid_dp & pp_hybrid_dp - # init param broadcast should be called after startup pruning - self._initialization_broadcast(startup_block) + # NOTE(JZ-LIANG) ensure in both sharding_hybrid_dp & pp_hybrid_dp + # init param broadcast should be called after startup pruning + self._initialization_broadcast() - with open("start_sharding_%d" % self.role_maker._worker_index(), - 'w') as f: - f.writelines(str(startup_block.program)) - with open("main_sharding_%d" % self.role_maker._worker_index(), - 'w') as f: - f.writelines(str(main_block.program)) + self._dump_program_for_debug() - # GPU and NPU need to wait server ready - self._wait() + # GPU need to wait server ready, GPU and NPU is Layered connection + if not core.is_compiled_with_npu(): + self._wait() return optimize_ops, params_grads def _init_pair_comm(self, pair, ring_id): @@ -470,9 +540,6 @@ def _init_npu_pipeline_comm(self, startup_block): def _init_pipeline_comm(self, startup_block): # TODO (JZ-LIANG) to unify pp_rank_ and pp_rank - assert self.pp_rank_ == self.pp_rank, "pp rank for pp opt [{}], pp rank for sharding opt [{}]".format( - self.pp_rank_, self.pp_rank) - self._collective_helper._init_communicator( self._startup_program, self.current_endpoint, @@ -495,17 +562,8 @@ def _init_pipeline_comm(self, startup_block): self._init_pair_comm(pair, ring_id) def _init_comm(self): - - # config sharding & dp groups - self._build_groups() - # sync var startup_block = self._startup_program.global_block() - self.startup_prog_sync_var = startup_block.create_var( - name="startup_prog_sync_var", - shape=[1], - dtype=core.VarDesc.VarType.INT32, - persistable=False) # mp ring if self.mp_degree > 1: @@ -1050,7 +1108,8 @@ def _build_groups(self): sharding: 1 pure-dp: 2 global: 3 - pp: >= 20 + pp: 4 + pp-pair: >= 20 if one parallelism is not enable: -1 and only support parallelism hierarchy: mp --> sharding --> pp --> dp """ @@ -1215,11 +1274,16 @@ def _build_groups(self): return - def _initialization_broadcast(self, startup_block): + def _initialization_broadcast(self): """ this funtion is to ensure the initialization between dp group to be identical when hybrid-dp is used. 
""" + if not self.hybrid_dp: + return + + startup_block = self._startup_program.global_block() + params = [] for param in startup_block.iter_parameters(): params.append(param) @@ -1460,13 +1524,17 @@ def _true_apply_gradient(self): # lr_var = main_block.var("gradient_merge_current_step") # paddle.static.Print(lr_var, message="in OPTIMIZE last conditional") - def _sharding_gradient_merge(self, main_block): + def _sharding_gradient_merge(self): """ copy all optimize ops in origin main block remove all optimize ops in origin main block create cond block """ + if self.gradient_merge_mode != "sharding_gm" or self._gradient_merge_acc_step <= 1: + return + + main_block = self._main_program.global_block() # copy original optimize ops to temp ops desc list # remove them from block 0 tmp_copy_block = self._main_program._create_block() diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 1cec106caec82b..16ea7de2946bfd 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -64,18 +64,6 @@ def __init__(self, layers, hcg, strategy): logger.info("start broadcast dp parameters") broadcast_dp_parameters(self._layers, self._hcg) - def _set_tensor_trainable(self, tensor): - if tensor is None: - return - - if isinstance(tensor, tuple): - for t in tensor: - if is_float_tensor(t): - t.stop_gradient = False - else: - if is_float_tensor(tensor): - tensor.stop_gradient = False - def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): assert isinstance(optimizer, HybridParallelOptimizer), ( 'optimizer should be HybridParallelOptimizer subclass.') @@ -117,7 +105,6 @@ def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): for step_id in range(startup_steps): input_tensor = p2p.recv_forward() - self._set_tensor_trainable(input_tensor) output_tensor = self._forward_step(input_tensor) p2p.send_forward(output_tensor) @@ -131,7 +118,6 @@ def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): for i in range(steady_steps): last_iter = (i == (steady_steps - 1)) - self._set_tensor_trainable(input_tensor) output_tensor = self._forward_step(input_tensor) output_tensor_grad = p2p.send_forward_recv_backward(output_tensor) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index e533b2ef3f7a33..c508c88015cfda 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -15,6 +15,8 @@ import paddle from .utils import paddle_2_number, number_2_dtype from ...utils.log_util import logger +import numpy as np +from paddle import _C_ops _hcg = None @@ -40,6 +42,7 @@ def __init__(self): self.recv_shape_message = None self.recv_dtype_message = None + self.recv_stop_gradient = None self.has_send_meta = False self.has_recv_meta = False @@ -57,7 +60,11 @@ def _recv_shape_dtype(self, group): # recv dtype dtype = paddle.to_tensor([0]) paddle.distributed.recv(dtype, src=0, group=group) - return shape.numpy().tolist(), dtype.item() + + # recv stop_gradient + stop_grad = paddle.to_tensor([0]) + paddle.distributed.recv(stop_grad, src=0, group=group) + return shape.numpy().tolist(), dtype.item(), stop_grad.item() def recv_meta(self, group): tensor_type = paddle.to_tensor([0]) 
@@ -65,9 +72,10 @@ def recv_meta(self, group): tensor_type = tensor_type.item() if tensor_type == 0: - shape, dtype = self._recv_shape_dtype(group) + shape, dtype, stop_grad = self._recv_shape_dtype(group) self.recv_shape_message = shape self.recv_dtype_message = dtype + self.recv_stop_gradient = bool(stop_grad) elif tensor_type == 1: num = paddle.to_tensor([0]) @@ -75,13 +83,16 @@ def recv_meta(self, group): num = num.item() shapes = [] dtypes = [] + stop_grads = [] for i in range(num): - shape, dtype = self._recv_shape_dtype(group) + shape, dtype, stop_grad = self._recv_shape_dtype(group) shapes.append(shape) dtypes.append(dtype) + stop_grads.append(bool(stop_grad)) self.recv_shape_message = tuple(shapes) self.recv_dtype_message = tuple(dtypes) + self.recv_stop_gradient = tuple(stop_grads) def _send_dims_shape_dtype(self, tensor, group): # send len(shape) @@ -96,6 +107,10 @@ def _send_dims_shape_dtype(self, tensor, group): dtype = paddle.to_tensor(paddle_2_number(tensor.dtype)) paddle.distributed.send(dtype, dst=1, group=group) + # send trainable + stop_grad = paddle.to_tensor(int(tensor.stop_gradient)) + paddle.distributed.send(stop_grad, dst=1, group=group) + def send_meta(self, tensor, group): if isinstance(tensor, paddle.Tensor): tensor_type = paddle.to_tensor([0]) @@ -129,6 +144,12 @@ def set_send_message(self, tensor): _send_recv_meta = SendRecvMeta() +def _is_valid_send_recv_partial(tensor, mp_degree): + tensor_numel = np.prod(tensor.shape) + assert tensor_numel != 0, "can't send/recv zero element" + return mp_degree > 1 and tensor_numel % mp_degree == 0 + + def send_partial(tensor, dst=0, nranks=1, @@ -138,9 +159,17 @@ def send_partial(tensor, if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id - return paddle.fluid.core.ops.partial_send( - tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, 'peer', - dst, 'num', nranks, 'id', rank_id) + + if _is_valid_send_recv_partial(tensor, nranks): + return _C_ops.partial_send(tensor.detach(), 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, 'peer', + dst, 'num', nranks, 'id', rank_id) + else: + return paddle.distributed.send( + tensor.detach(), + dst=dst, + group=group, + use_calc_stream=use_calc_stream) def recv_partial(tensor, @@ -153,10 +182,17 @@ def recv_partial(tensor, return ring_id = 0 if group is None else group.id - paddle.fluid.core.ops.partial_recv( - tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, 'peer', - src, 'num', nranks, 'id', rank_id, 'dtype', tensor.dtype, 'out_shape', - tensor.shape) + if _is_valid_send_recv_partial(tensor, nranks): + _C_ops.partial_recv(tensor.detach(), 'use_calc_stream', use_calc_stream, + 'ring_id', ring_id, 'peer', src, 'num', nranks, + 'id', rank_id, 'dtype', tensor.dtype, 'out_shape', + tensor.shape) + else: + paddle.distributed.recv( + tensor.detach(), + src=src, + group=group, + use_calc_stream=use_calc_stream) def allgather_partial(tensor, @@ -164,15 +200,15 @@ def allgather_partial(tensor, rank_id=0, group=None, use_calc_stream=True): - if nranks == 1: + if not _is_valid_send_recv_partial(tensor, nranks): return tensor if group is not None and not group.is_member(): return ring_id = 0 if group is None else group.id - return paddle.fluid.core.ops.partial_allgather_( - tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, - 'nranks', nranks, 'rank', rank_id) + return _C_ops.partial_allgather_(tensor.detach(), 'use_calc_stream', + use_calc_stream, 'ring_id', ring_id, + 'nranks', nranks, 'rank', rank_id) 
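The new `_is_valid_send_recv_partial` helper above gates the partial p2p ops: splitting a tensor across the model-parallel group only works when mp_degree > 1 and the element count divides evenly; otherwise the code falls back to a plain send/recv of the whole tensor. A standalone sketch of that decision (function names and shapes are illustrative):

```python
import numpy as np


def can_send_partial(shape, mp_degree):
    """Partial send/recv is only valid for a non-empty tensor whose numel splits evenly."""
    numel = int(np.prod(shape))
    assert numel != 0, "can't send/recv zero element"
    return mp_degree > 1 and numel % mp_degree == 0


def choose_p2p_path(shape, mp_degree):
    return "partial_send/partial_recv" if can_send_partial(shape, mp_degree) else "send/recv"


assert choose_p2p_path([16, 1024], mp_degree=4) == "partial_send/partial_recv"
assert choose_p2p_path([7, 13], mp_degree=4) == "send/recv"    # 91 % 4 != 0
assert choose_p2p_path([16, 1024], mp_degree=1) == "send/recv"  # no model parallelism
```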
def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): @@ -184,6 +220,8 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): # send / recv message recv_shape_msg = _send_recv_meta.recv_shape_message recv_dtype_msg = _send_recv_meta.recv_dtype_message + recv_stop_gradient = _send_recv_meta.recv_stop_gradient + send_shape_msg = _send_recv_meta.send_shape_message send_dtype_msg = _send_recv_meta.send_dtype_message @@ -196,13 +234,16 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): if isinstance(recv_shape_msg, tuple): tensor_recv_prev = [] for idx, shape in enumerate(recv_shape_msg): - tensor_recv_prev.append( - paddle.empty( - shape=shape, dtype=number_2_dtype(recv_dtype_msg[idx]))) + tmp = paddle.empty( + shape=shape, dtype=number_2_dtype(recv_dtype_msg[idx])) + tmp.stop_gradient = recv_stop_gradient[idx] + tensor_recv_prev.append(tmp) tensor_recv_prev = tuple(tensor_recv_prev) else: + tensor_recv_prev = paddle.empty( shape=recv_shape_msg, dtype=number_2_dtype(recv_dtype_msg)) + tensor_recv_prev.stop_gradient = recv_stop_gradient if recv_next: if isinstance(send_shape_msg, tuple): diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/utils/recompute.py index 78503baf2fd5d2..89b14258c195ca 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/utils/recompute.py @@ -145,23 +145,25 @@ def backward(ctx, *args): # run backward() with only tensor that requires grad forward_outputs_with_grad = [] - backward_inputs = list(args) + # NOTE In Transformer-like network, if user put the attention mask into the recompute segment output, + # pylayer will force the stop_gradient of attention mask to be False, which will make the number of + # tensor that need grad does not match. + # the following backward_inputs_with_grad is used to avoid this case. + backward_inputs_with_grad = [] for i in range(len(outputs)): if isinstance(outputs[i], core.VarBase) and not outputs[i].stop_gradient: forward_outputs_with_grad.append(outputs[i]) + backward_inputs_with_grad.append(args[i]) + if len(forward_outputs_with_grad) == 0: raise RuntimeError( "none of output has requires_grad=True, this recompute() is not necessary" ) - assert len(backward_inputs) == len( - forward_outputs_with_grad - ), "number of forward outputs is [{}], but the backward got [{}] inputs".format( - len(forward_outputs_with_grad), len(backward_inputs)) - # actually backward - paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) + paddle.autograd.backward(forward_outputs_with_grad, + backward_inputs_with_grad) grads = list(inp._grad_ivar() for inp in detached_inputs if isinstance(inp, core.VarBase)) diff --git a/python/paddle/fluid/backward.py b/python/paddle/fluid/backward.py index 5c2f305c8dca0c..8bf27f6d2fd988 100755 --- a/python/paddle/fluid/backward.py +++ b/python/paddle/fluid/backward.py @@ -16,6 +16,7 @@ from .proto import framework_pb2 from paddle.fluid import framework as framework +from paddle.fluid import program_guard from . 
import core import collections import copy @@ -944,6 +945,13 @@ def _append_backward_ops_with_checkpoints_( for op_desc in reversed(added_descs): grad_op_desc, op_grad_to_var = core.get_grad_op_desc( op_desc, cpt.to_text(no_grad_dict[block.idx]), []) + + # Set device for grad_op according to forward Op + if op_desc.has_attr(device_attr_name): + op_device = op_desc.attr(device_attr_name) + for g_op_desc in grad_op_desc: + g_op_desc._set_attr(device_attr_name, op_device) + for key in var_name_dict: _rename_arg_(grad_op_desc, key, var_name_dict[key]) grad_op_descs.extend(grad_op_desc) @@ -2015,3 +2023,72 @@ def gradients(targets, inputs, target_gradients=None, no_grad_set=None): outs = calc_gradient(targets, inputs, target_gradients, no_grad_set) return _as_list(outs) + + +@framework.static_only +def gradients_with_optimizer(program, optimizer, inputs=None, outputs=None): + """ + :api_attr: Static Graph + + Backpropagate the gradients of the program and apply the gradients with the given optimizer. + + Args: + program (Program): The input program. + optimizer (Optimizer): The optimizer to apply the gradients. + inputs (Tensor|list[Tensor]|tuple[Tensor], optional): The input Tensors. + If None, the inputs will be created from the input variables in the given program. Default:None. + outputs (Tensor|list[Tensor]|tuple[Tensor], optional): The output Tensors. + If None, the outputs will be created from the output variables in the given program. Default: None. + + Return: + tuple: tuple (optimize_ops, params_grads), A list of operators appended + by gradients_with_optimizer and a list of (param, grad) variable pairs, param is + ``Parameter``, grad is the gradient value corresponding to the parameter. + The returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and + ``fetch_list`` before run, see details in ``Executor``. + + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + + img = static.data(name='image', shape=[None, 784]) + pred = static.nn.fc(x=img, size=10, activation='relu') + loss = paddle.mean(pred) + opt_ops, pram_grads = paddle.fluid.backward.gradients_with_optimizer(static.default_main_program(), opt) + print(opt_ops) + + """ + check_type(program, 'program', paddle.fluid.Program, + 'paddle.static.gradients_with_optimizer') + check_type(optimizer, 'optimizer', paddle.optimizer.Optimizer, + 'paddle.static.gradients_with_optimizer') + + if inputs is None or outputs is None: + in_set = set() + out_set = set() + for block in program.blocks: + for op in block.ops: + for name in op.input_arg_names: + in_set.add(block.vars[name]) + for name in op.output_arg_names: + out_set.add(block.vars[name]) + if inputs is None: + inputs = list(in_set.difference(out_set)) + if outputs is None: + outputs = list(out_set.difference(in_set)) + + grads = gradients(outputs, inputs) + + with program_guard(program, None): + pram_grads = [(pram, grad) for pram, grad in zip(inputs, grads) + if isinstance(pram, paddle.fluid.framework.Parameter) and + grad is not None] + + optimize_ops = optimizer.apply_gradients(pram_grads) + + return optimize_ops, pram_grads diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 8fd01509331e20..04fb45cd3ae22d 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -19,11 +19,15 @@ import warnings import functools +import paddle from . import layers from . import framework from . 
import core from . import name_scope from .dygraph import base as imperative_base +from .data_feeder import check_variable_and_dtype +from .framework import in_dygraph_mode +from .layer_helper import LayerHelper __all__ = [ 'set_gradient_clip', 'ErrorClipByValue', 'ClipGradByValue', @@ -31,6 +35,30 @@ ] +def _squared_l2_norm(x): + r""" + This OP returns the squared L2 norm of a tensor. + """ + + if core.is_compiled_with_npu() or core.is_compiled_with_xpu(): + square = layers.square(x) + sum_square = layers.reduce_sum(square) + return sum_square + + if in_dygraph_mode(): + return core.ops.squared_l2_norm(x) + + op_type = 'squared_l2_norm' + check_variable_and_dtype(x, 'x', ['float32'], op_type) + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(x.dtype) + + inputs = {"X": x} + outputs = {'Out': out} + helper.append_op(type=op_type, inputs=inputs, outputs=outputs) + return out + + class BaseErrorClipAttr(object): def __str__(self): raise NotImplementedError() @@ -258,18 +286,18 @@ class ClipGradByNorm(ClipGradBase): .. math:: Out = - \\left \{ - \\begin{aligned} - & X & & if (norm(X) \\leq clip\_norm) \\\\ - & \\frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\\\ - \\end{aligned} - \\right. + \left\{ + \begin{array}{ccl} + X & & if (norm(X) \leq clip\_norm) \\ + \frac{clip\_norm*X}{norm(X)} & & if (norm(X) > clip\_norm) \\ + \end{array} + \right. where :math:`norm(X)` represents the L2 norm of :math:`X`. .. math:: - norm(X) = ( \\sum_{i=1}^{n}|x\_i|^2)^{ \\frac{1}{2}} + norm(X) = ( \sum_{i=1}^{n}|x\_i|^2)^{ \frac{1}{2}} Note: ``need_clip`` of ``ClipGradByNorm`` HAS BEEN DEPRECATED since 2.0. @@ -361,7 +389,7 @@ class ClipGradByGlobalNorm(ClipGradBase): .. math:: - t\_list[i] = t\_list[i] * \\frac{clip\_norm}{\max(global\_norm, clip\_norm)} + t\_list[i] = t\_list[i] * \frac{clip\_norm}{\max(global\_norm, clip\_norm)} where: @@ -416,8 +444,8 @@ def _dygraph_clip(self, params_grads): if g.type == core.VarDesc.VarType.SELECTED_ROWS: merge_grad = layers.merge_selected_rows(g) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) - sum_square = layers.reduce_sum(square) + + sum_square = _squared_l2_norm(merge_grad) sum_square_list.append(sum_square) # all parameters have been filterd out @@ -439,6 +467,7 @@ def _dygraph_clip(self, params_grads): if getattr(p, 'need_clip', True) is False: params_and_grads.append((p, g)) continue + # TODO(wangxi): use inplace elementwise_mul new_grad = layers.elementwise_mul(x=g, y=clip_var) params_and_grads.append((p, new_grad)) @@ -460,8 +489,7 @@ def _static_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows( merge_grad) - square = layers.square(merge_grad) - sum_square = layers.reduce_sum(input=square) + sum_square = _squared_l2_norm(merge_grad) sum_square_list.append(sum_square) # all parameters have been filterd out @@ -489,9 +517,14 @@ def _static_clip(self, params_grads): continue with p.block.program._optimized_guard([p, g]): - new_grad = layers.elementwise_mul(x=g, y=scale_var) - param_new_grad_name_dict[p.name] = new_grad.name - params_and_grads.append((p, new_grad)) + # inplace + p.block.append_op( + type='elementwise_mul', + inputs={'X': g, + 'Y': scale_var}, + outputs={'Out': g}) + param_new_grad_name_dict[p.name] = g.name + params_and_grads.append((p, g)) _correct_clip_op_role_var(params_and_grads, param_new_grad_name_dict) return params_and_grads @@ -513,8 +546,7 @@ def _process_context(self, context, param, grad): merge_grad = 
layers.merge_selected_rows(grad) merge_grad = layers.get_tensor_from_selected_rows(merge_grad) - square = layers.square(merge_grad) - local_norm_var = layers.reduce_sum(input=square) + local_norm_var = _squared_l2_norm(merge_grad) context[self.group_name].append(local_norm_var) self.context = context @@ -532,10 +564,14 @@ def _create_operators(self, param, grad): assert group_scale_var.shape == (1, ) self.context[group_scale_name] = group_scale_var - new_grad = layers.elementwise_mul( - x=grad, y=self.context[group_scale_name]) + # inplace + param.block.append_op( + type='elementwise_mul', + inputs={'X': grad, + 'Y': self.context[group_scale_name]}, + outputs={'Out': grad}) - return param, new_grad + return param, grad @framework.dygraph_not_support @@ -709,7 +745,7 @@ def _correct_clip_op_role_var(params_grads, param_new_grad_name_dict): continue block_id_list.append(block_id) for op in param.block.program.global_block().ops: - if 'op_namescope' in op.all_attrs() and "gradient_clip" in op.attr( + if op.has_attr("op_namescope") and "gradient_clip" in op.attr( "op_namescope") and op.attr('op_role_var'): param_name = op.attr('op_role_var')[0] if param_name in param_new_grad_name_dict: diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py index 37fe1e505f02d9..703146736e3c18 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_lists.py @@ -150,6 +150,8 @@ def _update_list(self): 'c_identity', 'c_concat', 'c_allreduce_sum', + 'concat', + 'split', } # The set of ops that don't support fp16 calculation diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py index 16dfb2bd50c141..5978d3829aecae 100644 --- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py +++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py @@ -110,6 +110,27 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): cast_name = in_var.name + '.cast_' + _dtype_to_str(dest_dtype) out_var = block.vars.get(cast_name) if out_var is None or out_var.dtype != dest_dtype: + op_device = op.attr('op_device') + # NOTE(wangxi): optimize for pipeline, reduce one send. + # if in_var is stop_gradient and prev_op device is `all`, + # set cast_op device to `all`, can reduce send cast_var. + # TODO: need remove this after we unified the dynamic + # and static pipeline interface. 
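The fp16_utils NOTE just above motivates a small pipeline optimization: when the fp32 input of an inserted cast is stop_gradient and its producer runs on every stage (its op_device contains "all"), the cast op inherits that device string so each stage casts locally instead of receiving the cast result. A reduced sketch of that device decision (the device strings and argument names are example values, not Paddle's exact attribute format):

```python
def pick_cast_device(consumer_device, is_fp32_input, stop_gradient, prev_op_device):
    """Return the op_device to put on an inserted cast op.

    Default to the consumer's placement; only hoist the cast onto every
    pipeline stage when the input is a stop_gradient fp32 var whose producer
    already runs on all stages.
    """
    if is_fp32_input and stop_gradient and prev_op_device and "all" in prev_op_device:
        return prev_op_device
    return consumer_device


# the attention-mask case: produced on all stages, consumed in fp16 on stage 1
assert pick_cast_device("gpu:1", True, True, "gpu:all") == "gpu:all"
# an ordinary activation keeps the consumer's placement
assert pick_cast_device("gpu:1", True, False, "gpu:0") == "gpu:1"
```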
+ if src_dtype == core.VarDesc.VarType.FP32 and in_var.stop_gradient: + prev_op = None + if in_var.op is op: + prev_op = find_true_prev_op(block.ops, op, + in_var_name) + elif in_var.op is not None: + prev_op = in_var.op + + prev_op_device = None + if prev_op is not None: + prev_op_device = prev_op.attr('op_device') + + if prev_op_device is not None and 'all' in prev_op_device: + op_device = prev_op_device + out_var = block.create_var( name=cast_name, dtype=dest_dtype, @@ -124,7 +145,7 @@ def _insert_cast_op(block, op, idx, src_dtype, dest_dtype): attrs={ "in_dtype": in_var.dtype, "out_dtype": out_var.dtype, - "op_device": op.attr("op_device") + "op_device": op_device }) num_cast_ops += 1 _rename_arg(op, in_var.name, out_var.name) diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 5996e752c8c22d..06f3f5f3afa750 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -578,6 +578,7 @@ def _sample_mse(self): var_tensor = _load_variable_data(self._scope, var_name) var_tensor = var_tensor.flatten() abs_max_value = float(np.max(np.abs(var_tensor))) + abs_max_value = 1e-8 if abs_max_value == 0.0 else abs_max_value s = 0.3 if var_name not in self._best_mse_loss: self._best_mse_loss[var_name] = float('inf') diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index b3b12a477e2a0a..9917730daa543f 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -1312,6 +1312,7 @@ def _insert_post_dequant_op(self, graph, op_node): assert self._is_float( scale_v), 'The scale of parameter %s is not a float.' % ( original_var_name) + scale_v = 1e-8 if scale_v == 0.0 else scale_v max_range *= param_range / scale_v else: max_range *= act_range @@ -1413,6 +1414,7 @@ def _clip(x, scale): x[:, i] = _clip(x[:, i], s) x[:, i] = np.round(x[:, i] / s * bnt) else: + scale = 1e-8 if scale == 0.0 else scale x = _clip(x, scale) x = np.round(x / scale * bnt) return x diff --git a/python/paddle/fluid/dygraph/amp/auto_cast.py b/python/paddle/fluid/dygraph/amp/auto_cast.py index bd464450aef7f4..a7eb0d31b7f858 100644 --- a/python/paddle/fluid/dygraph/amp/auto_cast.py +++ b/python/paddle/fluid/dygraph/amp/auto_cast.py @@ -90,6 +90,17 @@ def _update_list(custom_white_list, custom_black_list): return _white_list, _black_list +def _in_amp_guard(): + """ + Judge whether current code block is in `amp_guard` context. 
+ """ + tracer = _dygraph_tracer() + if tracer: + return tracer._enable_autocast + else: + return False + + @signature_safe_contextmanager @dygraph_only def amp_guard(enable=True, custom_white_list=None, custom_black_list=None): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py index fe70fd1094f581..e2fcf4f2c2712e 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/assert_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index 29eee429ef66ab..74f946acedb27f 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -18,7 +18,7 @@ # It provides a compatibility layer between the AST of various Python versions, # as produced by ast.parse from the standard ast module. # See details in https://github.com/serge-sans-paille/gast/ -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py index 5ea1fdfac0928a..acf2c3ec09b5d5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/basic_api_transformer.py @@ -13,7 +13,7 @@ # limitations under the License. import astor -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static import utils diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py index cb0383b9f73623..401ad1c8e84e45 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index c2481d16825ec8..3e606139245d60 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -13,7 +13,7 @@ # limitations under the License. 
from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py index 1171b5dbdfa22a..ef2d062d2d0187 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/cast_transformer.py @@ -13,7 +13,7 @@ # limitations under the License. from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py index 272d480c5b7a20..98045b3aae4322 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/grad_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast import warnings from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py index 5bc1c3d96d9c95..8fc5a691d212c2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ifelse_transformer.py @@ -22,7 +22,7 @@ # It provides a compatibility layer between the AST of various Python versions, # as produced by ast.parse from the standard ast module. 
# See details in https://github.com/serge-sans-paille/gast/ -import gast +from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import create_funcDef_node, ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py index a3311765a996f6..e041fe7c9ac37c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/list_transformer.py @@ -15,7 +15,7 @@ from __future__ import print_function import astor -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py index 8470e895dd3c89..e5c093f9a9255c 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/logical_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code cmpop_type_to_str = { diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index 14bb54983b524a..9859feb9d90792 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -15,7 +15,7 @@ from __future__ import print_function import copy -import gast +from paddle.utils import gast from collections import defaultdict from paddle.fluid import unique_name diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py index b2f4060b106828..0670c048c5e26b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/origin_info.py @@ -17,7 +17,7 @@ import collections import inspect -import gast +from paddle.utils import gast from paddle.fluid import core from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap from paddle.fluid.framework import Program diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index a99a5d50813719..e275ee04858f9a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -17,7 +17,7 @@ import six import paddle -from paddle.fluid import framework, backward, core +from paddle.fluid import framework, backward, core, program_guard from paddle.fluid.dygraph import layers from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import logging_utils @@ -26,6 +26,9 @@ from paddle.fluid.layers.utils import pack_sequence_as from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.compiler import BuildStrategy +from paddle.fluid.contrib.mixed_precision.decorator import AutoMixedPrecisionLists +from paddle.fluid.contrib.mixed_precision.fp16_utils import rewrite_program +from paddle.fluid.dygraph.amp.auto_cast import _in_amp_guard 
import paddle.compat as cpt from paddle import _C_ops @@ -149,6 +152,9 @@ def __init__(self, main_program, inputs, outputs, parameters=None, self._double_grads = self._get_double_grads(self._origin_main_program) self.training = True + # For AMP training + self._amp_list = AutoMixedPrecisionLists() + @LazyInitialized def _infer_program(self): """ @@ -168,6 +174,25 @@ def _train_program(self): return train_program + @LazyInitialized + @switch_to_static_graph + def _infer_amp_program(self): + """ + Lazy initialized property of infer_amp_program. + """ + infer_amp_program = self._origin_main_program.clone() + with program_guard(infer_amp_program): + rewrite_program(infer_amp_program, self._amp_list) + + return infer_amp_program + + @LazyInitialized + def _train_amp_program(self): + """ + Lazy initialized property of train_amp_program. + """ + return self._append_backward_desc(self._infer_amp_program) + @LazyInitialized def _infer_program_id(self): return _hash_with_id(self._infer_program, self) @@ -180,6 +205,14 @@ def _train_program_id(self): return program_id + @LazyInitialized + def _train_amp_program_id(self): + program_id = _hash_with_id(self._train_amp_program, self) + core._set_cached_executor_build_strategy(program_id, + self._build_strategy) + + return program_id + def _verify_program(self, main_program): """ Verify that the program parameter is initialized, prune some unused params, @@ -241,12 +274,17 @@ def _get_double_grads(self, program): double_grads.append(var_base) return self._valid_vars(double_grads) + def _get_end_op_index(self): + infer_program = self._infer_amp_program if _in_amp_guard( + ) else self._infer_program + return infer_program.desc.block(0).op_size() + def __call__(self, inputs): in_vars, out_vars = self._prepare(inputs) attrs = ('global_block', self.program.desc.block(0), 'start_op_index', - 0, 'end_op_index', self._infer_program.desc.block(0).op_size(), - 'is_test', not self.training, 'program_id', self.program_id) + 0, 'end_op_index', self._get_end_op_index(), 'is_test', + not self.training, 'program_id', self.program_id) _C_ops.run_program( self._valid_vars(in_vars), self._valid_vars(self._params), @@ -258,11 +296,19 @@ def __call__(self, inputs): @property def program(self): - return self._train_program if self.training else self._infer_program + if self.training: + return self._train_amp_program if _in_amp_guard( + ) else self._train_program + else: + return self._infer_program @property def program_id(self): - return self._train_program_id if self.training else self._infer_program_id + if self.training: + return self._train_amp_program_id if _in_amp_guard( + ) else self._train_program_id + else: + return self._infer_program_id def _prepare(self, inputs): """ diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py index 9d1ec35764b090..7960617369e3f2 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/print_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper, StaticAnalysisVisitor diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 3664c4b0016449..58aac8e266fedd 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py 
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -15,7 +15,7 @@ from __future__ import print_function import collections -import gast +from paddle.utils import gast import inspect import six import textwrap diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 4bcd49dc8e1577..0c7a8bf421a128 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py index cbe6b8a0ff9428..ce5f50137b7aa9 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/static_analysis.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast from .utils import is_paddle_api, is_dygraph_api, is_numpy_api, index_in_list __all__ = ['AstNodeWrapper', 'NodeVarType', 'StaticAnalysisVisitor'] diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py index eb53d7ec9bec89..0bc167132e3ed7 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/tensor_shape_transformer.py @@ -15,7 +15,7 @@ from __future__ import print_function import copy -import gast +from paddle.utils import gast from paddle.fluid import unique_name from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 351a9dcfa3aa2a..650857eefb3bb1 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -19,7 +19,7 @@ import atexit import copy import collections -import gast +from paddle.utils import gast import inspect import os import six diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index c7844f160cee5a..b118eeadf7e7e5 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -15,7 +15,7 @@ from __future__ import print_function import six -import gast +from paddle.utils import gast from paddle.fluid import core from paddle.fluid import unique_name diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index de722e6e16c894..608e85acec3f27 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -1151,9 +1151,6 @@ def forward(self, input): class BatchNorm(layers.Layer): r""" - :alias_main: paddle.nn.BatchNorm - :alias: paddle.nn.BatchNorm,paddle.nn.layer.BatchNorm,paddle.nn.layer.norm.BatchNorm - :old_api: paddle.fluid.dygraph.BatchNorm This interface is used to construct a callable object of the ``BatchNorm`` class. For more details, refer to code examples. 
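The hunks that follow only re-escape the LaTeX in this docstring; the math itself is ordinary mini-batch normalization. As a quick numeric check of those formulas (an illustrative sketch, not part of the patch):

import numpy as np

x = np.array([1., 2., 3., 4.], dtype='float32')  # a toy mini-batch
mu = x.mean()                                    # mini-batch mean
var = x.var()                                    # mini-batch variance
x_hat = (x - mu) / np.sqrt(var + 1e-5)           # normalize
y = 1.0 * x_hat + 0.0                            # scale (gamma=1) and shift (beta=0)
print(y.mean(), y.var())                         # approximately 0 and 1

With gamma = 1 and beta = 0 the normalized output has roughly zero mean and unit variance, which is exactly what the docstring equations below state.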
@@ -1164,16 +1161,16 @@ class BatchNorm(layers.Layer): Internal Covariate Shift `_ for more details. - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad & + //\ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad & + //\ mini-batch\ variance \\ - :math:`x` : mini-batch data - :math:`m` : the size of the mini-batch data @@ -1191,13 +1188,14 @@ class BatchNorm(layers.Layer): .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift + - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_channels(int): Indicate the number of channels of the input ``Tensor``. @@ -3011,9 +3009,9 @@ class SpectralNorm(layers.Layer): .. math:: - \mathbf{v} := \\frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} + \mathbf{v} := \frac{\mathbf{W}^{T} \mathbf{u}}{\|\mathbf{W}^{T} \mathbf{u}\|_2} - \mathbf{u} := \\frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2} + \mathbf{u} := \frac{\mathbf{W}^{T} \mathbf{v}}{\|\mathbf{W}^{T} \mathbf{v}\|_2} Step 3: Calculate :math:`\sigma(\mathbf{W})` and normalize weight values. @@ -3022,7 +3020,7 @@ class SpectralNorm(layers.Layer): \sigma(\mathbf{W}) = \mathbf{u}^{T} \mathbf{W} \mathbf{v} - \mathbf{W} = \\frac{\mathbf{W}}{\sigma(\mathbf{W})} + \mathbf{W} = \frac{\mathbf{W}}{\sigma(\mathbf{W})} Refer to `Spectral Normalization `_ . diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 2247d49483035c..02f9fd1a95e2b2 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -2035,6 +2035,11 @@ def __init__(self, del op_attrs[role_var_name] if len(self.desc.type()) != 0: + # NOTE(Aurelius84): prog.clone() will lead that var.op is always None, + # we add this to fix the problem. 
+ for arg in self.desc.output_arg_names(): + if block.has_var(arg) and block.var(arg).op is None: + block.var(arg).op = self return if type is None: raise ValueError( diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index cebb5e77ac636f..dc1e56f13f3b1d 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4890,6 +4890,7 @@ def split(input, num_or_sections, dim=-1, name=None): if isinstance(dim, Variable): dim = dim.numpy() dim = dim.item(0) + assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0" dim = (len(input.shape) + dim) if dim < 0 else dim attrs += ('axis', dim) @@ -4951,6 +4952,7 @@ def _get_SectionsTensorList(one_list): dim.stop_gradient = True inputs['AxisTensor'] = dim else: + assert len(input.shape) + dim >= 0, "(rank(x) + axis) must >= 0" dim = (len(input_shape) + dim) if dim < 0 else dim attrs['axis'] = dim @@ -7097,9 +7099,9 @@ def dice_loss(input, label, epsilon=0.00001, name=None): .. math:: - dice\_loss &= 1 - \\frac{2 * intersection\_area}{total\_area} \\\\ - &= \\frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\\\ - &= \\frac{(union\_area - intersection\_area)}{total\_area} + dice\_loss &= 1 - \frac{2 * intersection\_area}{total\_area} \\ + &= \frac{(total\_area - intersection\_area) - intersection\_area}{total\_area} \\ + &= \frac{(union\_area - intersection\_area)}{total\_area} Parameters: @@ -13065,8 +13067,8 @@ def log_loss(input, label, epsilon=1e-4, name=None): .. math:: - Out = -label * \\log{(input + \\epsilon)} - - (1 - label) * \\log{(1 - input + \\epsilon)} + Out = -label * \log{(input + \epsilon)} + - (1 - label) * \log{(1 - input + \epsilon)} Args: input (Tensor|list): A 2-D tensor with shape [N x 1], where N is the @@ -14500,17 +14502,17 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): .. 
math:: - dkernel[0] &= dilations[0] \\times (kernel\_sizes[0] - 1) + 1 + dkernel[0] &= dilations[0] \times (kernel\_sizes[0] - 1) + 1 - dkernel[1] &= dilations[1] \\times (kernel\_sizes[1] - 1) + 1 + dkernel[1] &= dilations[1] \times (kernel\_sizes[1] - 1) + 1 - hout &= \\frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 + hout &= \frac{H + paddings[0] + paddings[2] - dkernel[0]}{strides[0]} + 1 - wout &= \\frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 + wout &= \frac{W + paddings[1] + paddings[3] - dkernel[1]}{strides[1]} + 1 - Cout &= C \\times kernel\_sizes[0] \\times kernel\_sizes[1] + Cout &= C \times kernel\_sizes[0] \times kernel\_sizes[1] - Lout &= hout \\times wout + Lout &= hout \times wout Parameters: diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py index ef168d2d921751..ab3dbad1ef326d 100755 --- a/python/paddle/fluid/optimizer.py +++ b/python/paddle/fluid/optimizer.py @@ -6031,7 +6031,7 @@ def _offload(self, loss, startup_program=None): self._main_program = loss.block.program self.block = loss.block if startup_program == None: - startup_program = fluid.default_startup_program() + startup_program = paddle.static.default_startup_program() with program_guard(self._main_program, startup_program): assert len(self.checkpoint_shape) > 0, ( diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index e7172507696ec0..007221ca4f9ca3 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -686,6 +686,8 @@ add_subdirectory(asp) add_subdirectory(ir) +add_subdirectory(interpreter) + if (WITH_TESTING) set_property(TEST test_parallel_executor_mnist PROPERTY ENVIRONMENT GLOG_vmodule=all_reduce_deps_pass=10) set_property(TEST test_parallel_executor_fix_op_run_order PROPERTY ENVIRONMENT GLOG_vmodule=fix_op_run_order_pass=10) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py index 62b6ac171a4c96..31a50226f0b79e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ast_util.py @@ -16,7 +16,7 @@ import unittest import textwrap -import gast +from paddle.utils import gast import inspect import numpy as np import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py index ea745ad6614253..b86b85bb90ff69 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_basic_api_transformation.py @@ -17,7 +17,7 @@ import numpy as np import unittest import inspect -import gast +from paddle.utils import gast import paddle import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py index 8423c056b2d830..95b5235aaa3d0a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py @@ -184,7 +184,7 @@ def test_optim_break_in_while(x): class TestContinueInFor(unittest.TestCase): def setUp(self): - self.input = 
np.zeros((1)).astype('int32') + self.input = np.zeros((1)).astype('int64') self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() self.init_dygraph_func() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py index 7ea6aa8907c282..975797a487be72 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ifelse_basic.py @@ -16,7 +16,7 @@ import unittest import textwrap -import gast +from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import get_name_ids from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.static_analysis import NodeVarType diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py index 2ed2a273341805..385b7ce204a869 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logging_utils.py @@ -20,7 +20,7 @@ import sys import unittest -import gast +from paddle.utils import gast import paddle from paddle.fluid.dygraph.dygraph_to_static import logging_utils diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py index c7193eb2a77bc8..b11e9441c8c0e2 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_logical.py @@ -18,7 +18,7 @@ import unittest -import gast +from paddle.utils import gast import numpy as np import paddle diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index fe86d5d636811e..8116c04f2034fe 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -14,7 +14,7 @@ from __future__ import print_function -import gast +from paddle.utils import gast import inspect import numpy as np import paddle diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py index 8a21c4cfd0eca8..cac64c7391351b 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist.py @@ -32,6 +32,9 @@ SEED = 2020 +if paddle.fluid.is_compiled_with_cuda(): + paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True}) + class SimpleImgConvPool(fluid.dygraph.Layer): def __init__(self, @@ -48,7 +51,7 @@ def __init__(self, conv_dilation=1, conv_groups=1, act=None, - use_cudnn=False, + use_cudnn=True, param_attr=None, bias_attr=None): super(SimpleImgConvPool, self).__init__() @@ -101,7 +104,6 @@ def __init__(self): loc=0.0, scale=scale)), act="softmax") - @paddle.jit.to_static def forward(self, inputs, label=None): x = self.inference(inputs) if label is not None: @@ -167,14 +169,14 @@ def test_mnist_declarative_cpu_vs_mkldnn(self): dygraph_loss_cpu, dygraph_loss_mkldnn)) def train(self, to_static=False): - prog_trans = ProgramTranslator() - prog_trans.enable(to_static) loss_data = [] with 
fluid.dygraph.guard(self.place):
             fluid.default_main_program().random_seed = SEED
             fluid.default_startup_program().random_seed = SEED
             mnist = MNIST()
+            if to_static:
+                mnist = paddle.jit.to_static(mnist)
             adam = AdamOptimizer(
                 learning_rate=0.001, parameter_list=mnist.parameters())
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py
new file mode 100644
index 00000000000000..d2160ca641665e
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mnist_amp.py
@@ -0,0 +1,94 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle
+import unittest
+import numpy as np
+from time import time
+from test_mnist import MNIST, TestMNIST, SEED
+from paddle.jit import ProgramTranslator
+from paddle.fluid.optimizer import AdamOptimizer
+
+if paddle.fluid.is_compiled_with_cuda():
+    paddle.fluid.set_flags({'FLAGS_cudnn_deterministic': True})
+
+
+class TestAMP(TestMNIST):
+    def train_static(self):
+        return self.train(to_static=True)
+
+    def train_dygraph(self):
+        return self.train(to_static=False)
+
+    def test_mnist_to_static(self):
+        dygraph_loss = self.train_dygraph()
+        static_loss = self.train_static()
+        # NOTE(Aurelius84): Static-graph AMP training uses a grep_list that
+        # dygraph AMP does not have, so the number of inserted cast ops
+        # differs and the loss shows a small numerical difference.
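The tolerance used just below follows from this note; a minimal sketch of the effect (illustrative only, not part of the patch), since every extra float16 cast rounds the activations slightly:

import numpy as np

x = np.random.rand(1000).astype('float32')
roundtrip = x.astype('float16').astype('float32')  # what an extra cast op does to activations
print(np.abs(x - roundtrip).max())                 # on the order of 1e-4

For values of order 1 the round-off is about 1e-4 per cast, so atol=1e-3 leaves headroom for a few differing casts between the two modes.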
+        self.assertTrue(
+            np.allclose(
+                dygraph_loss, static_loss, atol=1e-3),
+            msg='dygraph is {}\n static_res is \n{}'.format(dygraph_loss,
+                                                            static_loss))
+
+    def train(self, to_static=False):
+        paddle.seed(SEED)
+        mnist = MNIST()
+
+        if to_static:
+            print("Successfully applied @to_static.")
+            mnist = paddle.jit.to_static(mnist)
+
+        adam = AdamOptimizer(
+            learning_rate=0.001, parameter_list=mnist.parameters())
+
+        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
+
+        loss_data = []
+        for epoch in range(self.epoch_num):
+            start = time()
+            for batch_id, data in enumerate(self.train_reader()):
+                dy_x_data = np.array([x[0].reshape(1, 28, 28)
+                                      for x in data]).astype('float32')
+                y_data = np.array(
+                    [x[1] for x in data]).astype('int64').reshape(-1, 1)
+
+                img = paddle.to_tensor(dy_x_data)
+                label = paddle.to_tensor(y_data)
+                label.stop_gradient = True
+
+                with paddle.amp.auto_cast():
+                    prediction, acc, avg_loss = mnist(img, label=label)
+
+                scaled = scaler.scale(avg_loss)
+                scaled.backward()
+                scaler.minimize(adam, scaled)
+
+                loss_data.append(avg_loss.numpy()[0])
+                # clear gradients for the next batch
+                mnist.clear_gradients()
+                if batch_id % 10 == 0:
+                    print(
+                        "Loss at epoch {} step {}: loss: {:}, acc: {}, cost: {}"
+                        .format(epoch, batch_id,
+                                avg_loss.numpy(), acc.numpy(), time() - start))
+                    start = time()
+                if batch_id == 50:
+                    break
+        return loss_data
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
index 2ea3e369099109..9e12b6fa208505 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_program_translator.py
@@ -15,7 +15,7 @@
 from __future__ import print_function

 import astor
-import gast
+from paddle.utils import gast
 import inspect
 import numpy as np
 import textwrap
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
index 0fffb0c985375b..7f6d6cf1f3b005 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_static_analysis.py
@@ -14,7 +14,7 @@
 from __future__ import print_function

-import gast
+from paddle.utils import gast
 import inspect
 import numpy as np
 import paddle
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
index 9f677d765f9ab2..3431c6aac4cbef 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_variable_trans_func.py
@@ -14,7 +14,7 @@
 from __future__ import print_function

-import gast
+from paddle.utils import gast
 import unittest
 import numpy as np
diff --git a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
index b336330836a66c..62b1a8b1da6797 100644
--- a/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
+++ b/python/paddle/fluid/tests/unittests/hybrid_parallel_pp_transformer.py
@@ -54,13 +54,17 @@ def forward(self, x):
         attention_mask = paddle.tensor.triu(
             (paddle.ones(
                 (length, length), dtype="float32") * -1e9), 1)
-
attention_mask.stop_gradient = True + + no_used = paddle.ones((3, 3), dtype="int32") + w_emb = self.word_embeddings(x) p_emb = self.position_embeddings(x) w_emb = w_emb + p_emb + attention_mask.stop_gradient = True + no_used.stop_gradient = True # need to fix bug of backward() - return w_emb, attention_mask + return w_emb, attention_mask, no_used, p_emb class TransformerNet(Layer): @@ -99,12 +103,12 @@ def forward(self, x): class TransformerNetPipe(TransformerNet): def forward(self, args): - x, mask = args[0], args[1] + x, mask, no_used, p_emb = args[0], args[1], args[2], args[3] output = super().forward(x, mask) - output = output + output = output + p_emb mask.stop_gradient = True - return output, mask + return output, mask, no_used, p_emb class CriterionPipe(Layer): @@ -175,6 +179,8 @@ def test_pp_model(self): loss = model.train_batch([x, x], optimizer, scheduler) # TODO(shenliang03) add utest for loss + print("loss: ", loss) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt new file mode 100644 index 00000000000000..7692f8befdf58c --- /dev/null +++ b/python/paddle/fluid/tests/unittests/interpreter/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_INTERP_CASES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_INTERP_CASES "${TEST_INTERP_CASES}") + +foreach(target ${TEST_INTERP_CASES}) + py_test_modules(${target} MODULES ${target}) +endforeach() diff --git a/python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py b/python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py new file mode 100644 index 00000000000000..bb18d28e48b67d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/interpreter/test_interpreter.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +from paddle.fluid import core +from paddle.fluid.core import InterpreterCore + +import numpy as np + +paddle.enable_static() + + +class LinearTestCase(unittest.TestCase): + def setUp(self): + self.place = paddle.CUDAPlace(0) if core.is_compiled_with_cuda( + ) else paddle.CPUPlace() + + def test_interp_base(self): + a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') + b = paddle.ones([2, 2]) * 2 + t = paddle.static.nn.fc(a, 2) + c = t + b + + main_program = paddle.fluid.default_main_program() + startup_program = paddle.fluid.default_startup_program() + p = core.Place() + p.set_place(self.place) + inter_core = InterpreterCore(p, main_program.desc, startup_program.desc, + core.Scope()) + + out = inter_core.run({ + "a": np.ones( + [2, 2], dtype="float32") * 2 + }, [c.name]) + for i in range(10): + out = inter_core.run({ + "a": np.ones( + [2, 2], dtype="float32") * i + }, [c.name]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py index 85054be534eeba..76dc605c3ecd27 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape_op.py @@ -80,6 +80,33 @@ def setUp(self): class TRTReshapeTest2(TRTReshapeTest): + def setUp(self): + self.bs = 2 + self.input_shape = [23, 13, 24] + self.reshape = [2, 0, -1, 12] + self.data_shape = [ + self.bs, self.input_shape[0], self.input_shape[1], + self.input_shape[2] + ] + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name='data', shape=self.data_shape, dtype='float32') + actual_reshape = fluid.data( + name='actual_reshape', shape=[4], dtype='int32') + reshape_out = fluid.layers.reshape( + x=data, shape=self.reshape, actual_shape=actual_reshape) + out = fluid.layers.batch_norm(reshape_out, is_test=True) + self.feeds = { + 'data': np.random.random(self.data_shape).astype('float32'), + 'actual_reshape': np.array([2, 0, -1, 12]).astype('int32') + } + self.enable_trt = True + self.trt_parameters = TRTReshapeTest.TensorRTParam( + 1 << 30, self.bs, 1, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + +class TRTReshapeTest3(TRTReshapeTest): def setUp(self): self.bs = 1 self.input_shape = [14, 48, 27] diff --git a/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py new file mode 100644 index 00000000000000..a18b8a03075ef8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_atan_op_npu.py @@ -0,0 +1,87 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 1024 + + +class TestAtan(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "atan" + self.place = paddle.NPUPlace(0) + + self.dtype = np.float32 + np.random.seed(SEED) + self.shape = [11, 17] + x = np.random.uniform(0.1, 1, self.shape).astype(self.dtype) + out = np.arctan(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def set_attrs(self): + pass + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + def test_out_name(self): + with fluid.program_guard(fluid.Program()): + np_x = np.array([0.1]) + data = fluid.layers.data(name="X", shape=[1]) + out = paddle.atan(data, name='Y') + place = paddle.NPUPlace(0) + exe = fluid.Executor(place) + result, = exe.run(feed={"X": np_x}, fetch_list=[out]) + expected = np.arctan(np_x) + self.assertEqual(result, expected) + + def test_dygraph(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + np_x = np.array([0.1]) + x = fluid.dygraph.to_variable(np_x) + z = paddle.atan(x).numpy() + z_expected = np.arctan(np_x) + self.assertEqual(z, z_expected) + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestAtanShape(TestAtan): + def set_attrs(self): + self.shape = [12, 23, 10] + + +class TestAtanFloat16(TestAtan): + def set_attrs(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py index d8c22e2da09077..66ce81756fc9d8 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_compare_op_npu.py @@ -142,11 +142,12 @@ def test_attr_name(self): globals()[cls_name] = Cls -for _type_name in {'float16', 'float32', 'int32'}: - if _type_name == 'int32': +for _type_name in {'float16', 'float32', 'int32', 'int64', 'bool'}: + if _type_name == 'int32' or _type_name == 'bool': create_test_class('equal', _type_name, lambda _a, _b: _a == _b) continue create_test_class('equal', _type_name, lambda _a, _b: _a == _b) + create_test_class('not_equal', _type_name, lambda _a, _b: _a != _b) create_test_class('less_than', _type_name, lambda _a, _b: _a < _b) create_test_class('less_equal', _type_name, lambda _a, _b: _a <= _b) create_test_class('greater_than', _type_name, lambda _a, _b: _a > _b) diff --git a/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py new file mode 100644 index 00000000000000..9b29fc812faedd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_cos_op_npu.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestCos(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "cos" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [11, 17]).astype(self.dtype) + out = np.cos(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-7) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestCosFp16(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "cos" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(1, 2, [3, 4]).astype(self.dtype) + out = np.cos(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestCosNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + c = paddle.multiply(a, b) + d = paddle.cos(c) + + fc_1 = fluid.layers.fc(input=d, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py index 
5288db5ceb1c6f..9b27e75e37d255 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_elementwise_add_op_npu.py @@ -13,14 +13,16 @@ # limitations under the License. from __future__ import print_function - import numpy as np import unittest import sys sys.path.append("..") -from op_test import OpTest, _set_use_system_allocator -import paddle + +from paddle.fluid import Program, program_guard +import paddle.fluid.core as core import paddle.fluid as fluid +import paddle +from op_test import OpTest, skip_check_grad_ci paddle.enable_static() @@ -63,6 +65,9 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( self.place, ['X', 'Y'], @@ -70,6 +75,9 @@ def test_check_grad_normal(self): max_relative_error=0.006, ) def test_check_grad_ingore_x(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( self.place, ['Y'], @@ -78,6 +86,9 @@ def test_check_grad_ingore_x(self): max_relative_error=0.006, ) def test_check_grad_ingore_y(self): + if self.dtype == np.float16: + return + self.check_grad_with_place( self.place, ['X'], @@ -86,6 +97,47 @@ def test_check_grad_ingore_y(self): max_relative_error=0.006, ) +class TestFP16ElementwiseAddOp(TestElementwiseAddOp): + def init_dtype(self): + self.dtype = np.float16 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseAddOp_scalar(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestFP16ElementwiseAddOp_scalar(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1,1) to test broadcast.") +class TestElementwiseAddOp_scalar2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1,1) to test broadcast.") +class TestFP16ElementwiseAddOp_scalar2(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 4).astype(self.dtype) + self.y = np.random.rand(1, 1).astype(self.dtype) + self.out = self.x + self.y + + class TestAddAPI(unittest.TestCase): def test_name(self): with paddle.static.program_guard(paddle.static.Program()): @@ -148,5 +200,385 @@ def test_errors(self): self.assertRaises(TypeError, paddle.add, x2, y2) +class TestElementwiseAddOp_Vector(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.random((100, )).astype(self.dtype) + self.y = np.random.random((100, )).astype(self.dtype) + self.out = np.add(self.x, self.y) + + +class TestFP16ElementwiseAddOp_Vector(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.random((100, )).astype(self.dtype) + self.y = np.random.random((100, )).astype(self.dtype) + self.out = np.add(self.x, self.y) + + +class TestElementwiseAddOp_broadcast_0(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 
3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestFP16ElementwiseAddOp_broadcast_0(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOp_broadcast_1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 100, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 100, 1) + + def init_axis(self): + self.axis = 1 + + +class TestFP16ElementwiseAddOp_broadcast_1(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 100, 3).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 100, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_broadcast_2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 100) + + +class TestFP16ElementwiseAddOp_broadcast_2(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(100).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1, 100) + + +class TestElementwiseAddOp_broadcast_3(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12, 1).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12, 1) + + def init_axis(self): + self.axis = 1 + + +class TestFP16ElementwiseAddOp_broadcast_3(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_broadcast_4(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype) + self.y = np.random.rand(100, 1).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestFP16ElementwiseAddOp_broadcast_4(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 1, 2).astype(self.dtype) + self.y = np.random.rand(100, 1).astype(self.dtype) + self.out = self.x + self.y.reshape(100, 1, 1, 1) + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOp_broadcast_5(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 12).astype(self.dtype) + self.y = np.random.rand(10, 1, 12).astype(self.dtype) + self.out = self.x + self.y + + +class TestFP16ElementwiseAddOp_broadcast_5(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 12).astype(self.dtype) + self.y = np.random.rand(10, 1, 12).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_broadcast_6(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype) + self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype) + self.out = self.x + self.y + + +class 
TestElementwiseAddOp_broadcast_7(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(1, 1, 20, 5).astype(self.dtype) + self.y = np.random.rand(20, 5, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + +class TestFP16ElementwiseAddOp_broadcast_6(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 12, 3, 5).astype(self.dtype) + self.y = np.random.rand(2, 12, 1, 5).astype(self.dtype) + self.out = self.x + self.y + + +class TestElementwiseAddOp_rowwise_add_0(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12) + + def init_axis(self): + self.axis = 1 + + +class TestFP16ElementwiseAddOp_rowwise_add_0(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 10, 12).astype(self.dtype) + self.y = np.random.rand(10, 12).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 10, 12) + + def init_axis(self): + self.axis = 1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestElementwiseAddOp_rowwise_add_1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1) + + def init_axis(self): + self.axis = 1 + + +@skip_check_grad_ci( + reason="[skip shape check] Use y_shape(1) to test broadcast.") +class TestFP16ElementwiseAddOp_rowwise_add_1(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 1).astype(self.dtype) + self.y = np.random.rand(1).astype(self.dtype) + self.out = self.x + self.y.reshape(1, 1) + + def init_axis(self): + self.axis = 1 + + +class TestElementwiseAddOp_channelwise_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestFP16ElementwiseAddOp_channelwise_add(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(100, 2, 3).astype(self.dtype) + self.y = np.random.rand(100, 1, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add1(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(1, 1, 100).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseFP16AddOp_commonuse_add1(TestFP16ElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(2, 3, 100).astype(self.dtype) + self.y = np.random.rand(1, 1, 100).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_commonuse_add2(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 3, 1, 4).astype(self.dtype) + self.y = np.random.rand(10, 1, 12, 1).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = -1 + + +class TestElementwiseAddOp_xsize_lessthan_ysize_add(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 12).astype(self.dtype) + self.y = np.random.rand(2, 2, 10, 12).astype(self.dtype) + self.out = self.x + self.y + + def 
init_axis(self): + self.axis = 2 + + +class TestElementwiseAddOp_same_shape_ysize_large(TestElementwiseAddOp): + def init_input_output(self): + self.x = np.random.rand(10, 1, 12).astype(self.dtype) + self.y = np.random.rand(10, 2, 12).astype(self.dtype) + self.out = self.x + self.y + + def init_axis(self): + self.axis = 0 + + +class TestElementwiseAddOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + # the input of elementwise_add must be Variable. + x1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + y1 = fluid.create_lod_tensor( + np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.NPUPlace(0)) + self.assertRaises(TypeError, fluid.layers.elementwise_add, x1, y1) + + # the input dtype of elementwise_add must be float16 or float32 or float64 or int32 or int64 + # float16 only can be set on GPU place + x2 = fluid.layers.data(name='x2', shape=[3, 4, 5, 6], dtype="uint8") + y2 = fluid.layers.data(name='y2', shape=[3, 4, 5, 6], dtype="uint8") + self.assertRaises(TypeError, fluid.layers.elementwise_add, x2, y2) + + +class TestAddApi(unittest.TestCase): + def _executed_api(self, x, y, name=None): + return paddle.add(x, y, name) + + def test_name(self): + with fluid.program_guard(fluid.Program()): + x = fluid.data(name="x", shape=[2, 3], dtype="float32") + y = fluid.data(name='y', shape=[2, 3], dtype='float32') + + y_1 = self._executed_api(x, y, name='add_res') + self.assertEqual(('add_res' in y_1.name), True) + + def test_declarative(self): + with fluid.program_guard(fluid.Program()): + + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = self._executed_api(x, y) + + place = fluid.NPUPlace(0) + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), fetch_list=[z.name]) + z_expected = np.array([3., 8., 6.]) + self.assertEqual((z_value == z_expected).all(), True) + + def test_dygraph(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + np_x = np.array([2, 3, 4]).astype('float64') + np_y = np.array([1, 5, 2]).astype('float64') + x = fluid.dygraph.to_variable(np_x) + y = fluid.dygraph.to_variable(np_y) + z = self._executed_api(x, y) + np_z = z.numpy() + z_expected = np.array([3., 8., 6.]) + self.assertEqual((np_z == z_expected).all(), True) + + +class TestAddInplaceApi(TestAddApi): + def _executed_api(self, x, y, name=None): + return x.add_(y, name) + + +class TestAddInplaceBroadcastSuccess(unittest.TestCase): + def init_data(self): + self.x_numpy = np.random.rand(2, 3, 4).astype('float') + self.y_numpy = np.random.rand(3, 4).astype('float') + + def test_broadcast_success(self): + paddle.disable_static(place=paddle.NPUPlace(0)) + self.init_data() + x = paddle.to_tensor(self.x_numpy) + y = paddle.to_tensor(self.y_numpy) + inplace_result = x.add_(y) + numpy_result = self.x_numpy + self.y_numpy + self.assertEqual((inplace_result.numpy() == numpy_result).all(), True) + paddle.enable_static() + + +class TestAddInplaceBroadcastSuccess2(TestAddInplaceBroadcastSuccess): + def init_data(self): + self.x_numpy = np.random.rand(1, 2, 3, 1).astype('float') + self.y_numpy = np.random.rand(3, 1).astype('float') + + +class TestAddInplaceBroadcastSuccess3(TestAddInplaceBroadcastSuccess): + def init_data(self): + self.x_numpy = np.random.rand(2, 3, 1, 5).astype('float') + self.y_numpy = np.random.rand(1, 3, 1, 5).astype('float') + + 
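The Success cases above and the Error cases below all follow one rule for in-place addition: y must broadcast up to x's shape without changing x. A rough sketch of that check (illustrative only, not part of the patch; the real validation happens inside the op):

def y_broadcasts_into_x(x_shape, y_shape):
    # y may have fewer dims than x; over the trailing dims, each of y's
    # dims must be 1 or equal to x's dim so x's shape is preserved in-place.
    if len(y_shape) > len(x_shape):
        return False
    for xd, yd in zip(reversed(x_shape), reversed(y_shape)):
        if yd != 1 and yd != xd:
            return False
    return True

print(y_broadcasts_into_x((2, 3, 4), (3, 4)))  # True,  a "Success" case above
print(y_broadcasts_into_x((3, 4), (2, 3, 4)))  # False, an "Error" case below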
+class TestAddInplaceBroadcastError(unittest.TestCase): + def init_data(self): + self.x_numpy = np.random.rand(3, 4).astype('float') + self.y_numpy = np.random.rand(2, 3, 4).astype('float') + + def test_broadcast_errors(self): + paddle.disable_static(place=paddle.NPUPlace(0)) + self.init_data() + x = paddle.to_tensor(self.x_numpy) + y = paddle.to_tensor(self.y_numpy) + + def broadcast_shape_error(): + x.add_(y) + + self.assertRaises(ValueError, broadcast_shape_error) + paddle.enable_static() + + +class TestAddInplaceBroadcastError2(TestAddInplaceBroadcastError): + def init_data(self): + self.x_numpy = np.random.rand(2, 1, 4).astype('float') + self.y_numpy = np.random.rand(2, 3, 4).astype('float') + + +class TestAddInplaceBroadcastError3(TestAddInplaceBroadcastError): + def init_data(self): + self.x_numpy = np.random.rand(5, 2, 1, 4).astype('float') + self.y_numpy = np.random.rand(2, 3, 4).astype('float') + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py new file mode 100644 index 00000000000000..99edc25f7696a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_expand_as_v2_op_npu.py @@ -0,0 +1,146 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +np.random.seed(10) + + +class TestExpandAsOpRank1(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_as_v2" + x = np.random.rand(100).astype("float32") + target_tensor = np.random.rand(2, 100).astype("float32") + self.inputs = {'X': x} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [2, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestExpandAsOpRank2(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_as_v2" + x = np.random.rand(10, 12).astype("float32") + target_tensor = np.random.rand(10, 12).astype("float32") + self.inputs = {'X': x} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [1, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestExpandAsOpRank3(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_as_v2" + x = np.random.rand(2, 3, 20).astype("float32") + target_tensor = np.random.rand(2, 3, 20).astype("float32") + self.inputs = {'X': x} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [1, 1, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestExpandAsOpRank4(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "expand_as_v2" + x = np.random.rand(1, 1, 7, 16).astype("float32") + target_tensor = np.random.rand(4, 6, 7, 16).astype("float32") + self.inputs = {'X': x} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [4, 6, 1, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +# Test python API +class TestExpandAsV2API(unittest.TestCase): + def test_api(self): + input1 = np.random.random([12, 14]).astype("float32") + input2 = np.random.random([2, 12, 14]).astype("float32") + x = fluid.layers.data( + name='x', shape=[12, 14], append_batch_size=False, dtype="float32") + + y = fluid.layers.data( + name='target_tensor', + shape=[2, 12, 14], + append_batch_size=False, + dtype="float32") + + out_1 = paddle.expand_as(x, y=y) + + exe = fluid.Executor(place=fluid.NPUPlace(0)) + res_1 = exe.run(fluid.default_main_program(), + feed={"x": input1, + "target_tensor": input2}, + fetch_list=[out_1]) + assert np.array_equal(res_1[0], np.tile(input1, (2, 1, 1))) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py new file mode 100755 index 
00000000000000..abe981399a9626 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_eye_op_npu.py @@ -0,0 +1,195 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +import paddle.fluid.framework as framework + +paddle.enable_static() +np.random.seed(10) + + +class TestEyeOp(OpTest): + def setUp(self): + ''' + Test eye op with specified shape + ''' + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "eye" + self.inputs = {} + + self.num_rows = 0 + self.num_columns = 0 + self.dtype = np.float32 + + self.initTestCase() + + if self.num_columns == 0: + self.attrs = { + 'num_rows': self.num_rows, + 'dtype': framework.convert_np_dtype_to_dtype_(self.dtype) + } + self.outputs = {'Out': np.eye(self.num_rows, dtype=self.dtype)} + else: + self.attrs = { + 'num_rows': self.num_rows, + 'num_columns': self.num_columns, + 'dtype': framework.convert_np_dtype_to_dtype_(self.dtype) + } + self.outputs = { + 'Out': np.eye(self.num_rows, self.num_columns, dtype=self.dtype) + } + + def initTestCase(self): + self.num_rows = 219 + self.num_columns = 319 + self.dtype = np.int32 + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestEyeOp1(TestEyeOp): + def initTestCase(self): + self.num_rows = 50 + + +class TestEyeOp2(TestEyeOp): + def initTestCase(self): + self.num_rows = 50 + self.dtype = np.int32 + + +class TestEyeOp3(TestEyeOp): + def initTestCase(self): + self.num_rows = 50 + self.dtype = np.float16 + + +class TestEyeOp4(TestEyeOp): + def initTestCase(self): + self.num_rows = 1 + self.num_columns = 99 + + +class TestEyeOp5(TestEyeOp): + def initTestCase(self): + self.num_rows = 100 + self.num_columns = 100 + + +class TestEyeOp6(TestEyeOp): + def initTestCase(self): + self.num_rows = 100 + self.num_columns = 100 + self.dtype = np.float32 + + +class API_TestTensorEye(unittest.TestCase): + def test_out(self): + with paddle.static.program_guard(paddle.static.Program()): + data = paddle.eye(10) + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + result, = exe.run(fetch_list=[data]) + expected_result = np.eye(10, dtype="float32") + self.assertEqual((result == expected_result).all(), True) + + with paddle.static.program_guard(paddle.static.Program()): + data = paddle.eye(10, num_columns=7, dtype="float16") + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + result, = exe.run(fetch_list=[data]) + expected_result = np.eye(10, 7, dtype="float16") + self.assertEqual((result == expected_result).all(), True) + + with paddle.static.program_guard(paddle.static.Program()): + data = paddle.eye(10, dtype="int32") + place = paddle.NPUPlace(0) + exe = paddle.static.Executor(place) + result, = 
exe.run(fetch_list=[data]) + expected_result = np.eye(10, dtype="int32") + self.assertEqual((result == expected_result).all(), True) + + paddle.disable_static(paddle.NPUPlace(0)) + out = paddle.eye(10, dtype="int32") + expected_result = np.eye(10, dtype="int32") + paddle.enable_static() + self.assertEqual((out.numpy() == expected_result).all(), True) + + paddle.disable_static(paddle.NPUPlace(0)) + batch_shape = [2] + out = fluid.layers.eye(10, 10, dtype="int32", batch_shape=batch_shape) + result = np.eye(10, dtype="int32") + expected_result = [] + for index in reversed(batch_shape): + tmp_result = [] + for i in range(index): + tmp_result.append(result) + result = tmp_result + expected_result = np.stack(result, axis=0) + paddle.enable_static() + self.assertEqual(out.numpy().shape == np.array(expected_result).shape, + True) + self.assertEqual((out.numpy() == expected_result).all(), True) + + paddle.disable_static(paddle.NPUPlace(0)) + batch_shape = [3, 2] + out = fluid.layers.eye(10, 10, dtype="int32", batch_shape=batch_shape) + result = np.eye(10, dtype="int32") + expected_result = [] + for index in reversed(batch_shape): + tmp_result = [] + for i in range(index): + tmp_result.append(result) + result = tmp_result + expected_result = np.stack(result, axis=0) + paddle.enable_static() + self.assertEqual(out.numpy().shape == np.array(expected_result).shape, + True) + self.assertEqual((out.numpy() == expected_result).all(), True) + + def test_errors(self): + with paddle.static.program_guard(paddle.static.Program()): + + def test_num_rows_type_check(): + paddle.eye(-1, dtype="int64") + + self.assertRaises(TypeError, test_num_rows_type_check) + + def test_num_columns_type_check(): + paddle.eye(10, num_columns=5.2, dtype="int64") + + self.assertRaises(TypeError, test_num_columns_type_check) + + def test_num_columns_type_check1(): + paddle.eye(10, num_columns=10, dtype="int8") + + self.assertRaises(TypeError, test_num_columns_type_check1) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py new file mode 100644 index 00000000000000..a687509e6ae9c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_any_like_op_npu.py @@ -0,0 +1,88 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
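+# fill_any_like produces an output with the same shape and dtype as X, filled with the
+# scalar attribute `value`; the expected results below are built with np.full(shape, value, dtype).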
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() + + +class TestFillAnyLikeNPUOp(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "fill_any_like" + self.dtype = np.float32 + self.shape = [2, 3, 4, 5] + self.value = 0.0 + + self.init() + + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} + self.attrs = {'value': self.value} + self.outputs = {'Out': np.full(self.shape, self.value, self.dtype)} + + def init(self): + pass + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillAnyLikeNPUOpInt32(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.int32 + self.value = -1 + + +class TestFillAnyLikeNPUOpFloat32(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.float32 + self.value = 0.09 + + +class TestFillAnyLikeNPUOpFloat16(TestFillAnyLikeNPUOp): + def init(self): + self.dtype = np.float16 + self.value = 0.05 + + +class TestFillAnyLikeNPUOpValue1(TestFillAnyLikeNPUOp): + def init(self): + self.value = 1.0 + + +class TestFillAnyLikeNPUOpValue2(TestFillAnyLikeNPUOp): + def init(self): + self.value = 1e-9 + + +class TestFillAnyLikeNPUOpShape(TestFillAnyLikeNPUOp): + def init(self): + self.shape = [12, 10] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py new file mode 100644 index 00000000000000..7736c85c87aa29 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_fill_constant_batch_size_like_op_npu.py @@ -0,0 +1,134 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
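+# fill_constant_batch_size_like takes the `shape` attribute, replaces dim `output_dim_idx`
+# with dim `input_dim_idx` of Input, and fills the result with `value` (or `str_value` when
+# it is non-empty); the cases below cover different shapes, dtypes and dim indices.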
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import core + +paddle.enable_static() +SEED = 2021 + + +class TestFillConstantBatchSizeLike(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "fill_constant_batch_size_like" + self.init_shape() + self.init_value() + self.init_dtype() + self.init_force_cpu() + self.init_dim_idx() + + self.inputs = { + 'Input': np.random.random(self.input_shape).astype("float32") + } + self.attrs = { + 'shape': self.shape, + 'value': self.value, + 'str_value': self.str_value, + 'dtype': self.dtype, + 'force_cpu': self.force_cpu, + 'input_dim_idx': self.input_dim_idx, + 'output_dim_idx': self.output_dim_idx + } + self.outputs = { + 'Out': np.full(self.output_shape, self.output_value, + self.output_dtype) + } + + def set_npu(self): + self.__class__.use_npu = True + + def init_shape(self): + self.input_shape = [4, 5] + self.shape = [123, 92] + self.output_shape = (4, 92) + + def init_value(self): + self.value = 3.8 + self.str_value = '' + self.output_value = 3.8 + + def init_dtype(self): + self.dtype = core.VarDesc.VarType.FP32 + self.output_dtype = np.float32 + + def init_force_cpu(self): + self.force_cpu = False + + def init_dim_idx(self): + self.input_dim_idx = 0 + self.output_dim_idx = 0 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestFillConstantBatchSizeLike2(TestFillConstantBatchSizeLike): + def init_shape(self): + # test shape + self.input_shape = [4, 5, 6, 7] + self.shape = [10, 123, 92] + self.output_shape = (4, 123, 92) + + +class TestFillConstantBatchSizeLike3(TestFillConstantBatchSizeLike): + def init_value(self): + # use 'str_value' rather than 'value' + self.value = 3.8 + self.str_value = '4.5' + self.output_value = 4.5 + + +class TestFillConstantBatchSizeLike6(TestFillConstantBatchSizeLike): + def init_dtype(self): + self.dtype = core.VarDesc.VarType.FP16 + self.output_dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-2) + + +class TestFillConstantBatchSizeLike7(TestFillConstantBatchSizeLike): + def init_dtype(self): + self.dtype = core.VarDesc.VarType.INT32 + self.output_dtype = np.int32 + + +class TestFillConstantBatchSizeLike8(TestFillConstantBatchSizeLike): + def init_force_cpu(self): + self.force_cpu = True + + +class TestFillConstantBatchSizeLike9(TestFillConstantBatchSizeLike): + def init_shape(self): + self.input_shape = [4, 5] + self.shape = [123, 92] + self.output_shape = (123, 4) + + def init_dim_idx(self): + self.input_dim_idx = 0 + self.output_dim_idx = 1 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py new file mode 100755 index 00000000000000..acd7ca770164e5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_flatten2_op_npu.py @@ -0,0 +1,82 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import sys +sys.path.append("..") +import numpy as np +import paddle +import paddle.fluid as fluid +from op_test import OpTest +paddle.enable_static() + + +class TestFlatten2Op(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "flatten2" + self.place = paddle.NPUPlace(0) + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype("float64")} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ["X"], "Out") + + def init_test_case(self): + self.in_shape = (3, 2, 4, 5) + self.axis = 1 + self.new_shape = (3, 40) + + def init_attrs(self): + self.attrs = {"axis": self.axis} + + +class TestFlatten2OpWithCornerAxis(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.axis = 0 + self.new_shape = (1, 120) + + +class TestFlatten2OpWithDefaultAxis(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (10, 2, 2, 3) + self.new_shape = (10, 12) + + def init_attrs(self): + self.attrs = {} + + +class TestFlatten2OpSixDims(TestFlatten2Op): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.axis = 4 + self.new_shape = (36, 16) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py new file mode 100644 index 00000000000000..88e711dcf068e6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_flatten_contiguous_range_op_npu.py @@ -0,0 +1,318 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
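+# flatten_contiguous_range collapses the dimensions in [start_axis, stop_axis] into one,
+# e.g. a (3, 2, 5, 4) input with start_axis=1 and stop_axis=2 becomes (3, 10, 4); the
+# expected outputs below are computed with a plain numpy reshape.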
+ +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() + + +class TestFlattenOp(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "flatten_contiguous_range" + self.place = paddle.NPUPlace(0) + + self.start_axis = 0 + self.stop_axis = -1 + self.dtype = np.float64 + self.init_test_case() + self.inputs = {"X": np.random.random(self.in_shape).astype(self.dtype)} + self.init_attrs() + self.outputs = { + "Out": self.inputs["X"].reshape(self.new_shape), + "XShape": np.random.random(self.in_shape).astype("float32") + } + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place, no_check_set=["XShape"]) + + def test_check_grad(self): + pass + + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = -1 + self.new_shape = (120) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_1(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 1 + self.stop_axis = 2 + self.new_shape = (3, 10, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_2(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_3(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 2 + self.new_shape = (30, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_4(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = -2 + self.stop_axis = -1 + self.new_shape = (3, 2, 20) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_5(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 2 + self.stop_axis = 2 + self.new_shape = (3, 2, 5, 4) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOpSixDims(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 3, 2, 4, 4) + self.start_axis = 3 + self.stop_axis = 5 + self.new_shape = (3, 2, 3, 32) + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_Float32(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.float32 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_int(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_uint8(TestFlattenOp): + def init_test_case(self): + 
self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.uint8 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_int8(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int8 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlattenOp_int64(TestFlattenOp): + def init_test_case(self): + self.in_shape = (3, 2, 5, 4) + self.start_axis = 0 + self.stop_axis = 1 + self.new_shape = (6, 5, 4) + self.dtype = np.int64 + + def init_attrs(self): + self.attrs = { + "start_axis": self.start_axis, + "stop_axis": self.stop_axis + } + + +class TestFlatten2OpError(unittest.TestCase): + def test_errors(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x = x.astype('float32') + + def test_ValueError1(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + out = paddle.flatten(x_var, start_axis=2, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError1) + + def test_ValueError2(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=10, stop_axis=1) + + self.assertRaises(ValueError, test_ValueError2) + + def test_ValueError3(): + x_var = paddle.static.data( + name="x", shape=image_shape, dtype='float32') + paddle.flatten(x_var, start_axis=2, stop_axis=10) + + self.assertRaises(ValueError, test_ValueError3) + + def test_type(): + # dtype must be float32, float64, int8, int32, int64, uint8. + x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. + x2 = x2.astype('float16') + x2_var = paddle.fluid.data( + name='x2', shape=[3, 2, 4, 5], dtype='float16') + paddle.flatten(x2_var) + + self.assertRaises(TypeError, test_type) + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + +class TestStaticFlattenPythonAPI(unittest.TestCase): + def execute_api(self, x, start_axis=0, stop_axis=-1): + return paddle.flatten(x, start_axis, stop_axis) + + def test_static_api(self): + paddle.enable_static() + np_x = np.random.rand(2, 3, 4, 4).astype('float32') + + main_prog = paddle.static.Program() + with paddle.static.program_guard(main_prog, paddle.static.Program()): + x = paddle.static.data( + name="x", shape=[2, 3, 4, 4], dtype='float32') + out = self.execute_api(x, start_axis=-2, stop_axis=-1) + + exe = paddle.static.Executor(place=paddle.NPUPlace(0)) + fetch_out = exe.run(main_prog, feed={"x": np_x}, fetch_list=[out]) + self.assertTrue((2, 3, 16) == fetch_out[0].shape) + + +class TestStaticInplaceFlattenPythonAPI(TestStaticFlattenPythonAPI): + def execute_api(self, x, start_axis=0, stop_axis=-1): + return x.flatten_(start_axis, stop_axis) + + +class TestFlattenPython(unittest.TestCase): + def test_python_api(self): + image_shape = (2, 3, 4, 4) + x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * + image_shape[3]).reshape(image_shape) / 100. 
+ x = x.astype('float32') + + def test_InputError(): + out = paddle.flatten(x) + + self.assertRaises(ValueError, test_InputError) + + def test_Negative(): + paddle.disable_static(paddle.NPUPlace(0)) + img = paddle.to_tensor(x) + out = paddle.flatten(img, start_axis=-2, stop_axis=-1) + return out.numpy().shape + + res_shape = test_Negative() + self.assertTrue((2, 3, 16) == res_shape) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py new file mode 100644 index 00000000000000..ff0d57d1d4da10 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_index_select_op_npu.py @@ -0,0 +1,153 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +from paddle.static import Program, program_guard + +paddle.enable_static() +SEED = 2021 + + +class TestNPUIndexSelect(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "index_select" + self.config() + + x_np = np.random.random(self.x_shape).astype(self.x_type) + index_np = np.random.randint( + low=0, high=self.x_shape[self.dim], size=self.index_size) + + # compute real output as baseline. + outer_loop = np.prod(self.x_shape[:self.dim]) + outer_loop = outer_loop.astype(self.index_type) + x_reshape = [outer_loop] + list(self.x_shape[self.dim:]) + x_np_reshape = np.reshape(x_np, tuple(x_reshape)) + + out_list = [] + for i in range(outer_loop): + for j in range(self.index_size): + out_list.append(x_np_reshape[i, index_np[j]]) + self.out_shape = list(self.x_shape) + self.out_shape[self.dim] = self.index_size + self.out_shape = tuple(self.out_shape) + out = np.reshape(out_list, self.out_shape) + + self.inputs = {'X': x_np, 'Index': index_np} + self.attrs = {'dim': self.dim} + self.outputs = {'Out': out} + + # todo: comment second line when index_select grad npu op is ready. + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + # todo: replace first line with second line when index_select grad npu op is ready. 
+ def test_check_grad(self): + pass + #self.check_grad_with_place(self.place, ['X'], 'Out') + + def config(self): + self.x_shape = (100, 4, 5) + self.x_type = np.float32 + self.dim = 1 + self.index_size = 100 + self.index_type = np.int64 + + +class TestNPUIndexSelectCase2(TestNPUIndexSelect): + def config(self): + self.dim = -2 + self.x_type = np.float32 + self.index_type = np.int32 + self.x_shape = (10, 10, 4, 10) + self.index_size = 10 + + +class TestNPUIndexSelectAPI(unittest.TestCase): + def input_data(self): + self.data_x = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], + [9.0, 10.0, 11.0, 12.0]]).astype('float32') + self.data_index = np.array([0, 1, 1]).astype('int32') + + def test_index_select_api(self): + paddle.set_device("npu:0") + paddle.enable_static() + self.input_data() + + # case 1: + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32') + index = paddle.static.data(name='index', shape=[3], dtype='int32') + z = paddle.index_select(x, index, axis=1) + exe = paddle.static.Executor(paddle.NPUPlace(0)) + res, = exe.run(feed={'x': self.data_x, + 'index': self.data_index}, + fetch_list=[z.name], + return_numpy=False) + expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0], + [9.0, 10.0, 10.0]]).astype('float32') + self.assertTrue(np.allclose(expect_out, np.array(res))) + + # case 2: + with program_guard(Program(), Program()): + x = paddle.static.data(name='x', shape=[-1, 4], dtype='float32') + index = paddle.static.data(name='index', shape=[3], dtype='int32') + z = paddle.index_select(x, index) + exe = paddle.static.Executor(paddle.NPUPlace(0)) + res, = exe.run(feed={'x': self.data_x, + 'index': self.data_index}, + fetch_list=[z.name], + return_numpy=False) + expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], + [5.0, 6.0, 7.0, 8.0]]).astype('float32') + self.assertTrue(np.allclose(expect_out, np.array(res))) + + def test_dygraph_index_select_api(self): + paddle.set_device("npu:0") + paddle.disable_static() + self.input_data() + + # case 1: + x = paddle.to_tensor(self.data_x) + index = paddle.to_tensor(self.data_index) + z = paddle.index_select(x, index) + np_z = z.numpy() + expect_out = np.array([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], + [5.0, 6.0, 7.0, 8.0]]).astype('float32') + self.assertTrue(np.allclose(expect_out, np_z)) + + # case 2: + x = paddle.to_tensor(self.data_x) + index = paddle.to_tensor(self.data_index) + z = paddle.index_select(x, index, axis=1) + np_z = z.numpy() + expect_out = np.array([[1.0, 2.0, 2.0], [5.0, 6.0, 6.0], + [9.0, 10.0, 10.0]]).astype('float32') + self.assertTrue(np.allclose(expect_out, np_z)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py new file mode 100644 index 00000000000000..c92fffb2d26cbf --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_one_hot_op_npu.py @@ -0,0 +1,193 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import sys +import unittest +import numpy as np +sys.path.append("..") + +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.framework import Program, program_guard + +paddle.enable_static() + + +class TestOneHotOp(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_attr(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_default_dtype(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + depth_np = np.array(10).astype('int32') + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} + self.attrs = {} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_default_dtype_attr(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_out_of_range(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + 
self.op_type = 'one_hot' + depth = 10 + x_lod = [[4, 1, 3, 3]] + x = [np.random.choice([-1, depth]) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int32').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth, 'allow_out_of_range': True} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +class TestOneHotOp_dtype_int64(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def setUp(self): + self.set_npu() + self.op_type = 'one_hot' + depth = 10 + dimension = 12 + x_lod = [[4, 1, 3, 3]] + x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] + x = np.array(x).astype('int64').reshape([sum(x_lod[0]), 1]) + + out = np.zeros(shape=(np.product(x.shape[:-1]), + depth)).astype('float32') + + for i in range(np.product(x.shape)): + out[i, x[i]] = 1.0 + + self.inputs = {'X': (x, x_lod)} + self.attrs = {'depth': depth} + self.outputs = {'Out': (out, x_lod)} + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0), check_dygraph=False) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py index a5d078ced28767..f6c346159b8bee 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_max_op_npu.py @@ -127,8 +127,6 @@ def setUp(self): 'out_dtype': int(core.VarDesc.VarType.INT16) } - self.out = self.inputs['X'].max(axis=tuple(self.attrs['dim'])) - self.outputs = { 'Out': self.inputs['X'].max(axis=tuple(self.attrs['dim'])).astype(np.int16) @@ -195,9 +193,6 @@ def setUp(self): 'dim': [-2, -1], 'out_dtype': int(core.VarDesc.VarType.FP16) } - - self.out = self.inputs['X'].max(axis=tuple(self.attrs['dim'])) - self.outputs = { 'Out': self.inputs['X'].max( axis=tuple(self.attrs['dim'])).astype(np.float16) diff --git a/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py new file mode 100644 index 00000000000000..59f181be5edacb --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_reduce_prod_op_npu.py @@ -0,0 +1,235 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
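+# The expected outputs below are numpy references: X.prod(axis=tuple(dim)), cast to the
+# requested `out_dtype` when one is given; `reduce_all` multiplies over every element.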
+ +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid.framework import convert_np_dtype_to_dtype_ + +paddle.enable_static() + + +class TestNPUReduceProd(OpTest): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + +class TestNPUReduceProd2(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {} # default 'dim': [0] + self.outputs = {'Out': self.inputs['X'].prod(axis=tuple([0]))} + + +class TestNPUReduceProd3(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + # self.attrs = {'dim': [0]} + self.outputs = {'Out': self.inputs['X'].prod(axis=tuple([0]))} + + +class TestNPUReduceProd6D(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = { + 'X': np.random.random((5, 6, 2, 3, 4, 2)).astype(self.dtype) + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + +class TestNPUReduceProd8D(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = { + 'X': np.random.random((2, 5, 3, 2, 2, 3, 4, 2)).astype(self.dtype) + } + self.attrs = {'dim': [2, 3, 4]} + self.outputs = { + 'Out': self.inputs['X'].prod(axis=tuple(self.attrs['dim'])) + } + + +class TestReduceAll(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'reduce_all': True} + self.outputs = {'Out': self.inputs['X'].prod()} + + +class TestNPUReduceProdWithOutDtype_bool(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.BOOL)} + self.outputs = { + 'Out': + self.inputs['X'].prod(axis=tuple(self.attrs['dim'])).astype(np.bool) + } + + +class TestNPUReduceProdWithOutDtype_int16(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT16)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.int16) + } + + +class TestNPUReduceProdWithOutDtype_int32(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + 
self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT32)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.int32) + } + + +class TestNPUReduceProdWithOutDtype_int64(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.INT64)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.int64) + } + + +class TestNPUReduceProdWithOutDtype_fp16(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP16)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.float16) + } + + def test_check_output(self): + self.check_output_with_place(self.place, atol=1e-3) + + +class TestNPUReduceProdWithOutDtype_fp32(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + +class TestNPUReduceProdWithOutDtype_fp64(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP64)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.float64) + } + + +@skip_check_grad_ci(reason="right now not implement grad op") +class TestNPUReduceProdWithOutDtype_fp32_2(TestNPUReduceProd): + def setUp(self): + self.op_type = "reduce_prod" + self.set_npu() + self.init_dtype() + + self.inputs = {'X': np.random.random((5, 6, 10)).astype(self.dtype)} + self.attrs = {'dim': [0], 'out_dtype': int(core.VarDesc.VarType.FP32)} + self.outputs = { + 'Out': self.inputs['X'].prod( + axis=tuple(self.attrs['dim'])).astype(np.float32) + } + + def init_dtype(self): + self.dtype = np.float16 + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py new file mode 100644 index 00000000000000..601a351c015f32 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_relu6_op_npu.py @@ -0,0 +1,166 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
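+# relu6 clips its input to [0, threshold] (threshold defaults to 6.0), i.e.
+# out = min(max(x, 0), threshold); ref_relu6 below is the numpy reference used by the tests.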
+ +from __future__ import print_function +import paddle.fluid as fluid +import paddle +from op_test import OpTest + +import numpy as np +import unittest +import sys +sys.path.append("..") + +paddle.enable_static() +SEED = 2021 + + +def ref_relu6(x, threshold=6.0): + out = np.copy(x) + out[np.abs(x - threshold) < 0.005] = threshold + 0.02 + out = np.minimum(np.maximum(x, 0), threshold) + return out + + +class TestRelu6(OpTest): + def setUp(self): + self.set_npu() + self.op_type = "relu6" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(-1, 10, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'threshold': 6.0} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad_with_place(self.place, ['X'], 'Out') + + def init_dtype(self): + self.dtype = np.float32 + + +class TestRelu6Float16(TestRelu6): + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def set_attrs(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestReluNeg(TestRelu6): + def setUp(self): + self.set_npu() + self.op_type = "relu6" + self.place = paddle.NPUPlace(0) + + self.init_dtype() + np.random.seed(SEED) + x = np.random.uniform(-10, -1, [10, 12]).astype(self.dtype) + x[np.abs(x) < 0.005] = 0.02 + out = ref_relu6(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.attrs = {'threshold': 6.0} + self.outputs = {'Out': out} + + def set_npu(self): + self.__class__.use_npu = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place) + + +class TestRelu6Net(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(32, 32)).astype('float32') + b_np = np.random.random(size=(32, 32)).astype('float32') + label_np = np.random.randint(2, size=(32, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') + b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64') + + sum = paddle.add(a, b) + z = paddle.nn.functional.relu6(sum) + + fc_1 = fluid.layers.fc(input=z, size=128) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, + "b": b_np, + "label": label_np}, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + 
cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py b/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py new file mode 100644 index 00000000000000..21440de9fddd13 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_sequence_mask_npu.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.framework import convert_np_dtype_to_dtype_, Program, program_guard + +paddle.enable_static() + + +class SequenceMaskTestBase(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def initDefaultParameters(self): + self.op_type = 'sequence_mask' + self.maxlen = 10 + self.mask_dtype = 'int64' + self.x = [[0, 3, 4], [5, 7, 9]] + + def initParameters(self): + pass + + def setUp(self): + self.set_npu() + self.initDefaultParameters() + self.initParameters() + if not isinstance(self.x, np.ndarray): + self.x = np.array(self.x) + + self.inputs = {'X': self.x} + self.outputs = {'Y': self.calc_ground_truth_mask()} + self.attrs = { + 'maxlen': self.maxlen, + 'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype) + } + + def calc_ground_truth_mask(self): + maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen + shape = self.x.shape + (maxlen, ) + index_broadcast = np.broadcast_to( + np.reshape( + range(maxlen), newshape=[1] * self.x.ndim + [-1]), + shape=shape) + x_broadcast = np.broadcast_to( + np.reshape( + self.x, newshape=self.x.shape + (-1, )), shape=shape) + return (index_broadcast < x_broadcast).astype(self.mask_dtype) + + def test_check_output(self): + self.check_output_with_place(paddle.NPUPlace(0)) + + +class SequenceMaskTest1(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'bool' + + +class SequenceMaskTest2(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'uint8' + + +class SequenceMaskTest3(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'int32' + + +class SequenceMaskTest4(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'float32' + + +class SequenceMaskTest5(SequenceMaskTestBase): + def initParameters(self): + self.mask_dtype = 'float64' + + +class SequenceMaskTest6(SequenceMaskTestBase): + def initParameters(self): + self.maxlen = -1 + + +class SequenceMaskTestBase_tensor_attr(OpTest): + def set_npu(self): + self.__class__.use_npu = True + + def initDefaultParameters(self): + self.op_type = 'sequence_mask' + self.maxlen = 10 + self.maxlen_tensor = np.ones((1), 'int32') * 10 + self.mask_dtype = 'int64' + 
self.x = [[0, 3, 4], [5, 7, 9]] + + def initParameters(self): + pass + + def setUp(self): + self.set_npu() + self.initDefaultParameters() + self.initParameters() + if not isinstance(self.x, np.ndarray): + self.x = np.array(self.x) + + self.inputs = {'X': self.x, 'MaxLenTensor': self.maxlen_tensor} + self.outputs = {'Y': self.calc_ground_truth_mask()} + self.attrs = {'out_dtype': convert_np_dtype_to_dtype_(self.mask_dtype)} + + def calc_ground_truth_mask(self): + maxlen = np.max(self.x) if self.maxlen < 0 else self.maxlen + shape = self.x.shape + (maxlen, ) + index_broadcast = np.broadcast_to( + np.reshape( + range(maxlen), newshape=[1] * self.x.ndim + [-1]), + shape=shape) + x_broadcast = np.broadcast_to( + np.reshape( + self.x, newshape=self.x.shape + (-1, )), shape=shape) + return (index_broadcast < x_broadcast).astype(self.mask_dtype) + + def test_check_output(self): + self.check_output() + + +class SequenceMaskTest1_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'bool' + + +class SequenceMaskTest2_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'uint8' + + +class SequenceMaskTest3_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'int32' + + +class SequenceMaskTest4_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'float32' + + +class SequenceMaskTest5_tensor_attr(SequenceMaskTestBase_tensor_attr): + def initParameters(self): + self.mask_dtype = 'float64' + + +class TestSequenceMaskOpError(unittest.TestCase): + def test_errors(self): + with program_guard(Program(), Program()): + input_data = np.random.uniform(1, 5, [4]).astype("float32") + + def test_Variable(): + # the input must be Variable + fluid.layers.sequence_mask(input_data, maxlen=4) + + self.assertRaises(TypeError, test_Variable) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py new file mode 100644 index 00000000000000..d3ee8df1cd106f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_squared_l2_norm_op_npu.py @@ -0,0 +1,57 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
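+# squared_l2_norm reduces X to a single value equal to the sum of its squared elements;
+# the test below uses np.square(LA.norm(X)) as the numpy reference.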
+ +from __future__ import print_function + +import numpy as np +import unittest +from numpy import linalg as LA +import sys +sys.path.append("..") +from op_test import OpTest +import paddle + +paddle.enable_static() + + +class TestL2LossOp(OpTest): + """Test npu squared_l2_norm + """ + + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "squared_l2_norm" + self.max_relative_error = 0.05 + + X = np.random.uniform(-1, 1, (13, 19)).astype("float32") + X[np.abs(X) < self.max_relative_error] = 0.1 + self.inputs = {'X': X} + self.outputs = {'Out': np.square(LA.norm(X))} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(place=self.place) + + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], + 'Out', + max_relative_error=self.max_relative_error) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py new file mode 100755 index 00000000000000..0da80189f7d406 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_tile_op_npu.py @@ -0,0 +1,245 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
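+# tile repeats X along each dimension according to repeat_times, which may be a plain list,
+# a list containing tensors, or a single tensor; every case below checks against np.tile.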
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid +from paddle.fluid import compiler, Program, program_guard +from paddle.fluid import core + +paddle.enable_static() +np.random.seed(10) + + +#Situation 1: repeat_times is a list (without tensor) +class TestTileOpRank1(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.init_data() + + self.inputs = {'X': np.random.random(self.ori_shape).astype("float32")} + self.attrs = {'repeat_times': self.repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +#with dimension expanding +class TestTileOpRank2Expanding(TestTileOpRank1): + def init_data(self): + self.ori_shape = [120] + self.repeat_times = [2, 2] + + +class TestTileOpRank2(TestTileOpRank1): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + +class TestTileOpRank3_Corner(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (1, 1, 1) + + +class TestTileOpRank3_Corner2(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 10, 5) + self.repeat_times = (2, 2) + + +class TestTileOpRank3(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 15) + self.repeat_times = (2, 1, 4) + + +class TestTileOpRank4(TestTileOpRank1): + def init_data(self): + self.ori_shape = (2, 4, 5, 7) + self.repeat_times = (3, 2, 1, 2) + + +# Situation 2: repeat_times is a list (with tensor) +class TestTileOpRank1_tensor_attr(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.init_data() + repeat_times_tensor = [] + for index, ele in enumerate(self.repeat_times): + repeat_times_tensor.append(("x" + str(index), np.ones( + (1)).astype('int32') * ele)) + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype("float32"), + 'repeat_times_tensor': repeat_times_tensor, + } + self.attrs = {"repeat_times": self.infer_repeat_times} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + self.infer_repeat_times = [-1] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestTileOpRank2_Corner_tensor_attr(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [1, 1] + self.infer_repeat_times = [1, -1] + + +class TestTileOpRank2_attr_tensor(TestTileOpRank1_tensor_attr): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + self.infer_repeat_times = [-1, 3] + + +# Situation 3: repeat_times is a tensor +class TestTileOpRank1_tensor(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.init_data() + + self.inputs = { + 'X': np.random.random(self.ori_shape).astype("float32"), + 'RepeatTimes': np.array(self.repeat_times).astype("int32"), + } + self.attrs = {} + output = np.tile(self.inputs['X'], self.repeat_times) + self.outputs = 
{'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def init_data(self): + self.ori_shape = [100] + self.repeat_times = [2] + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + pass + + +class TestTileOpRank2_tensor(TestTileOpRank1_tensor): + def init_data(self): + self.ori_shape = [12, 14] + self.repeat_times = [2, 3] + + +# Situation 4: input x is Integer +class TestTileOpInteger(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.inputs = { + 'X': np.random.randint( + 10, size=(4, 4, 5)).astype("int32") + } + self.attrs = {'repeat_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Situation 5: input x is Integer +class TestTileOpInt64_t(OpTest): + def setUp(self): + self.set_npu() + self.place = paddle.NPUPlace(0) + self.op_type = "tile" + self.inputs = { + 'X': np.random.randint( + 10, size=(2, 4, 5)).astype("int32") + } + self.attrs = {'repeat_times': [2, 1, 4]} + output = np.tile(self.inputs['X'], (2, 1, 4)) + self.outputs = {'Out': output} + + def set_npu(self): + self.__class__.use_npu = True + + def test_check_output(self): + self.check_output_with_place(self.place) + + +# Test python API +class TestTileAPI(unittest.TestCase): + def test_api(self): + with fluid.dygraph.guard(paddle.NPUPlace(0)): + np_x = np.random.random([12, 14]).astype("float32") + x = paddle.to_tensor(np_x) + + positive_2 = np.array([2]).astype("int32") + positive_2 = paddle.to_tensor(positive_2) + + repeat_times = np.array([2, 3]).astype("int32") + repeat_times = paddle.to_tensor(repeat_times) + + out_1 = paddle.tile(x, repeat_times=[2, 3]) + out_2 = paddle.tile(x, repeat_times=[positive_2, 3]) + out_3 = paddle.tile(x, repeat_times=repeat_times) + + assert np.array_equal(out_1.numpy(), np.tile(np_x, (2, 3))) + assert np.array_equal(out_2.numpy(), np.tile(np_x, (2, 3))) + assert np.array_equal(out_3.numpy(), np.tile(np_x, (2, 3))) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py new file mode 100755 index 00000000000000..a8242be855c80a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_top_k_v2_op_npu.py @@ -0,0 +1,343 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
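+# top_k_v2 returns the k largest (or smallest, when largest=False) entries and their indices
+# along `axis`; numpy_topk below builds the reference with np.argsort/np.sort and takes the
+# first k entries.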
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid.core as core + + +def numpy_topk(x, k=1, axis=-1, largest=True): + if axis < 0: + axis = len(x.shape) + axis + if largest: + indices = np.argsort(-x, axis=axis) + else: + indices = np.argsort(x, axis=axis) + if largest: + value = -np.sort(-x, axis=axis) + else: + value = np.sort(x, axis=axis) + indices = indices.take(indices=range(0, k), axis=axis) + value = value.take(indices=range(0, k), axis=axis) + return value, indices + + +class TestTopkV2NPUOp(OpTest): + def setUp(self): + paddle.enable_static() + self.op_type = "top_k_v2" + + self.set_npu() + self.set_dtype() + self.set_input_data() + self.set_attrs() + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + self.outputs = {'Out': output, 'Indices': indices} + + def set_dtype(self): + self.dtype = np.int32 + + def set_attrs(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def set_input_data(self): + self.input_data = np.random.choice( + 10000, size=(10, 20), replace=False).astype(self.dtype) + + def test_check_output(self): + self.__class__.no_need_check_grad = True + if self.dtype == np.float32: + self.check_output_with_place(self.place, atol=1e-3) + else: + self.check_output_with_place(self.place) + + def set_npu(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + + +class TestTopkV2OpFloat16(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(3, 4).astype(self.dtype) + + +class TestTopkV2OP1Int32(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 3 + self.axis = 0 + self.largest = False + + +class TestTopkV2OP2Int32(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 4 + self.axis = 0 + self.largest = False + + +class TestTopkV2OP3Int32(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 6 + self.axis = 1 + self.largest = True + + +class TestTopkV2OP4Int32(TestTopkV2NPUOp): + def set_attrs(self): + self.k = 3 + self.axis = 1 + self.largest = True + + +class TestTopkV2Op1Int64(TestTopkV2OP1Int32): + def set_dtype(self): + self.dtype = np.int64 + + +class TestTopkV2Op2Int64(TestTopkV2OP2Int32): + def set_dtype(self): + self.dtype = np.int64 + + +class TestTopkV2Op3Int64(TestTopkV2OP3Int32): + def set_dtype(self): + self.dtype = np.int64 + + +class TestTopkV2Op4Int64(TestTopkV2OP4Int32): + def set_dtype(self): + self.dtype = np.int64 + + +class TestTopkV2Op1Float32(TestTopkV2OP1Int32): + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op2Float32(TestTopkV2OP2Int32): + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op3Float32(TestTopkV2OP3Int32): + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op4Float32(TestTopkV2OP4Int32): + def set_dtype(self): + self.dtype = np.float32 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class 
TestTopkV2Op1Float64(TestTopkV2OP1Int32): + def set_dtype(self): + self.dtype = np.float64 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op2Float64(TestTopkV2OP2Int32): + def set_dtype(self): + self.dtype = np.float64 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op3Float64(TestTopkV2OP3Int32): + def set_dtype(self): + self.dtype = np.float64 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopkV2Op4Float64(TestTopkV2OP4Int32): + def set_dtype(self): + self.dtype = np.float64 + + def set_input_data(self): + self.input_data = np.random.rand(10, 20).astype(self.dtype) + + +class TestTopKAPI(unittest.TestCase): + def setUp(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + np.random.seed(123) + self.input_data = np.random.rand(6, 7, 8) + self.large_input_data = np.random.rand(2, 1030) + + def run_dygraph(self, place): + paddle.disable_static(place) + input_tensor = paddle.to_tensor(self.input_data) + large_input_tensor = paddle.to_tensor(self.large_input_data) + # test case for basic test case 1 + paddle_result = paddle.topk(input_tensor, k=2) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + + # test case for basic test case 2 with axis + paddle_result = paddle.topk(input_tensor, k=2, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 3 with tensor K + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + + # test case for basic test case 4 with tensor largest + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False) + numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + + # test case for basic test case 5 with axis -1 + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False) + numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + + # test case for basic test case 6 for the partial sort + paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1) + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 7 for the unsorted + paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + sort_paddle = numpy_topk( + np.array(paddle_result[0].numpy()), axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + 
self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_tensor = paddle.static.data( + name="x", shape=[6, 7, 8], dtype="float64") + large_input_tensor = paddle.static.data( + name="large_x", shape=[2, 1030], dtype="float64") + k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32") + result1 = paddle.topk(input_tensor, k=2) + result2 = paddle.topk(input_tensor, k=2, axis=-1) + result3 = paddle.topk(input_tensor, k=k_tensor, axis=1) + self.assertEqual(result3[0].shape, (6, -1, 8)) + self.assertEqual(result3[1].shape, (6, -1, 8)) + result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False) + result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False) + result6 = paddle.topk(large_input_tensor, k=1, axis=-1) + result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + exe = paddle.static.Executor(place) + input_data = np.random.rand(10, 20).astype("float64") + large_input_data = np.random.rand(2, 100).astype("float64") + paddle_result = exe.run( + feed={ + "x": self.input_data, + "large_x": self.large_input_data, + "k": np.array([2]).astype("int32") + }, + fetch_list=[ + result1[0], result1[1], result2[0], result2[1], result3[0], + result3[1], result4[0], result4[1], result5[0], result5[1], + result6[0], result6[1], result7[0], result7[1] + ]) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue(np.allclose(paddle_result[0], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1], numpy_result[1])) + + numpy_result = numpy_topk(self.input_data, k=2, axis=-1) + self.assertTrue(np.allclose(paddle_result[2], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[3], numpy_result[1])) + + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[4], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[5], numpy_result[1])) + + numpy_result = numpy_topk( + self.input_data, k=2, axis=1, largest=False) + self.assertTrue(np.allclose(paddle_result[6], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[7], numpy_result[1])) + + numpy_result = numpy_topk( + self.input_data, k=2, axis=-1, largest=False) + self.assertTrue(np.allclose(paddle_result[8], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[9], numpy_result[1])) + + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue(np.allclose(paddle_result[10], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[11], numpy_result[1])) + sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + + def test_cases(self): + places = [core.NPUPlace(0)] + for place in places: + self.run_dygraph(place) + self.run_static(place) + + def test_errors(self): + self.__class__.use_npu = True + self.place = paddle.NPUPlace(0) + paddle.disable_static() + x = paddle.to_tensor([1, 2, 3]) + with self.assertRaises(BaseException): + paddle.topk(x, k=-1) + + with self.assertRaises(BaseException): + paddle.topk(x, k=0) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py index ab1127afa58dd9..cade4b850cd1d6 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py +++ 
b/python/paddle/fluid/tests/unittests/rnn/test_rnn_cells.py @@ -60,9 +60,16 @@ def test_with_zero_state(self): y2, h2 = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + def test_errors(self): + def test_zero_hidden_size(): + cell = paddle.nn.SimpleRNNCell(-1, 0) + + self.assertRaises(ValueError, test_zero_hidden_size) + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() + self.test_errors() class TestGRUCell(unittest.TestCase): @@ -103,9 +110,16 @@ def test_with_zero_state(self): y2, h2 = rnn2(paddle.to_tensor(x)) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + def test_errors(self): + def test_zero_hidden_size(): + cell = paddle.nn.GRUCell(-1, 0) + + self.assertRaises(ValueError, test_zero_hidden_size) + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() + self.test_errors() class TestLSTMCell(unittest.TestCase): @@ -150,9 +164,16 @@ def test_with_zero_state(self): np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) + def test_errors(self): + def test_zero_hidden_size(): + cell = paddle.nn.LSTMCell(-1, 0) + + self.assertRaises(ValueError, test_zero_hidden_size) + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() + self.test_errors() def load_tests(loader, tests, pattern): diff --git a/python/paddle/fluid/tests/unittests/test_backward.py b/python/paddle/fluid/tests/unittests/test_backward.py index 7ca0832b718fd0..e0d6a606e2569c 100644 --- a/python/paddle/fluid/tests/unittests/test_backward.py +++ b/python/paddle/fluid/tests/unittests/test_backward.py @@ -16,6 +16,9 @@ import unittest import paddle.fluid as fluid +import paddle.static as static +import paddle + import numpy as np @@ -327,6 +330,35 @@ def callback(block, context): loss=self.avg_loss, callbacks=callback) +class TestGradientsWithOptimizer(unittest.TestCase): + def _check_grad_op_name(self, forward_list, optimiezed_list): + backward_list = [op + "_grad" for op in reversed(forward_list)] + idx = optimiezed_list.index(backward_list[0], len(backward_list)) + + self.assertListEqual(backward_list, + optimiezed_list[idx:idx + len(backward_list)]) + + def test_gradient_with_optimizer(self): + main = fluid.Program() + startup = fluid.Program() + + with fluid.program_guard(main, startup): + img = static.data(name='image', shape=[None, 784]) + pred = static.nn.fc(x=img, size=10, activation='relu') + loss = paddle.mean(pred) + opt = paddle.optimizer.Momentum(learning_rate=0.01, momentum=0.9) + + forward_list = [o.type for o in main.current_block().ops] + optimize_ops, pram_grads = paddle.autograd.backward_mode.gradients_with_optimizer( + main, opt) + + optimized_list = [o.type for o in main.current_block().ops] + + self.assertGreater(len(optimized_list), len(forward_list)) + self.assertIn(opt.type, optimized_list) + self._check_grad_op_name(forward_list, optimized_list) + + # TODO(Aurelius84): add conditional network test class ConditionalNet(BackwardNet): def __init__(self): @@ -334,4 +366,5 @@ def __init__(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py index bc9ff3697717d1..f87b732d1b2cc0 100644 --- a/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/python/paddle/fluid/tests/unittests/test_fleet_launch_elastic.sh b/python/paddle/fluid/tests/unittests/test_fleet_launch_elastic.sh index 105ed1356ede3a..8b618195f55ea0 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_launch_elastic.sh +++ b/python/paddle/fluid/tests/unittests/test_fleet_launch_elastic.sh @@ -17,6 +17,15 @@ echo "begin test elastic" unset GREP_OPTIONS rm -rf log +pids=`ps -ef | grep "python -m paddle.distributed.launch elastic_demo.[py]" | awk '{print $2}'` +if [ -n "$pids" ]; then + echo $pids | xargs kill -9 +fi +pids=`ps -ef | grep "/usr/bin/python -u elastic_demo.[py]" | awk '{print $2}'` +if [ -n "$pids" ]; then + echo $pids | xargs kill -9 +fi + python -m pip install --no-cache-dir etcd3 -i https://mirror.baidu.com/pypi/simple # common env @@ -115,6 +124,8 @@ do fi done +> $lw0 + # rerun node 1 export NVIDIA_VISIBLE_DEVICES=1 export CUDA_VISIBLE_DEVICES=1 @@ -144,5 +155,54 @@ done check_env +> log_0.log + +for i in {1..10} +do + ## kill with -9 + kill -9 $p0 + sleep 1 + if [ `ps -p $p0 | wc -l` == "2" ]; then + echo "force stop node 0 error" + exit -1 + else + echo "force stop node 0 ok" + break + fi +done + +> $lw0 + +# rerun node 0 +export NVIDIA_VISIBLE_DEVICES=0 +export CUDA_VISIBLE_DEVICES=0 +export DISTRIBUTED_TRAINER_ENDPOINTS=10.10.10.10:8001,10.10.10.3:8001 +export PADDLE_TRAINERS=10.10.10.10,10.10.10.3 +export TRAINER_PORTS_NUM=1 +export POD_IP=10.10.10.10 +export PADDLE_TRAINER_ID=0 +export PADDLE_TRAINERS_NUM=2 + +python -m paddle.distributed.launch elastic_demo.py &> log_0.log & +p0=$! 
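# Poll for up to 10 seconds: node 1's log (log_1.log) should contain the
# "INFO:ELASTIC:ready with hosts" line with node 0's IP (10.10.10.10) once the
# relaunched node 0 has rejoined the elastic job; if the message never shows up,
# report "rerun node 0 error" and exit non-zero.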
+ +for i in {1..10} +do + if grep "INFO:ELASTIC:ready with hosts" log_1.log | grep -q '10.10.10.10'; then + echo "rerun node 0 ok" + break + else + sleep 1 + fi + if [ $i -eq 10 ]; then + echo "rerun node 0 error" + exit -1 + fi +done + +check_env + +echo "All check done" + sleep 3 kill $p0 $p1 diff --git a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py index a9c37d78537eec..3f8d994ad19e44 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_pipeline_meta_optimizer.py @@ -14,6 +14,10 @@ import unittest import paddle +import paddle.fluid as fluid +import paddle.static as static +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker import os paddle.enable_static() @@ -25,26 +29,34 @@ def setUp(self): os.environ[ "PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36001,127.0.0.1:36002" - def test_pipeline_optimizer(self): - import paddle.distributed.fleet as fleet - import paddle.distributed.fleet.base.role_maker as role_maker - role = role_maker.PaddleCloudRoleMaker(is_collective=True) - fleet.init(role) - with paddle.fluid.device_guard("gpu:0"): + def net(self): + with static.device_guard("gpu:0"): input_x = paddle.fluid.layers.data( name="x", shape=[32], dtype='float32') input_y = paddle.fluid.layers.data( name="y", shape=[1], dtype='int64') + input_z = paddle.fluid.layers.data( + name="z", shape=[1], dtype="float32") + with static.device_guard("gpu:all"): + input_z = input_z * 1.0 + input_z.stop_gradient = True fc_1 = paddle.fluid.layers.fc(input=input_x, size=64, act='tanh') + fc_1 = fc_1 * input_z - with paddle.fluid.device_guard("gpu:1"): + with static.device_guard("gpu:1"): fc_2 = paddle.fluid.layers.fc(input=fc_1, size=64, act='tanh') + fc_2 = fc_2 * input_z prediction = paddle.fluid.layers.fc(input=[fc_2], size=2, act='softmax') cost = paddle.fluid.layers.cross_entropy( input=prediction, label=input_y) avg_cost = paddle.fluid.layers.mean(x=cost) + return avg_cost + + def test_pipeline_optimizer(self): + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) strategy = paddle.distributed.fleet.DistributedStrategy() strategy.pipeline = True @@ -53,9 +65,43 @@ def test_pipeline_optimizer(self): 'accumulate_steps': 2 } - optimizer = paddle.fluid.optimizer.Adam(0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) - optimizer.minimize(avg_cost) + train_prog, startup_prog = static.Program(), static.Program() + with static.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + avg_cost = self.net() + + optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = fleet.distributed_optimizer( + optimizer, strategy=strategy) + optimizer.minimize(avg_cost) + + def test_pipeline_amp_optimizer(self): + """ test pipeline& with device:all """ + role = role_maker.PaddleCloudRoleMaker(is_collective=True) + fleet.init(role) + + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.amp = True + strategy.pipeline = True + strategy.pipeline_configs = { + 'micro_batch_size': 1, + 'accumulate_steps': 2 + } + + train_prog, startup_prog = static.Program(), static.Program() + with static.program_guard(train_prog, startup_prog): + with fluid.unique_name.guard(): + avg_cost = self.net() + + optimizer = paddle.fluid.optimizer.Adam(0.01) + optimizer = fleet.distributed_optimizer( + optimizer, 
strategy=strategy) + optimizer.minimize(avg_cost) + + ops = train_prog._pipeline_opt['section_program'].global_block().ops + ops = [op.type for op in ops] + self.assertEqual(ops.count('send_v2'), 1) + self.assertEqual(ops.count('recv_v2'), 1) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py index 1387827736560e..b7cf9dfaec5760 100755 --- a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -264,8 +264,8 @@ def test_sharding_gradient_clip(self): 'elementwise_add_grad', 'mul_grad', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', 'c_reduce_sum', - 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', 'square', - 'reduce_sum', 'square', 'reduce_sum', 'square', 'reduce_sum', 'sum', + 'c_reduce_sum', 'c_reduce_sum', 'c_sync_comm_stream', + 'squared_l2_norm', 'squared_l2_norm', 'squared_l2_norm', 'sum', 'c_allreduce_sum', 'sqrt', 'fill_constant', 'elementwise_max', 'elementwise_div', 'elementwise_mul', 'elementwise_mul', 'elementwise_mul', 'momentum', 'momentum', 'momentum' @@ -366,6 +366,8 @@ def test_sharding_hybrid_dp(self): "gradient_merge_acc_step": 1, "mp_degree": 1 } + + strategy.fuse_all_reduce_ops = False self.optimizer(avg_cost, strategy, train_prog, startup_prog) startup_prog_ops = startup_prog.global_block().ops main_prog_ops = train_prog.global_block().ops diff --git a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py index 17ba6869534fe7..8404c563274b1e 100644 --- a/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py +++ b/python/paddle/fluid/tests/unittests/test_gast_with_compatibility.py @@ -15,7 +15,7 @@ from __future__ import print_function import ast -import gast +from paddle.utils import gast import sys import textwrap import unittest diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py index 14f5d4a41a1fed..9b6dbc00f7c565 100644 --- a/python/paddle/fluid/tests/unittests/test_gradient_clip.py +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -22,6 +22,8 @@ import six from fake_reader import fake_imdb_reader +paddle.enable_static() + def bow_net(data, label, @@ -149,7 +151,7 @@ def clip_gradient(self, params_grads): def check_clip_result(self, out, out_clip): global_norm = 0 for v in out: - global_norm += np.sum(np.power(v, 2)) + global_norm += np.sum(np.square(v)) global_norm = np.sqrt(global_norm) scale = self.clip_norm / np.maximum(self.clip_norm, global_norm) res = [] @@ -160,7 +162,8 @@ def check_clip_result(self, out, out_clip): self.assertTrue( np.allclose( a=u, b=v, rtol=1e-5, atol=1e-8), - "gradient clip by global norm has wrong results!") + "gradient clip by global norm has wrong results!, \nu={}\nv={}\ndiff={}". + format(u, v, u - v)) # test whether the ouput is right when use 'set_gradient_clip' def test_old_gradient_clip(self): @@ -210,12 +213,16 @@ def test_none_grad(self): params_grads = [(x, None), (x, y), (y, x)] params_grads = clip(params_grads) self.assertTrue( - len(clip(params_grads)) == 2, + len(params_grads) == 2, "ClipByGlobalNorm: when grad is None, it shouldn't be returned by gradient clip!" 
) - self.assertTrue( - params_grads[0][1].name != 'y', - "ClipByGlobalNorm: param_grad (x, y) should be clipped!") + + ops = [op.type for op in x.block.ops] + self.assertListEqual(ops, [ + 'squared_l2_norm', 'squared_l2_norm', 'sum', 'sqrt', + 'fill_constant', 'elementwise_max', 'elementwise_div', + 'elementwise_mul', 'elementwise_mul' + ]) # raise typeError def test_tpyeError(self): diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 590c3e061f26ee..965ae65614a40a 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -575,6 +575,20 @@ def test_errors(self): weight, path_code=path_code_int32) + # test paddle.nn.HSigmoidLoss + paddle.disable_static(self.place) + x_arr = np.array([], dtype=np.float32) + x = paddle.to_tensor(np.reshape(x_arr, (100000, 0))) + label = paddle.to_tensor(0, dtype='int64') + self.assertRaises(ValueError, paddle.nn.HSigmoidLoss, x, label) + + # test paddle.nn.functional.hsigmoid_loss + x = paddle.to_tensor(np.reshape(x_arr, (10, 0)), dtype='float32') + label = paddle.to_tensor([], dtype='int64') + weight = paddle.to_tensor([], dtype='float32') + self.assertRaises(ValueError, F.hsigmoid_loss, x, label, 0, weight) + paddle.enable_static() + # test paddle.fluid.layers.hsigmoid with program_guard(Program()): label = fluid.data('label', [4, 1], 'int64') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py new file mode 100644 index 00000000000000..d81849725d75aa --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative_thread_local_has_grad.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
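The new test below exercises the thread-locality of gradient recording: entering paddle.no_grad() in one thread must not switch off gradient tracking for code running concurrently in another thread. A small self-contained sketch of that property (layer sizes and worker names here are illustrative, not taken from the patch):

import threading
import numpy as np
import paddle

def guarded_worker():
    # gradient tracking is disabled only inside this thread's no_grad scope
    with paddle.no_grad():
        pass

def tracking_worker(flags):
    # this thread never enters no_grad, so the layer output should still
    # participate in autograd (stop_gradient == False)
    x = paddle.to_tensor(np.ones([4, 8], dtype='float32'))
    y = paddle.nn.Linear(8, 2)(x)
    flags.append(y.stop_gradient)

flags = []
threads = [threading.Thread(target=guarded_worker),
           threading.Thread(target=tracking_worker, args=(flags,))]
for t in threads:
    t.start()
for t in threads:
    t.join()
assert flags == [False]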
+ +import unittest +import paddle +import time +import paddle.nn as nn +import numpy as np +import threading + + +class SimpleNet(nn.Layer): + def __init__(self, in_dim, out_dim): + super(SimpleNet, self).__init__() + self.fc = nn.Linear(in_dim, out_dim) + + def forward(self, x): + return self.fc(x) + + +class TestCases(unittest.TestCase): + @paddle.no_grad() + def thread_1_main(self): + time.sleep(8) + + def thread_2_main(self): + in_dim = 10 + out_dim = 3 + net = SimpleNet(in_dim, out_dim) + for _ in range(1000): + x = paddle.to_tensor(np.random.rand(32, in_dim).astype('float32')) + self.assertTrue(x.stop_gradient) + x = net(x) + self.assertFalse(x.stop_gradient) + + def test_main(self): + threads = [] + for _ in range(10): + threads.append(threading.Thread(target=self.thread_1_main)) + threads.append(threading.Thread(target=self.thread_2_main)) + for t in threads: + t.start() + for t in threads: + t.join() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py index f48cfbd50eba35..24aa080e68c280 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fix_op_run_order.py @@ -16,7 +16,7 @@ import paddle.fluid as fluid import unittest import numpy as np -from paddle.vision.models import resnet50 +from paddle.vision.models import resnet18 from paddle.nn import CrossEntropyLoss @@ -33,7 +33,7 @@ def get_place(self): ) else paddle.CPUPlace() def get_feed(self): - batch_size = 32 + batch_size = 4 image = np.random.random([batch_size, 3, 224, 224]).astype('float32') label = np.random.randint(0, 1000, [batch_size, 1]).astype('int64') return {"image": image, "label": label} @@ -47,7 +47,7 @@ def create_model(self, fix_op_run_order): name="image", shape=[None, 3, 224, 224], dtype="float32") label = paddle.static.data( name="label", shape=[None, 1], dtype="int64") - model = resnet50() + model = resnet18() pred = model(image) loss_fn = CrossEntropyLoss() loss = loss_fn(pred, label) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 2dd5bcb8113648..047366145584e5 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -748,37 +748,6 @@ def test_errors(self): self.assertRaises(TypeError, fluid.layers.reduce_sum, x2) -class API_TestSumOpError(unittest.TestCase): - def test_errors(self): - def test_dtype1(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float64") - paddle.sum(data, dtype="float32") - - self.assertRaises(ValueError, test_dtype1) - - def test_dtype2(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="int64") - paddle.sum(data, dtype="int32") - - self.assertRaises(ValueError, test_dtype2) - - def test_dtype3(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="float64") - paddle.sum(data, dtype="int32") - - self.assertRaises(ValueError, test_dtype3) - - def test_type(): - with fluid.program_guard(fluid.Program(), fluid.Program()): - data = fluid.data(name="data", shape=[10], dtype="int32") - paddle.sum(data, dtype="bool") - - self.assertRaises(TypeError, test_type) - - class 
API_TestSumOp(unittest.TestCase): def run_static(self, shape, @@ -805,14 +774,26 @@ def test_static(self): shape = [10, 10] axis = 1 + self.run_static(shape, "bool", axis, attr_dtype=None) + self.run_static(shape, "bool", axis, attr_dtype="int32") + self.run_static(shape, "bool", axis, attr_dtype="int64") + self.run_static(shape, "int32", axis, attr_dtype=None) self.run_static(shape, "int32", axis, attr_dtype="int32") self.run_static(shape, "int32", axis, attr_dtype="int64") + self.run_static(shape, "int64", axis, attr_dtype=None) + self.run_static(shape, "int64", axis, attr_dtype="int64") + self.run_static(shape, "int64", axis, attr_dtype="int32") + self.run_static(shape, "float32", axis, attr_dtype=None) self.run_static(shape, "float32", axis, attr_dtype="float32") self.run_static(shape, "float32", axis, attr_dtype="float64") + self.run_static(shape, "float64", axis, attr_dtype=None) + self.run_static(shape, "float64", axis, attr_dtype="float32") + self.run_static(shape, "float64", axis, attr_dtype="float64") + shape = [5, 5, 5] self.run_static(shape, "int32", (0, 1), attr_dtype="int32") self.run_static( diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 64d6910e1f859c..7228c903d6ffa7 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -37,7 +37,7 @@ def elu(x, alpha=1.0, name=None): .. math:: - elu(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) + elu(x) = max(0, x) + min(0, \alpha * (e^{x}-1)) Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -91,13 +91,13 @@ def gelu(x, approximate=False, name=None): .. math:: - gelu(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) + gelu(x) = 0.5 * x * (1 + tanh(\sqrt{\frac{2}{\pi}} * (x + 0.044715x^{3}))) else .. math:: - gelu(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) + gelu(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}})) Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -144,13 +144,13 @@ def hardshrink(x, threshold=0.5, name=None): .. math:: hardshrink(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x > threshold \\\\ - &x, & & if \\ x < -threshold \\\\ - &0, & & if \\ others - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + x,& &if \ {x > threshold} \\ + x,& &if \ {x < -threshold} \\ + 0,& &if \ {others} & + \end{array} + \right. Args: x (Tensor): The input Tensor with data type float32, float64. @@ -192,11 +192,14 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): .. math:: - hardtanh(x)= \\begin{cases} - max, \\text{if } x > max \\\\ - min, \\text{if } x < min \\\\ - x, \\text{otherwise} - \\end{cases} + hardtanh(x)= + \left\{ + \begin{array}{cll} + max,& & \text{if } x > max \\ + min,& & \text{if } x < min \\ + x,& & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -246,13 +249,13 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): .. math:: hardsigmoid(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &1, & & \\text{if } x \\geq 3 \\\\ - &slope * x + offset, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{lcl} + 0, & &\text{if } \ x \leq -3 \\ + 1, & &\text{if } \ x \geq 3 \\ + slope * x + offset, & &\text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -302,13 +305,13 @@ def hardswish(x, name=None): .. 
math:: hardswish(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &x, & & \\text{if } x \\geq 3 \\\\ - &\\frac{x(x+3)}{6}, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{cll} + 0 &, & \text{if } x \leq -3 \\ + x &, & \text{if } x \geq 3 \\ + \frac{x(x+3)}{6} &, & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -345,13 +348,13 @@ def leaky_relu(x, negative_slope=0.01, name=None): leaky_relu activation .. math:: - leaky\\_relu(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x >= 0 \\\\ - &negative\_slope * x, & & otherwise \\\\ - \\end{aligned} - \\right. \\\\ + leaky\_relu(x)= + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + negative\_slope * x, & & otherwise \\ + \end{array} + \right. Args: x (Tensor): The input Tensor with data type float32, float64. @@ -513,7 +516,7 @@ def log_sigmoid(x, name=None): .. math:: - log\\_sigmoid(x) = log \\frac{1}{1 + e^{-x}} + log\_sigmoid(x) = log \frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -554,12 +557,15 @@ def maxout(x, groups, axis=1, name=None): .. math:: - &out_{si+j} = \\max_{k} x_{gsi + sk + j} \\\\ - &g = groups \\\\ - &s = \\frac{input.size}{num\\_channels} \\\\ - &0 \\le i < \\frac{num\\_channels}{groups} \\\\ - &0 \\le j < s \\\\ - &0 \\le k < groups + \begin{array}{l} + &out_{si+j} = \max_{k} x_{gsi + sk + j} \\ + &g = groups \\ + &s = \frac{input.size}{num\_channels} \\ + &0 \le i < \frac{num\_channels}{groups} \\ + &0 \le j < s \\ + &0 \le k < groups + \end{array} + Parameters: x (Tensor): The input is 4-D Tensor with shape [N, C, H, W] or [N, H, W, C], the data type @@ -670,10 +676,12 @@ def selu(x, .. math:: selu(x)= scale * - \\begin{cases} - x, \\text{if } x > 0 \\\\ - alpha * e^{x} - alpha, \\text{if } x <= 0 - \\end{cases} + \left\{ + \begin{array}{lcl} + x,& &\text{if } \ x > 0 \\ + alpha * e^{x} - alpha,& &\text{if } \ x <= 0 + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -719,9 +727,11 @@ def selu(x, def silu(x, name=None): - """ - silu activation. - .. math: + r""" + silu activation + + .. math:: + silu(x) = \frac{x}{1 + e^{-x}} Parameters: @@ -734,11 +744,12 @@ def silu(x, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F - - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] + + import paddle + import paddle.nn.functional as F + + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] """ if in_dygraph_mode(): @@ -778,7 +789,7 @@ def softmax(x, axis=-1, dtype=None, name=None): .. math:: - softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])} + softmax[i, j] = \frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])} Example: @@ -923,8 +934,8 @@ def softplus(x, beta=1, threshold=20, name=None): .. math:: - softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ - \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\ + \text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -968,11 +979,14 @@ def softshrink(x, threshold=0.5, name=None): .. 
math:: - softshrink(x)= \\begin{cases} - x - threshold, \\text{if } x > threshold \\\\ - x + threshold, \\text{if } x < -threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + softshrink(x)= + \left\{ + \begin{array}{rcl} + x - threshold,& & \text{if } x > threshold \\ + x + threshold,& & \text{if } x < -threshold \\ + 0,& & \text{otherwise} + \end{array} + \right. Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1019,7 +1033,7 @@ def softsign(x, name=None): .. math:: - softsign(x) = \\frac{x}{1 + |x|} + softsign(x) = \frac{x}{1 + |x|} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1056,7 +1070,7 @@ def swish(x, name=None): .. math:: - swish(x) = \\frac{x}{1 + e^{-x}} + swish(x) = \frac{x}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1134,10 +1148,14 @@ def thresholded_relu(x, threshold=1.0, name=None): .. math:: - thresholded\\_relu(x) = \\begin{cases} - x, \\text{if } x > threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + thresholded\_relu(x) = + \left\{ + \begin{array}{rl} + x,& \text{if } \ x > threshold \\ + 0,& \text{otherwise} + \end{array} + \right. + Parameters: x (Tensor): The input Tensor with data type float32, float64. @@ -1181,10 +1199,10 @@ def log_softmax(x, axis=-1, dtype=None, name=None): .. math:: - \\begin{aligned} - log\\_softmax[i, j] &= log(softmax(x)) \\\\ - &= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])}) - \\end{aligned} + \begin{aligned} + log\_softmax[i, j] &= log(softmax(x)) \\ + &= log(\frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j])}) + \end{aligned} Parameters: x (Tensor): The input Tensor with data type float32, float64. diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index cb7a50ade7ac8f..ef2bfb3b8e0d3a 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -180,18 +180,18 @@ def binary_cross_entropy_with_logits(logit, First this operator calculate loss function as follows: .. math:: - Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit)) + Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + e^{-Logit}}`. By substituting this we get: + We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get: .. math:: - Out = Logit - Logit * Labels + \\log(1 + e^{-Logit}) + Out = Logit - Logit * Labels + \log(1 + e^{-Logit}) For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: .. math:: - Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + e^{-\|Logit\|}) + Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the weight tensor on the loss `Out`. The ``weight`` tensor will attach different @@ -450,17 +450,17 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): .. math:: - loss(x,y) = \\frac{1}{n}\\sum_{i}z_i + loss(x,y) = \frac{1}{n}\sum_{i}z_i where z_i is given by: .. math:: - \\mathop{z_i} = \\left\\{\\begin{array}{rcl} - 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\ + \mathop{z_i} = \left\{\begin{array}{rcl} + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\ delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} - \\end{array} \\right. + \end{array} \right. Parameters: input (Tensor): Input tensor, the data type is float32 or float64. 
Shape is @@ -631,17 +631,17 @@ def l1_loss(input, label, reduction='mean', name=None): If `reduction` set to ``'none'``, the loss is: .. math:: - Out = \\lvert input - label \\rvert + Out = \lvert input - label \rvert If `reduction` set to ``'mean'``, the loss is: .. math:: - Out = MEAN(\\lvert input - label \\rvert) + Out = MEAN(\lvert input - label \rvert) If `reduction` set to ``'sum'``, the loss is: .. math:: - Out = SUM(\\lvert input - label\\rvert) + Out = SUM(\lvert input - label \rvert) Parameters: @@ -1563,15 +1563,15 @@ def sigmoid_focal_loss(logit, This operator measures focal loss function as follows: .. math:: - Out = -Labels * alpha * {(1 - \\sigma(Logit))}^{gamma}\\log(\\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\\sigma(Logit)}^{gamma}\\log(1 - \\sigma(Logit)) + Out = -Labels * alpha * {(1 - \sigma(Logit))}^{gamma}\log(\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\sigma(Logit)}^{gamma}\log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\exp(-Logit)}`. + We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. Then, if :attr:`normalizer` is not None, this operator divides the normalizer tensor on the loss `Out`: .. math:: - Out = \\frac{Out}{normalizer} + Out = \frac{Out}{normalizer} Finally, this operator applies reduce operation on the loss. If :attr:`reduction` set to ``'none'``, the operator will return the original loss `Out`. diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 286f8ef167b457..db73e56f879a77 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -34,12 +34,12 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): .. math:: - y = \\frac{x}{ \\max\\left( \\lvert \\lvert x \\rvert \\rvert_p, epsilon\\right) } + y = \frac{x}{ \max\left( \lvert \lvert x \rvert \rvert_p, epsilon\right) } .. math:: - \\lvert \\lvert x \\rvert \\rvert_p = \\left( \\sum_i {\\lvert x_i \\rvert^p} \\right)^{1/p} + \lvert \lvert x \rvert \rvert_p = \left( \sum_i {\lvert x_i \rvert^p} \right)^{1/p} - where, :math:`\\sum_i{\\lvert x_i \\rvert^p}` is calculated along the ``axis`` dimension. + where, :math:`\sum_i{\lvert x_i \rvert^p}` is calculated along the ``axis`` dimension. Parameters: @@ -432,7 +432,7 @@ def local_response_norm(x, .. math:: - Output(i, x, y) = Input(i, x, y) / \\left(k + \\alpha \\sum\\limits^{\\min(C-1, i + size/2)}_{j = \\max(0, i - size/2)}(Input(j, x, y))^2\\right)^{\\beta} + Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta} In the above equation: diff --git a/python/paddle/nn/initializer/kaiming.py b/python/paddle/nn/initializer/kaiming.py index f0847c85237b25..88a52268776fcb 100644 --- a/python/paddle/nn/initializer/kaiming.py +++ b/python/paddle/nn/initializer/kaiming.py @@ -33,7 +33,7 @@ class KaimingNormal(MSRAInitializer): .. math:: - \sqrt{\\frac{2.0}{fan\_in}} + \sqrt{\frac{2.0}{fan\_in}} Args: fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\ @@ -75,7 +75,7 @@ class KaimingUniform(MSRAInitializer): .. math:: - x = \sqrt{\\frac{6.0}{fan\_in}} + x = \sqrt{\frac{6.0}{fan\_in}} Args: fan_in (float32|None): fan_in for Kaiming uniform Initializer. 
If None, it is\ diff --git a/python/paddle/nn/initializer/xavier.py b/python/paddle/nn/initializer/xavier.py index f2d5593032f64d..aff3a2c15aeec3 100644 --- a/python/paddle/nn/initializer/xavier.py +++ b/python/paddle/nn/initializer/xavier.py @@ -28,7 +28,7 @@ class XavierNormal(XavierInitializer): .. math:: - \sqrt{\\frac{2.0}{fan\_in + fan\_out}} + \sqrt{\frac{2.0}{fan\_in + fan\_out}} Args: @@ -83,7 +83,7 @@ class XavierUniform(XavierInitializer): .. math:: - x = \sqrt{\\frac{6.0}{fan\_in + fan\_out}} + x = \sqrt{\frac{6.0}{fan\_in + fan\_out}} Args: fan_in (float, optional): fan_in for Xavier initialization, it is diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index 695e387bda84f0..abfeff0641a472 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -31,7 +31,7 @@ class ELU(Layer): .. math:: - ELU(x) = max(0, x) + min(0, \\alpha * (e^{x}-1)) + ELU(x) = max(0, x) + min(0, \alpha * (e^{x}-1)) Parameters: alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0. @@ -75,13 +75,13 @@ class GELU(Layer): .. math:: - GELU(x) = 0.5 * x * (1 + tanh(\\sqrt{\\frac{2}{\\pi}} * (x + 0.044715x^{3}))) + GELU(x) = 0.5 * x * (1 + tanh(\sqrt{\frac{2}{\pi}} * (x + 0.044715x^{3}))) else .. math:: - GELU(x) = 0.5 * x * (1 + erf(\\frac{x}{\\sqrt{2}})) + GELU(x) = 0.5 * x * (1 + erf(\frac{x}{\sqrt{2}})) Parameters: approximate (bool, optional): Wether to enable approximation. Default is False. @@ -127,13 +127,13 @@ class Hardshrink(Layer): .. math:: hardshrink(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x > threshold \\\\ - &x, & & if \\ x < -threshold \\\\ - &0, & & if \\ others - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + x, & & if \ x > threshold \\ + x, & & if \ x < -threshold \\ + 0, & & if \ others + \end{array} + \right. Parameters: threshold (float, optional): The value of threshold for hardthrink. Default is 0.5 @@ -179,13 +179,14 @@ class Hardswish(Layer): .. math:: Hardswish(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &x, & & \\text{if } x \\geq 3 \\\\ - &\\frac{x(x+3)}{6}, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{cll} + 0 &, & \text{if } x \leq -3 \\ + x &, & \text{if } x \geq 3 \\ + \frac{x(x+3)}{6} &, & \text{otherwise} + \end{array} + \right. + Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -223,7 +224,7 @@ class Tanh(Layer): Tanh Activation. .. math:: - Tanh(x) = \\frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} + Tanh(x) = \frac{e^{x} - e^{-x}}{e^{x} + e^{-x}} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -265,11 +266,15 @@ class Hardtanh(Layer): .. math:: - Hardtanh(x)= \\begin{cases} - max, \\text{if } x > max \\\\ - min, \\text{if } x < min \\\\ - x, \\text{otherwise} - \\end{cases} + Hardtanh(x)= + \left\{ + \begin{array}{cll} + max,& & \text{if } x > max \\ + min,& & \text{if } x < min \\ + x,& & \text{otherwise} + \end{array} + \right. + Parameters: min (float, optional): The value of min for Hardtanh. Default is -1. @@ -461,10 +466,12 @@ class SELU(Layer): .. math:: SELU(x)= scale * - \\begin{cases} - x, \\text{if } x > 0 \\\\ - alpha * e^{x} - alpha, \\text{if } x <= 0 - \\end{cases} + \left\{ + \begin{array}{lcl} + x,& &\text{if } \ x > 0 \\ + alpha * e^{x} - alpha,& &\text{if } \ x <= 0 + \end{array} + \right. Parameters: scale (float, optional): The value of scale(must be greater than 1.0) for SELU. 
Default is 1.0507009873554804934193349852946 @@ -512,12 +519,13 @@ class LeakyReLU(Layer): .. math:: LeakyReLU(x)= - \\left\\{ - \\begin{aligned} - &x, & & if \\ x >= 0 \\\\ - &negative\_slope * x, & & otherwise \\\\ - \\end{aligned} - \\right. \\\\ + \left\{ + \begin{array}{rcl} + x, & & if \ x >= 0 \\ + negative\_slope * x, & & otherwise \\ + \end{array} + \right. + Parameters: negative_slope (float, optional): Slope of the activation function at @@ -604,13 +612,14 @@ class Hardsigmoid(Layer): .. math:: Hardsigmoid(x)= - \\left\\{ - \\begin{aligned} - &0, & & \\text{if } x \\leq -3 \\\\ - &1, & & \\text{if } x \\geq 3 \\\\ - &x/6 + 1/2, & & \\text{otherwise} - \\end{aligned} - \\right. + \left\{ + \begin{array}{rcl} + 0, & & \text{if } \ x \leq -3 \\ + 1, & & \text{if } \ x \geq 3 \\ + x/6 + 1/2, & & \text{otherwise} + \end{array} + \right. + Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -650,8 +659,8 @@ class Softplus(Layer): .. math:: - Softplus(x) = \\frac{1}{beta} * \\log(1 + e^{beta * x}) \\\\ - \\text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + Softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\ + \text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} Parameters: beta (float, optional): The value of beta for Softplus. Default is 1 @@ -695,11 +704,15 @@ class Softshrink(Layer): .. math:: - Softshrink(x)= \\begin{cases} - x - threshold, \\text{if } x > threshold \\\\ - x + threshold, \\text{if } x < -threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + Softshrink(x)= + \left\{ + \begin{array}{rcl} + x - threshold,& & \text{if } x > threshold \\ + x + threshold,& & \text{if } x < -threshold \\ + 0,& & \text{otherwise} + \end{array} + \right. + Parameters: threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5 @@ -740,7 +753,7 @@ class Softsign(Layer): .. math:: - Softsign(x) = \\frac{x}{1 + |x|} + Softsign(x) = \frac{x}{1 + |x|} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -779,7 +792,7 @@ class Swish(Layer): .. math:: - Swish(x) = \\frac{x}{1 + e^{-x}} + Swish(x) = \frac{x}{1 + e^{-x}} Parameters: name (str, optional): Name for the operation (optional, default is None). @@ -857,10 +870,14 @@ class ThresholdedReLU(Layer): .. math:: - ThresholdedReLU(x) = \\begin{cases} - x, \\text{if } x > threshold \\\\ - 0, \\text{otherwise} - \\end{cases} + ThresholdedReLU(x) = + \left\{ + \begin{array}{rl} + x,& \text{if } \ x > threshold \\ + 0,& \text{otherwise} + \end{array} + \right. + Parameters: threshold (float, optional): The value of threshold for ThresholdedReLU. Default is 1.0 @@ -939,7 +956,7 @@ class LogSigmoid(Layer): .. math:: - LogSigmoid(x) = log \\frac{1}{1 + e^{-x}} + LogSigmoid(x) = log \frac{1}{1 + e^{-x}} Parameters: x (Tensor): The input Tensor with data type float32, or float64. @@ -1001,7 +1018,7 @@ class Softmax(Layer): .. math:: - Softmax[i, j] = \\frac{\\exp(x[i, j])}{\\sum_j(exp(x[i, j])} + Softmax[i, j] = \frac{\exp(x[i, j])}{\sum_j(exp(x[i, j])} Example: @@ -1105,10 +1122,10 @@ class LogSoftmax(Layer): .. 
math:: - \\begin{aligned} - Out[i, j] &= log(softmax(x)) \\\\ - &= log(\\frac{\\exp(X[i, j])}{\\sum_j(\\exp(X[i, j])}) - \\end{aligned} + \begin{array} {rcl} + Out[i, j] &= &log(softmax(x)) \\ + &= &log(\frac{\exp(X[i, j])}{\sum_j(\exp(X[i, j])}) + \end{array} Parameters: axis (int, optional): The axis along which to perform log_softmax @@ -1167,12 +1184,14 @@ class Maxout(Layer): .. math:: - &out_{si+j} = \max_{k} x_{gsi + sk + j} \\\\ - &g = groups \\\\ - &s = \\frac{input.size}{num\\_channels} \\\\ - &0 \\le i < \\frac{num\\_channels}{groups} \\\\ - &0 \\le j < s \\\\ - &0 \\le k < groups + \begin{array}{l} + &out_{si+j} = \max_{k} x_{gsi + sk + j} \\ + &g = groups \\ + &s = \frac{input.size}{num\_channels} \\ + &0 \le i < \frac{num\_channels}{groups} \\ + &0 \le j < s \\ + &0 \le k < groups + \end{array} Parameters: groups (int, optional): The groups number of maxout. `groups` specifies the diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 31b552bed162c2..3ac0d675fb72c6 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -40,18 +40,18 @@ class BCEWithLogitsLoss(Layer): First this operator calculate loss function as follows: .. math:: - Out = -Labels * \\log(\\sigma(Logit)) - (1 - Labels) * \\log(1 - \\sigma(Logit)) + Out = -Labels * \log(\sigma(Logit)) - (1 - Labels) * \log(1 - \sigma(Logit)) - We know that :math:`\\sigma(Logit) = \\frac{1}{1 + \\e^{-Logit}}`. By substituting this we get: + We know that :math:`\sigma(Logit) = \frac{1}{1 + e^{-Logit}}`. By substituting this we get: .. math:: - Out = Logit - Logit * Labels + \\log(1 + \\e^{-Logit}) + Out = Logit - Logit * Labels + \log(1 + e^{-Logit}) - For stability and to prevent overflow of :math:`\\e^{-Logit}` when Logit < 0, + For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: .. math:: - Out = \\max(Logit, 0) - Logit * Labels + \\log(1 + \\e^{-\|Logit\|}) + Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the weight tensor on the loss `Out`. The ``weight`` tensor will attach different @@ -779,8 +779,6 @@ def forward(self, input, label): class NLLLoss(Layer): r""" - :alias_main: paddle.nn.NLLLoss - :alias: paddle.nn.NLLLoss,paddle.nn.layer.NLLLoss,paddle.nn.layer.loss.NLLLoss This class accepts input and target label and returns negative log likelihood cross error. It is useful to train a classification problem with C classes. @@ -800,20 +798,25 @@ class NLLLoss(Layer): The unreduced (i.e. with :attr:`reduction` set to ``'none'``) loss can be described as: .. math:: - \ell(x, y) = L = \{l_1,\dots,l_N\}^\\top, \quad + + \ell(x, y) = L = \{l_1,\dots,l_N\}^\top, \quad l_n = - w_{y_n} x_{n,y_n}, \quad - w_{c} = \\text{weight}[c] \cdot \mathbb{1}\{c \\not= \\text{ignore\\_index}\}, + w_{c} = \text{weight}[c] \cdot \mathbb{1}\{c \not= \text{ignore\_index}\}, where :math:`N` is the batch size. If :attr:`reduction` is not ``'none'`` (default ``'mean'``), then .. math:: - \ell(x, y) = \\begin{cases} - \\sum_{n=1}^N \\frac{1}{\\sum_{n=1}^N w_{y_n}} l_n, & - \\text{if reduction} = \\text{'mean';}\\\\ - \\sum_{n=1}^N l_n, & - \\text{if reduction} = \\text{'sum'.} - \\end{cases} + + \ell(x, y) = + \left\{ + \begin{array}{lcl} + \sum_{n=1}^N \frac{1}{\sum_{n=1}^N w_{y_n}} l_n, & + \text{if reduction} = \text{'mean';}\\ + \sum_{n=1}^N l_n, & + \text{if reduction} = \text{'sum'.} + \end{array} + \right. 
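For example, when every class weight equals one and no index is ignored, the ``'mean'`` reduction above is just the average of the per-sample losses:

.. math::

    \ell(x, y) = \frac{1}{N}\sum_{n=1}^{N} - x_{n,y_n}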
Parameters: weight (Tensor, optional): Weight tensor, a manual rescaling weight given @@ -1136,16 +1139,16 @@ class SmoothL1Loss(Layer): .. math:: - loss(x,y) = \\frac{1}{n}\\sum_{i}z_i + loss(x,y) = \frac{1}{n}\sum_{i}z_i where z_i is given by: .. math:: - \\mathop{z_i} = \\left\\{\\begin{array}{rcl} - 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\\\ + \mathop{z_i} = \left\{\begin{array}{rcl} + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\ delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} - \\end{array} \\right. + \end{array} \right. Parameters: reduction (str, optional): Indicate how to average the loss by batch_size, diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 9abbc494258948..41599809810ee7 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -115,13 +115,13 @@ class InstanceNorm1D(_InstanceNormBase): .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -187,13 +187,13 @@ class InstanceNorm2D(_InstanceNormBase): .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -257,13 +257,13 @@ class InstanceNorm3D(_InstanceNormBase): .. 
math:: - \\mu_{\\beta} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW} x_i \\qquad &//\\ - \\ mean\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{HW} \\sum_{i=1}^{HW}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\\\ - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \mu_{\beta} &\gets \frac{1}{HW} \sum_{i=1}^{HW} x_i \qquad &//\ + \ mean\ of\ one\ feature\ map\ in\ mini-batch \\ + \sigma_{\beta}^{2} &\gets \frac{1}{HW} \sum_{i=1}^{HW}(x_i - \ + \mu_{\beta})^2 \qquad &//\ variance\ of\ one\ feature\ map\ in\ mini-batch \\ + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift Note: `H` means height of feature map, `W` means width of feature map. @@ -450,15 +450,15 @@ class LayerNorm(Layer): .. math:: - \\mu & = \\frac{1}{H}\\sum_{i=1}^{H} x_i + \mu & = \frac{1}{H}\sum_{i=1}^{H} x_i - \\sigma & = \\sqrt{\\frac{1}{H}\sum_{i=1}^{H}{(x_i - \\mu)^2} + \\epsilon} + \sigma & = \sqrt{\frac{1}{H}\sum_{i=1}^{H}{(x_i - \mu)^2} + \epsilon} - y & = f(\\frac{g}{\\sigma}(x - \\mu) + b) + y & = f(\frac{g}{\sigma}(x - \mu) + b) - :math:`x`: the vector representation of the summed inputs to the neurons in that layer. - :math:`H`: the number of hidden units in a layers - - :math:`\\epsilon`: the small value added to the variance to prevent division by zero. + - :math:`\epsilon`: the small value added to the variance to prevent division by zero. - :math:`g`: the trainable scale parameter. - :math:`b`: the trainable bias parameter. @@ -666,37 +666,36 @@ class BatchNorm1D(_BatchNormBase): r""" Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + When use_global_stats = True, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch. They are global or running statistics (moving_mean and moving_variance). It usually got from the pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. 
- momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -770,37 +769,36 @@ class BatchNorm2D(_BatchNormBase): r""" Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &// + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - When use_global_stats = True, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. + When use_global_stats = True, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are not the statistics of one mini-batch. They are global or running statistics (moving_mean and moving_variance). It usually got from the pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. 
math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -859,16 +857,16 @@ class BatchNorm3D(_BatchNormBase): r""" Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - When use_global_stats = False, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are the statistics of one mini-batch. + When use_global_stats = False, the :math:`\mu_{\beta}` + and :math:`\sigma_{\beta}^{2}` are the statistics of one mini-batch. Calculated as follows: .. math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ When use_global_stats = True, the :math:`\\mu_{\\beta}` and :math:`\\sigma_{\\beta}^{2}` are not the statistics of one mini-batch. @@ -876,20 +874,19 @@ class BatchNorm3D(_BatchNormBase): pre-trained model. Calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The normalization function formula is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\epsilon}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\epsilon` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable proportional parameter - - :math:`\\beta` : trainable deviation parameter + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable proportional parameter + - :math:`\beta` : trainable deviation parameter Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -976,33 +973,33 @@ class SyncBatchNorm(_BatchNormBase): .. 
math:: - \\mu_{\\beta} &\\gets \\frac{1}{m} \\sum_{i=1}^{m} x_i \\qquad &//\\ - \ mini-batch\ mean \\\\ - \\sigma_{\\beta}^{2} &\\gets \\frac{1}{m} \\sum_{i=1}^{m}(x_i - \\ - \\mu_{\\beta})^2 \\qquad &//\ mini-batch\ variance \\\\ + \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ + \ mini-batch\ mean \\ + \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \ + \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\ - :math:`x` : whole mini-batch data in all gpus - :math:`m` : the size of the whole mini-batch data When model in evaluation mode, the :math:`\\mu_{\\beta}` - and :math:`\\sigma_{\\beta}^{2}` are global statistics (moving_mean and moving_variance, + and :math:`\sigma_{\beta}^{2}` are global statistics (moving_mean and moving_variance, which usually got from the pre-trained model). Global statistics calculated as follows: .. math:: - moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global mean \\ - moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global variance \\ + moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global \ mean \\ + moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global \ variance \\ The formula of normalization is as follows: .. math:: - \\hat{x_i} &\\gets \\frac{x_i - \\mu_\\beta} {\\sqrt{\\ - \\sigma_{\\beta}^{2} + \\eps}} \\qquad &//\ normalize \\\\ - y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift + \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\ + \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ + y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - - :math:`\\eps` : add a smaller value to the variance to prevent division by zero - - :math:`\\gamma` : trainable scale parameter vector - - :math:`\\beta` : trainable shift parameter vector + - :math:`\epsilon` : add a smaller value to the variance to prevent division by zero + - :math:`\gamma` : trainable scale parameter vector + - :math:`\beta` : trainable shift parameter vector Note: If you want to use container to pack your model and has ``SyncBatchNorm`` in the diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 77168566d88c60..fbb648af42a337 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -332,6 +332,10 @@ def __init__(self, bias_hh_attr=None, name=None): super(SimpleRNNCell, self).__init__() + if hidden_size <= 0: + raise ValueError( + "hidden_size of {} must be greater than 0, but now equals to {}". + format(self.__class__.__name__, hidden_size)) std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( (hidden_size, input_size), @@ -480,6 +484,10 @@ def __init__(self, bias_hh_attr=None, name=None): super(LSTMCell, self).__init__() + if hidden_size <= 0: + raise ValueError( + "hidden_size of {} must be greater than 0, but now equals to {}". + format(self.__class__.__name__, hidden_size)) std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( (4 * hidden_size, input_size), @@ -627,6 +635,10 @@ def __init__(self, bias_hh_attr=None, name=None): super(GRUCell, self).__init__() + if hidden_size <= 0: + raise ValueError( + "hidden_size of {} must be greater than 0, but now equals to {}". 
+ format(self.__class__.__name__, hidden_size)) std = 1.0 / math.sqrt(hidden_size) self.weight_ih = self.create_parameter( (3 * hidden_size, input_size), diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 01be63c5dfed48..394d46b9161903 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -716,13 +716,15 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): else: reduce_all_flag = False - dtype_flag = False - if dtype is not None: - if dtype in ['float64', 'int64']: - if (convert_dtype(x.dtype) == "float32" and dtype == "float64") or \ - (convert_dtype(x.dtype) == "int32" and dtype == "int64"): - dtype_flag = True - + def get_dtype(x, dtype): + if dtype is not None: + return (True, dtype) + src_type = convert_dtype(x.dtype) + if src_type in ['bool','int32', 'int64']: + return (True, 'int64') + return (False, src_type) + + dtype_flag, dtype = get_dtype(x, dtype) if in_dygraph_mode(): axis = axis if axis != None and axis != [] else [0] if dtype_flag: @@ -740,27 +742,17 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): 'reduce_all': reduce_all_flag } - if dtype is not None: - if dtype in ['float64', 'int64']: - if (convert_dtype(x.dtype) == "float32" and dtype == "float64") or \ - (convert_dtype(x.dtype) == "int32" and dtype == "int64"): - attrs.update({ - 'in_dtype': x.dtype, - 'out_dtype': convert_np_dtype_to_dtype_(dtype) - }) + if dtype_flag: + attrs.update({ + 'in_dtype': x.dtype, + 'out_dtype': convert_np_dtype_to_dtype_(dtype) + }) check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int32', 'int64'], 'sum') - - if dtype is not None: - check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'sum') - x_dtype = convert_dtype(x.dtype) - - if (x_dtype == "float64" and dtype in ["float32", "int32"]) or \ - (x_dtype == "int64" and dtype == "int32"): - raise ValueError("The input(x)'s dtype is {} but the attr(dtype) of sum is {}, " - "which may cause data type overflows. Please reset attr(dtype) of sum." - .format(x_dtype, dtype)) + x, 'x', ['bool', 'float16', 'float32', 'float64', + 'int32', 'int64', 'complex64', 'complex128', + u'bool', u'float16', u'float32', u'float64', + u'int32', u'int64', u'complex64', u'complex128'], 'sum') check_type(axis, 'axis', (int, list, tuple, type(None)), 'sum') diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index abeb83391751be..f90ff0c99af959 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -169,7 +169,7 @@ class TestModel(unittest.TestCase): @classmethod def setUpClass(cls): if not fluid.is_compiled_with_cuda(): - cls.skipTest('module not tested when ONLY_CPU compling') + cls().skipTest('module not tested when ONLY_CPU compling') cls.device = paddle.set_device('gpu') fluid.enable_dygraph(cls.device) diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index c23841ea8b802b..2c7bca71698d44 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +from . 
import gast from .profiler import ProfilerOptions # noqa: F401 from .profiler import Profiler # noqa: F401 from .profiler import get_profiler # noqa: F401 diff --git a/python/paddle/utils/gast/__init__.py b/python/paddle/utils/gast/__init__.py new file mode 100644 index 00000000000000..0bcbf5abb81b26 --- /dev/null +++ b/python/paddle/utils/gast/__init__.py @@ -0,0 +1,33 @@ +# Copyright (c) 2016, Serge Guelton +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# Neither the name of HPCProject, Serge Guelton nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# NOTE(paddle-dev): We introduce third-party library Gast as unified AST +# representation. See https://github.com/serge-sans-paille/gast for details. + +from .gast import * +from ast import NodeVisitor, NodeTransformer, iter_fields, dump diff --git a/python/paddle/utils/gast/ast3.py b/python/paddle/utils/gast/ast3.py new file mode 100644 index 00000000000000..58840d5c29074c --- /dev/null +++ b/python/paddle/utils/gast/ast3.py @@ -0,0 +1,449 @@ +# Copyright (c) 2016, Serge Guelton +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# Neither the name of HPCProject, Serge Guelton nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# NOTE(paddle-dev): We introduce third-party library Gast as unified AST +# representation. See https://github.com/serge-sans-paille/gast for details. + +from .astn import AstToGAst, GAstToAst +from . import gast +import ast +import sys + + +class Ast3ToGAst(AstToGAst): + if sys.version_info.minor < 9: + + def visit_ExtSlice(self, node): + new_node = gast.Tuple(self._visit(node.dims), gast.Load()) + gast.copy_location(new_node, node) + return new_node + + def visit_Index(self, node): + return self._visit(node.value) + + if sys.version_info.minor < 8: + + def visit_Module(self, node): + new_node = gast.Module( + self._visit(node.body), + [] # type_ignores + ) + return new_node + + def visit_Num(self, node): + new_node = gast.Constant( + node.n, + None, ) + gast.copy_location(new_node, node) + return new_node + + def visit_Ellipsis(self, node): + new_node = gast.Constant( + Ellipsis, + None, ) + gast.copy_location(new_node, node) + new_node.end_lineno = new_node.end_col_offset = None + return new_node + + def visit_Str(self, node): + new_node = gast.Constant( + node.s, + None, ) + gast.copy_location(new_node, node) + return new_node + + def visit_Bytes(self, node): + new_node = gast.Constant( + node.s, + None, ) + gast.copy_location(new_node, node) + return new_node + + def visit_FunctionDef(self, node): + new_node = gast.FunctionDef( + self._visit(node.name), + self._visit(node.args), + self._visit(node.body), + self._visit(node.decorator_list), + self._visit(node.returns), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_AsyncFunctionDef(self, node): + new_node = gast.AsyncFunctionDef( + self._visit(node.name), + self._visit(node.args), + self._visit(node.body), + self._visit(node.decorator_list), + self._visit(node.returns), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_For(self, node): + new_node = gast.For( + self._visit(node.target), + self._visit(node.iter), + self._visit(node.body), + self._visit(node.orelse), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_AsyncFor(self, node): + new_node = gast.AsyncFor( + self._visit(node.target), + self._visit(node.iter), + self._visit(node.body), + self._visit(node.orelse), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_With(self, node): + new_node = gast.With( + self._visit(node.items), + self._visit(node.body), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_AsyncWith(self, node): + new_node = gast.AsyncWith( + self._visit(node.items), + self._visit(node.body), + None, # type_comment + ) + gast.copy_location(new_node, node) + return new_node + + def visit_Call(self, node): + if sys.version_info.minor < 5: + if node.starargs: + star = gast.Starred(self._visit(node.starargs), gast.Load()) + gast.copy_location(star, node) + starred = [star] + else: + starred = [] + + 
if node.kwargs: + kw = gast.keyword(None, self._visit(node.kwargs)) + gast.copy_location(kw, node.kwargs) + kwargs = [kw] + else: + kwargs = [] + else: + starred = kwargs = [] + + new_node = gast.Call( + self._visit(node.func), + self._visit(node.args) + starred, + self._visit(node.keywords) + kwargs, ) + gast.copy_location(new_node, node) + return new_node + + def visit_NameConstant(self, node): + if node.value is None: + new_node = gast.Constant(None, None) + elif node.value is True: + new_node = gast.Constant(True, None) + elif node.value is False: + new_node = gast.Constant(False, None) + gast.copy_location(new_node, node) + return new_node + + def visit_arguments(self, node): + new_node = gast.arguments( + self._visit(node.args), + [], # posonlyargs + self._visit(node.vararg), + self._visit(node.kwonlyargs), + self._visit(node.kw_defaults), + self._visit(node.kwarg), + self._visit(node.defaults), ) + gast.copy_location(new_node, node) + return new_node + + def visit_Name(self, node): + new_node = gast.Name( + self._visit(node.id), + self._visit(node.ctx), + None, + None, ) + ast.copy_location(new_node, node) + return new_node + + def visit_arg(self, node): + if sys.version_info.minor < 8: + extra_args = [None] + else: + extra_args = [self._visit(node.type_comment)] + + new_node = gast.Name( + self._visit(node.arg), + gast.Param(), + self._visit(node.annotation), + *extra_args # type_comment + ) + ast.copy_location(new_node, node) + return new_node + + def visit_ExceptHandler(self, node): + if node.name: + new_node = gast.ExceptHandler( + self._visit(node.type), + gast.Name(node.name, gast.Store(), None, None), + self._visit(node.body)) + ast.copy_location(new_node, node) + return new_node + else: + return self.generic_visit(node) + + if sys.version_info.minor < 6: + + def visit_comprehension(self, node): + new_node = gast.comprehension( + target=self._visit(node.target), + iter=self._visit(node.iter), + ifs=self._visit(node.ifs), + is_async=0, ) + return ast.copy_location(new_node, node) + + +class GAstToAst3(GAstToAst): + if sys.version_info.minor < 9: + + def visit_Subscript(self, node): + def adjust_slice(s): + if isinstance(s, ast.Slice): + return s + else: + return ast.Index(s) + + if isinstance(node.slice, gast.Tuple): + if any(isinstance(elt, gast.slice) for elt in node.slice.elts): + new_slice = ast.ExtSlice([ + adjust_slice(x) for x in self._visit(node.slice.elts) + ]) + else: + value = ast.Tuple(self._visit(node.slice.elts), ast.Load()) + ast.copy_location(value, node.slice) + new_slice = ast.Index(value) + else: + new_slice = adjust_slice(self._visit(node.slice)) + ast.copy_location(new_slice, node.slice) + + new_node = ast.Subscript( + self._visit(node.value), + new_slice, + self._visit(node.ctx), ) + ast.copy_location(new_node, node) + return new_node + + if sys.version_info.minor < 8: + + def visit_Module(self, node): + new_node = ast.Module(self._visit(node.body)) + return new_node + + def visit_Constant(self, node): + if node.value is None: + new_node = ast.NameConstant(node.value) + elif node.value is Ellipsis: + new_node = ast.Ellipsis() + elif isinstance(node.value, bool): + new_node = ast.NameConstant(node.value) + elif isinstance(node.value, (int, float, complex)): + new_node = ast.Num(node.value) + elif isinstance(node.value, str): + new_node = ast.Str(node.value) + else: + new_node = ast.Bytes(node.value) + ast.copy_location(new_node, node) + return new_node + + def _make_arg(self, node): + if node is None: + return None + + if sys.version_info.minor < 8: + 
extra_args = tuple() + else: + extra_args = self._visit(node.type_comment), + + new_node = ast.arg( + self._visit(node.id), self._visit(node.annotation), *extra_args) + return ast.copy_location(new_node, node) + + def visit_Name(self, node): + new_node = ast.Name( + self._visit(node.id), + self._visit(node.ctx), ) + ast.copy_location(new_node, node) + return new_node + + def visit_ExceptHandler(self, node): + if node.name: + new_node = ast.ExceptHandler( + self._visit(node.type), node.name.id, self._visit(node.body)) + return ast.copy_location(new_node, node) + else: + return self.generic_visit(node) + + if sys.version_info.minor < 5: + + def visit_Call(self, node): + if node.args and isinstance(node.args[-1], gast.Starred): + args = node.args[:-1] + starargs = node.args[-1].value + else: + args = node.args + starargs = None + + if node.keywords and node.keywords[-1].arg is None: + keywords = node.keywords[:-1] + kwargs = node.keywords[-1].value + else: + keywords = node.keywords + kwargs = None + + new_node = ast.Call( + self._visit(node.func), + self._visit(args), + self._visit(keywords), + self._visit(starargs), + self._visit(kwargs), ) + ast.copy_location(new_node, node) + return new_node + + def visit_ClassDef(self, node): + self.generic_visit(node) + new_node = ast.ClassDef( + name=self._visit(node.name), + bases=self._visit(node.bases), + keywords=self._visit(node.keywords), + body=self._visit(node.body), + decorator_list=self._visit(node.decorator_list), + starargs=None, + kwargs=None, ) + return ast.copy_location(new_node, node) + + elif sys.version_info.minor < 8: + + def visit_FunctionDef(self, node): + new_node = ast.FunctionDef( + self._visit(node.name), + self._visit(node.args), + self._visit(node.body), + self._visit(node.decorator_list), + self._visit(node.returns), ) + ast.copy_location(new_node, node) + return new_node + + def visit_AsyncFunctionDef(self, node): + new_node = ast.AsyncFunctionDef( + self._visit(node.name), + self._visit(node.args), + self._visit(node.body), + self._visit(node.decorator_list), + self._visit(node.returns), ) + ast.copy_location(new_node, node) + return new_node + + def visit_For(self, node): + new_node = ast.For( + self._visit(node.target), + self._visit(node.iter), + self._visit(node.body), + self._visit(node.orelse), ) + ast.copy_location(new_node, node) + return new_node + + def visit_AsyncFor(self, node): + new_node = ast.AsyncFor( + self._visit(node.target), + self._visit(node.iter), + self._visit(node.body), + self._visit(node.orelse), + None, # type_comment + ) + ast.copy_location(new_node, node) + return new_node + + def visit_With(self, node): + new_node = ast.With( + self._visit(node.items), + self._visit(node.body), ) + ast.copy_location(new_node, node) + return new_node + + def visit_AsyncWith(self, node): + new_node = ast.AsyncWith( + self._visit(node.items), + self._visit(node.body), ) + ast.copy_location(new_node, node) + return new_node + + def visit_Call(self, node): + new_node = ast.Call( + self._visit(node.func), + self._visit(node.args), + self._visit(node.keywords), ) + ast.copy_location(new_node, node) + return new_node + + def visit_arguments(self, node): + extra_args = [ + self._make_arg(node.vararg), + [self._make_arg(n) for n in node.kwonlyargs], + self._visit(node.kw_defaults), + self._make_arg(node.kwarg), + self._visit(node.defaults), + ] + if sys.version_info.minor >= 8: + new_node = ast.arguments( + [self._make_arg(arg) for arg in node.posonlyargs], + [self._make_arg(n) for n in node.args], *extra_args) + else: 
+ new_node = ast.arguments([self._make_arg(n) for n in node.args], + *extra_args) + return new_node + + +def ast_to_gast(node): + return Ast3ToGAst().visit(node) + + +def gast_to_ast(node): + return GAstToAst3().visit(node) diff --git a/python/paddle/utils/gast/astn.py b/python/paddle/utils/gast/astn.py new file mode 100644 index 00000000000000..bd88ba5efc512a --- /dev/null +++ b/python/paddle/utils/gast/astn.py @@ -0,0 +1,64 @@ +# Copyright (c) 2016, Serge Guelton +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. + +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# Neither the name of HPCProject, Serge Guelton nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# NOTE(paddle-dev): We introduce third-party library Gast as unified AST +# representation. See https://github.com/serge-sans-paille/gast for details. + +import ast +from . import gast + + +def _generate_translators(to): + class Translator(ast.NodeTransformer): + def _visit(self, node): + if isinstance(node, list): + return [self._visit(n) for n in node] + elif isinstance(node, ast.AST): + return self.visit(node) + else: + return node + + def generic_visit(self, node): + cls = type(node).__name__ + # handle nodes that are not part of the AST + if not hasattr(to, cls): + return + new_node = getattr(to, cls)() + for field in node._fields: + setattr(new_node, field, self._visit(getattr(node, field))) + for attr in getattr(node, '_attributes'): + if hasattr(node, attr): + setattr(new_node, attr, getattr(node, attr)) + return new_node + + return Translator + + +AstToGAst = _generate_translators(gast) + +GAstToAst = _generate_translators(ast) diff --git a/python/paddle/utils/gast/gast.py b/python/paddle/utils/gast/gast.py new file mode 100644 index 00000000000000..f561c83995ac1d --- /dev/null +++ b/python/paddle/utils/gast/gast.py @@ -0,0 +1,609 @@ +# Copyright (c) 2016, Serge Guelton +# All rights reserved. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: + +# Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. 
+ +# Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. + +# Neither the name of HPCProject, Serge Guelton nor the names of its +# contributors may be used to endorse or promote products derived from this +# software without specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +# NOTE(paddle-dev): We introduce third-party library Gast as unified AST +# representation. See https://github.com/serge-sans-paille/gast for details. + +import sys as _sys +import ast as _ast +from ast import boolop, cmpop, excepthandler, expr, expr_context, operator +from ast import slice, stmt, unaryop, mod, AST +from ast import iter_child_nodes, walk + +try: + from ast import TypeIgnore +except ImportError: + + class TypeIgnore(AST): + pass + + +def _make_node(Name, Fields, Attributes, Bases): + def create_node(self, *args, **kwargs): + nbparam = len(args) + len(kwargs) + assert nbparam in (0, len(Fields)), \ + "Bad argument number for {}: {}, expecting {}".\ + format(Name, nbparam, len(Fields)) + self._fields = Fields + self._attributes = Attributes + for argname, argval in zip(self._fields, args): + setattr(self, argname, argval) + for argname, argval in kwargs.items(): + assert argname in Fields, \ + "Invalid Keyword argument for {}: {}".format(Name, argname) + setattr(self, argname, argval) + + setattr(_sys.modules[__name__], Name, + type(Name, Bases, {'__init__': create_node})) + + +_nodes = ( + # mod + ('Module', (('body', 'type_ignores'), (), (mod, ))), + ('Interactive', (('body', ), (), (mod, ))), + ('Expression', (('body', ), (), (mod, ))), + ('FunctionType', (('argtypes', 'returns'), (), (mod, ))), + ('Suite', (('body', ), (), (mod, ))), + + # stmt + ('FunctionDef', (('name', 'args', 'body', 'decorator_list', 'returns', + 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AsyncFunctionDef', (('name', 'args', 'body', 'decorator_list', 'returns', + 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('ClassDef', (( + 'name', + 'bases', + 'keywords', + 'body', + 'decorator_list', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Return', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Delete', (('targets', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Assign', (( + 'targets', + 'value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AugAssign', (( + 'target', + 'op', + 'value', ), ( + 'lineno', + 'col_offset', 
+ 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AnnAssign', (( + 'target', + 'annotation', + 'value', + 'simple', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Print', (( + 'dest', + 'values', + 'nl', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('For', (('target', 'iter', 'body', 'orelse', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AsyncFor', (('target', 'iter', 'body', 'orelse', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('While', (( + 'test', + 'body', + 'orelse', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('If', (( + 'test', + 'body', + 'orelse', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('With', (('items', 'body', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('AsyncWith', (('items', 'body', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Raise', (( + 'exc', + 'cause', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Try', (( + 'body', + 'handlers', + 'orelse', + 'finalbody', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Assert', (( + 'test', + 'msg', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Import', (('names', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('ImportFrom', (( + 'module', + 'names', + 'level', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Exec', (( + 'body', + 'globals', + 'locals', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Global', (('names', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Nonlocal', (('names', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Expr', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Pass', ((), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Break', ((), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + ('Continue', ((), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (stmt, ))), + + # expr + ('BoolOp', (( + 'op', + 'values', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('BinOp', (( + 'left', + 'op', + 'right', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('UnaryOp', (( + 'op', + 'operand', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Lambda', (( + 'args', + 'body', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('IfExp', (( + 'test', + 'body', + 'orelse', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Dict', (( + 'keys', + 'values', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Set', (('elts', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('ListComp', (( + 'elt', + 'generators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('SetComp', (( + 'elt', + 
'generators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('DictComp', (( + 'key', + 'value', + 'generators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('GeneratorExp', (( + 'elt', + 'generators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Await', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Yield', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('YieldFrom', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Compare', (( + 'left', + 'ops', + 'comparators', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Call', (( + 'func', + 'args', + 'keywords', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Repr', (('value', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('FormattedValue', (( + 'value', + 'conversion', + 'format_spec', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('JoinedStr', (('values', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Constant', (('value', 'kind'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Attribute', (( + 'value', + 'attr', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Subscript', (( + 'value', + 'slice', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Starred', (( + 'value', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Name', (('id', 'ctx', 'annotation', 'type_comment'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('List', (( + 'elts', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + ('Tuple', (( + 'elts', + 'ctx', ), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (expr, ))), + + # expr_context + ('Load', ((), (), (expr_context, ))), + ('Store', ((), (), (expr_context, ))), + ('Del', ((), (), (expr_context, ))), + ('AugLoad', ((), (), (expr_context, ))), + ('AugStore', ((), (), (expr_context, ))), + ('Param', ((), (), (expr_context, ))), + + # slice + ('Slice', (('lower', 'upper', 'step'), ( + 'lineno', + 'col_offset', + 'end_lineno', + 'end_col_offset', ), (slice, ))), + + # boolop + ('And', ((), (), (boolop, ))), + ('Or', ((), (), (boolop, ))), + + # operator + ('Add', ((), (), (operator, ))), + ('Sub', ((), (), (operator, ))), + ('Mult', ((), (), (operator, ))), + ('MatMult', ((), (), (operator, ))), + ('Div', ((), (), (operator, ))), + ('Mod', ((), (), (operator, ))), + ('Pow', ((), (), (operator, ))), + ('LShift', ((), (), (operator, ))), + ('RShift', ((), (), (operator, ))), + ('BitOr', ((), (), (operator, ))), + ('BitXor', ((), (), (operator, ))), + ('BitAnd', ((), (), (operator, ))), + ('FloorDiv', ((), (), (operator, ))), + + # unaryop + ('Invert', ((), (), ( + unaryop, + AST, ))), + ('Not', ((), (), ( + unaryop, + AST, ))), + ('UAdd', ((), (), ( + unaryop, + AST, ))), + ('USub', ((), (), ( + unaryop, + AST, ))), + + # cmpop + ('Eq', ((), (), (cmpop, ))), + ('NotEq', ((), (), (cmpop, ))), + ('Lt', ((), (), (cmpop, ))), + ('LtE', ((), (), (cmpop, ))), + ('Gt', ((), (), (cmpop, ))), + 
('GtE', ((), (), (cmpop, ))), + ('Is', ((), (), (cmpop, ))), + ('IsNot', ((), (), (cmpop, ))), + ('In', ((), (), (cmpop, ))), + ('NotIn', ((), (), (cmpop, ))), + + # comprehension + ('comprehension', (('target', 'iter', 'ifs', 'is_async'), (), (AST, ))), + + # excepthandler + ('ExceptHandler', (('type', 'name', 'body'), + ('lineno', 'col_offset', 'end_lineno', + 'end_col_offset'), (excepthandler, ))), + + # arguments + ('arguments', (('args', 'posonlyargs', 'vararg', 'kwonlyargs', + 'kw_defaults', 'kwarg', 'defaults'), (), (AST, ))), + + # keyword + ('keyword', + (('arg', 'value'), + ('lineno', 'col_offset', 'end_lineno', 'end_col_offset'), (AST, ))), + + # alias + ('alias', (('name', 'asname'), (), (AST, ))), + + # withitem + ('withitem', (('context_expr', 'optional_vars'), (), (AST, ))), + + # type_ignore + ('type_ignore', ((), ('lineno', 'tag'), (TypeIgnore, ))), ) + +for name, descr in _nodes: + _make_node(name, *descr) + +py_version = _sys.version_info.major +if py_version != 3: + raise RuntimeError( + 'Required Python version >= 3, but received Python version == {}'. + format(py_version)) + +from .ast3 import ast_to_gast, gast_to_ast + + +def parse(*args, **kwargs): + return ast_to_gast(_ast.parse(*args, **kwargs)) + + +def literal_eval(node_or_string): + if isinstance(node_or_string, AST): + node_or_string = gast_to_ast(node_or_string) + return _ast.literal_eval(node_or_string) + + +def get_docstring(node, clean=True): + if not isinstance(node, (FunctionDef, ClassDef, Module)): + raise TypeError("%r can't have docstrings" % node.__class__.__name__) + if node.body and isinstance(node.body[0], Expr) and \ + isinstance(node.body[0].value, Constant): + if clean: + import inspect + holder = node.body[0].value + return inspect.cleandoc(getattr(holder, holder._fields[0])) + return node.body[0].value.s + + +# the following are directly imported from python3.8's Lib/ast.py # + + +def copy_location(new_node, old_node): + """ + Copy source location (`lineno`, `col_offset`, `end_lineno`, and + `end_col_offset` attributes) from *old_node* to *new_node* if possible, + and return *new_node*. + """ + for attr in 'lineno', 'col_offset', 'end_lineno', 'end_col_offset': + if attr in old_node._attributes and attr in new_node._attributes \ + and hasattr(old_node, attr): + setattr(new_node, attr, getattr(old_node, attr)) + return new_node + + +def fix_missing_locations(node): + """ + When you compile a node tree with compile(), the compiler expects lineno + and col_offset attributes for every node that supports them. This is + rather tedious to fill in for generated nodes, so this helper adds these + attributes recursively where not already set, by setting them to the values + of the parent node. It works recursively starting at *node*. 
+ """ + + def _fix(node, lineno, col_offset, end_lineno, end_col_offset): + if 'lineno' in node._attributes: + if not hasattr(node, 'lineno'): + node.lineno = lineno + else: + lineno = node.lineno + if 'end_lineno' in node._attributes: + if not hasattr(node, 'end_lineno'): + node.end_lineno = end_lineno + else: + end_lineno = node.end_lineno + if 'col_offset' in node._attributes: + if not hasattr(node, 'col_offset'): + node.col_offset = col_offset + else: + col_offset = node.col_offset + if 'end_col_offset' in node._attributes: + if not hasattr(node, 'end_col_offset'): + node.end_col_offset = end_col_offset + else: + end_col_offset = node.end_col_offset + for child in iter_child_nodes(node): + _fix(child, lineno, col_offset, end_lineno, end_col_offset) + + _fix(node, 1, 0, 1, 0) + return node + + +def increment_lineno(node, n=1): + """ + Increment the line number and end line number of each node in the tree + starting at *node* by *n*. This is useful to "move code" to a different + location in a file. + """ + for child in walk(node): + if 'lineno' in child._attributes: + child.lineno = (getattr(child, 'lineno', 0) or 0) + n + if 'end_lineno' in child._attributes: + child.end_lineno = (getattr(child, 'end_lineno', 0) or 0) + n + return node diff --git a/python/requirements.txt b/python/requirements.txt index e9da2aa24d6cb2..4232700761581c 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -2,8 +2,6 @@ requests>=2.20.0 numpy>=1.13 ; python_version>="3.5" and platform_system != "Windows" numpy>=1.13, <=1.19.3 ; python_version>="3.5" and platform_system == "Windows" protobuf>=3.1.0 -gast>=0.3.3, <=0.4.0 ; platform_system != "Windows" -gast==0.3.3 ; platform_system == "Windows" Pillow six decorator diff --git a/python/setup.py.in b/python/setup.py.in index 0db6c0c27d743d..d530f8483bcde7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -139,6 +139,7 @@ write_distributed_training_mode_py(filename='@PADDLE_BINARY_DIR@/python/paddle/f packages=['paddle', 'paddle.libs', 'paddle.utils', + 'paddle.utils.gast', 'paddle.utils.cpp_extension', 'paddle.dataset', 'paddle.reader', @@ -149,6 +150,7 @@ packages=['paddle', 'paddle.incubate.operators', 'paddle.distributed.fleet', 'paddle.distributed.fleet.base', + 'paddle.distributed.fleet.elastic', 'paddle.distributed.fleet.meta_optimizers', 'paddle.distributed.fleet.meta_optimizers.sharding', 'paddle.distributed.fleet.meta_optimizers.ascend', @@ -393,11 +395,11 @@ def find_files(pattern, root, recursive=False): headers = ( list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle')) + list(find_files('*.h', '@PADDLE_SOURCE_DIR@/paddle/fluid/extension/include')) + # extension - list(find_files('*', '${BOOST_INCLUDE_DIR}/boost', True)) + # boost # For paddle uew custom op, only copy data type headers from `paddle/fluid/platform` # to `extension/incude`, ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/complex.h'] + - ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h']) + ['@PADDLE_SOURCE_DIR@/paddle/fluid/platform/float16.h'] + + ['@PADDLE_SOURCE_DIR@/paddle/utils/any.h']) if '${WITH_MKLDNN}' == 'ON': headers += list(find_files('*', '${MKLDNN_INSTALL_DIR}/include')) # mkldnn @@ -444,12 +446,12 @@ class InstallHeaders(Command): elif 'third_party' not in header: # paddle headers install_dir = re.sub('@PADDLE_SOURCE_DIR@/', '', header) - if 'fluid' in install_dir: + if 'fluid' in install_dir or 'utils' in install_dir: install_dir = "paddle/extension/include/" else: # third_party install_dir = re.sub('${THIRD_PARTY_PATH}', 'third_party', 
header) - patterns = ['boost/src/extern_boost', 'install/mkldnn/include'] + patterns = ['install/mkldnn/include'] for pattern in patterns: install_dir = re.sub(pattern, '', install_dir) install_dir = os.path.join(self.install_dir, os.path.dirname(install_dir))
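
The hunks above change user-visible behavior in three places: `paddle.sum` now promotes bool/int32/int64 inputs to int64 when no dtype is given, the RNN cells reject a non-positive `hidden_size`, and a vendored `gast` package is exposed as `paddle.utils.gast` (replacing the external `gast` requirement removed from requirements.txt). A minimal smoke-test sketch of that behavior, assuming a Paddle build that already contains this patch; the module path, the promotion rule, and the ValueError message are read off the diff itself, not from released documentation:

    import paddle
    from paddle.utils import gast   # vendored by this patch; not the external pip package


    def check_sum_promotion():
        # bool/int32/int64 inputs are summed in int64 when dtype is not specified.
        x = paddle.ones([2, 3], dtype='int32')
        print(paddle.sum(x).dtype)  # expected: paddle.int64 with this patch applied


    def check_rnn_validation():
        # SimpleRNNCell/LSTMCell/GRUCell now raise a ValueError instead of
        # computing 1/sqrt(hidden_size) on a non-positive hidden_size.
        try:
            paddle.nn.SimpleRNNCell(input_size=16, hidden_size=0)
        except ValueError as err:
            print('rejected:', err)


    def check_vendored_gast():
        # paddle.utils.gast mirrors the gast API on top of the standard ast module.
        tree = gast.parse("def f(x):\n    return x + 1\n")
        print(type(tree).__name__)                # Module
        print(sum(1 for _ in gast.walk(tree)))    # node count of the unified AST


    if __name__ == '__main__':
        check_sum_promotion()
        check_rnn_validation()
        check_vendored_gast()

Because the vendored node classes subclass the standard ast categories (mod, stmt, expr, ...), existing NodeVisitor/NodeTransformer passes should keep working; only the import path moves from the external gast wheel to paddle.utils.gast.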